In [2]:
#Installing required libraries

!pip install --quiet openai jsonschema
print("Installed: openai, jsonschema")


Installed: openai, jsonschema


In [6]:
%env GROQ_API_KEY=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

env: GROQ_API_KEY=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


In [8]:
#Imports and client setup

import os
import json
import re
from typing import List, Dict, Any


try:
    from openai import OpenAI
except Exception as e:
    raise RuntimeError("openai SDK import failed; ensure pip install succeeded.") from e

GROQ_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_KEY:
    print("WARNING: GROQ_API_KEY not set. The notebook will run using local fallbacks (no network calls).")
# Created client object even if key is None — API calls will fail gracefully if key missing.
client = OpenAI(api_key=GROQ_KEY, base_url="https://api.groq.com/openai/v1")


SUMMARIZATION_MODEL = "gpt-4o-mini"         # placeholder
FUNCTION_CALL_MODEL = "gpt-4o-mini"        # placeholder for function-calling model


In [9]:
# Task 1:  ConversationManager with truncation + periodic summarization

class ConversationManager:
    """
    ConversationManager stores a list of messages: [{'role': 'user'|'assistant'|'system', 'content': ...}, ...].
    It supports:
      - append_message(role, content)
      - truncate_by_turns(n)
      - truncate_by_ject even if key is None — API calls will fail gracefully if key missing.
client = OpenAI(api_key=GROQ_KEY, base_url="https://api.groq.com/openai/v1")


SUMMARIZATION_MODEL = "gpt-4o-mini"         # placeholder
FUNCTION_CALL_MODEL = "gpt-4o-mini"        # placeholder for function-calling model
chars(max_chars)
      - periodic summarization: after every k appended messages, call the summarizer and replace history with summary + final user message
    """

    def __init__(self, summarize_every_k: int = 3, summarization_model: str = SUMMARIZATION_MODEL, summary_max_tokens: int = 150):
        self.history: List[Dict[str, str]] = []
        self.append_count = 0
        self.summarize_every_k = summarize_every_k
        self.summarization_model = summarization_model
        self.summary_max_tokens = summary_max_tokens

    def append_message(self, role: str, content: str, call_api_client: Any = client):
        """Append a message to history and trigger periodic summarization if configured."""
        self.history.append({"role": role, "content": content})
        self.append_count += 1

        # If k-th run reached, summarize (use the client passed or global client)
        if self.summarize_every_k > 0 and (self.append_count % self.summarize_every_k) == 0:
            self.summarize_history(call_api_client)

    def truncate_by_turns(self, last_n: int):
        """Keep only the last `last_n` messages (turns)."""
        if last_n <= 0:
            self.history = []
            return
        self.history = self.history[-last_n:]

    def truncate_by_chars(self, max_chars: int):
        """Keep messages from the end until the combined length is <= max_chars."""
        if max_chars <= 0:
            self.history = []
            return
        kept = []
        total = 0
        for msg in reversed(self.history):
            if total + len(msg["content"]) <= max_chars:
                kept.insert(0, msg)  # add to front to preserve order
                total += len(msg["content"])
            else:
                break
        self.history = kept

    def _build_summarization_prompt(self) -> str:
        """Create a single text prompt from the conversation for summarization."""
        parts = []
        for m in self.history:
            parts.append(f"{m['role'].upper()}: {m['content']}")
        return "Summarize the conversation below concisely, preserving intents, decisions and requested items:\n\n" + "\n\n".join(parts)

    def summarize_history(self, api_client):
        """
        Ask the model to summarize the full history. If API key is missing or call fails, use a local fallback summary.
        After summarization, replace history with a single system summary message and keep the last user message if any.
        """
        prompt = self._build_summarization_prompt()

        # If no API key available, do a local fallback summary (short concatenation)
        if not GROQ_KEY:
            joined = " ".join(m["content"] for m in self.history)
            # simple fallback: take first 300 chars (we can improve with an extractive algorithm)
            short = joined[:300] + ("..." if len(joined) > 300 else "")
            summary_text = f"(LOCAL-FALLBACK) {short}"
            print("⚠️ No API key: using local fallback summarization.")
        else:
            try:
                # Call Groq/OpenAI-compatible chat completion for summarization
                resp = api_client.chat.completions.create(
                    model=self.summarization_model,
                    messages=[
                        {"role": "system", "content": "You are a concise summarizer."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=self.summary_max_tokens,
                )
                # Robust extraction of text from different response shapes:
                try:
                    summary_text = resp.choices[0].message.content
                except Exception:
                    try:
                        summary_text = resp.choices[0].text
                    except Exception:
                        summary_text = str(resp)
                print("🔹 Summarization API called successfully.")
            except Exception as e:
                # If API call fails, fallback to local short summary
                joined = " ".join(m["content"] for m in self.history)
                short = joined[:300] + ("..." if len(joined) > 300 else "")
                summary_text = f"(API-ERROR-FALLBACK) {short}"
                print("⚠️ Summarization API call failed; using fallback. Error:", str(e))

        # Replace history with summary (system role) and keep last user message if present
        summary_obj = {"role": "system", "content": f"[SUMMARY] {summary_text}"}
        last_user = None
        for m in reversed(self.history):
            if m["role"] == "user":
                last_user = m
                break
        if last_user:
            self.history = [summary_obj, last_user]
        else:
            self.history = [summary_obj]

    def show_history(self):
        """Return a readable copy of the history for printing."""
        return [{"role": m["role"], "content": m["content"]} for m in self.history]

# End of ConversationManager


In [10]:
# Demo Task 1: feed samples and print outputs after each step


cm = ConversationManager(summarize_every_k=3)  # summary after every 3 appended messages

# Example conversation flow (we can add more)
samples = [
    ("user", "Hi, I want to cancel my TV subscription and request refund."),
    ("assistant", "I can help. Could you provide your account email?"),
    ("user", "Yes, alice@example.com. I paid last week."),
    ("assistant", "Thanks — do you want a full refund or pro-rated?"),
    ("user", "Full refund, please. My account number is 12345."),
    ("assistant", "Understood. I'll start the cancellation; do you want to keep any saved settings?"),
    ("user", "No, please delete everything."),
]

# Append messages and show state after each append (demonstrates k-th summarization)
for i, (role, text) in enumerate(samples, 1):
    print(f"\n--- Append #{i}: {role} -> {text}")
    cm.append_message(role, text, call_api_client=client)
    print("Current history (short):")
    for msg in cm.show_history():
        snippet = msg["content"] if len(msg["content"]) <= 140 else msg["content"][:137] + "..."
        print(f"  [{msg['role']}] {snippet}")

# Show truncation examples
print("\n=== Truncation examples ===")
print("Original history length:", len(cm.history))
cm.truncate_by_turns(2)
print("After truncate_by_turns(2):", cm.show_history())

# Reset and re-run to show truncate_by_chars example
cm = ConversationManager(summarize_every_k=100)  # disable auto summary for demonstration
for role, text in samples:
    cm.append_message(role, text)
print("\nFull history character length:", sum(len(m['content']) for m in cm.history))
cm.truncate_by_chars(100)
print("After truncate_by_chars(100):", cm.show_history())



--- Append #1: user -> Hi, I want to cancel my TV subscription and request refund.
Current history (short):
  [user] Hi, I want to cancel my TV subscription and request refund.

--- Append #2: assistant -> I can help. Could you provide your account email?
Current history (short):
  [user] Hi, I want to cancel my TV subscription and request refund.
  [assistant] I can help. Could you provide your account email?

--- Append #3: user -> Yes, alice@example.com. I paid last week.
⚠️ Summarization API call failed; using fallback. Error: Error code: 404 - {'error': {'message': 'The model `gpt-4o-mini` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'code': 'model_not_found'}}
Current history (short):
  [system] [SUMMARY] (API-ERROR-FALLBACK) Hi, I want to cancel my TV subscription and request refund. I can help. Could you provide your account ema...
  [user] Yes, alice@example.com. I paid last week.

--- Append #4: assistant -> Thanks — do you want a full r

In [19]:
# ==========================
# Task 2: JSON Schema Extraction & Validation
# ==========================

import re
import json
from typing import Dict, Any
from jsonschema import validate, ValidationError

# --------------------------
# JSON Schema Definition
# --------------------------
extract_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},     # Optional: add 'format': 'email'
        "phone": {"type": "string"},
        "location": {"type": "string"},
        "age": {"type": ["integer", "null"], "minimum": 0}  # allow null if unknown
    },
    "required": ["name", "email", "phone", "location", "age"]
}

# --------------------------
# Fallback Regex Extractor
# --------------------------
def fallback_extract(chat_text: str) -> Dict[str, Any]:
    """
    Extract information from chat using regex heuristics.
    Returns a dict matching the JSON schema.
    """
    # Name extraction (multi or single word)
    name_match = re.search(
        r"(?:I'm|I am|I’m|This is|Hey, it's|Hello, this is)\s+([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)",
        chat_text
    )

    # Email extraction
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', chat_text)

    # Phone number extraction (+91 optional)
    phone_match = re.search(r'(\+91[-\s]?\d{10}|\b\d{10}\b)', chat_text)

    # Location extraction
    loc_match = re.search(r'(?:live in|from|at|Location[:\-]?)\s*([A-Z][\w\s\-]{2,30})', chat_text, re.IGNORECASE)

    # Age extraction (1–2 digit numbers near keywords)
    age_match = re.search(
        r"(?:Age[: ]*|I'?m\s+|I am\s+)?\b(\d{1,2})\b(?=\s?(?:years|yrs|yo|$|\.))",
        chat_text
    )

    extracted = {
        "name": name_match.group(1).strip() if name_match else "Unknown",
        "email": email_match.group(0).rstrip(".") if email_match else "Unknown",
        "phone": phone_match.group(0) if phone_match else "Unknown",
        "location": loc_match.group(1).strip() if loc_match else "Unknown",
        "age": int(age_match.group(1)) if age_match else None
    }

    return extracted

# --------------------------
# API Extraction Wrapper
# --------------------------
def extract_info(chat_text: str, api_client=None, model: str = "gpt-4o-mini") -> Dict[str, Any]:
    """
    Extract info via API if available; otherwise use fallback.
    """
    if api_client is None:
        # No API client, use fallback
        return fallback_extract(chat_text)

    # Try calling the API
    try:
        functions = [
            {
                "name": "extract_info",
                "description": "Extract contact info from chat as JSON.",
                "parameters": extract_schema
            }
        ]

        resp = api_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": chat_text}],
            functions=functions,
            function_call={"name": "extract_info"}
        )

        # Extract function arguments
        func_args = resp.choices[0].message.get("function_call", {}).get("arguments", "{}")
        parsed = json.loads(func_args) if isinstance(func_args, str) else func_args
        return parsed

    except Exception as e:
        # On API failure, fallback
        print("⚠️ API function calling failed — using fallback extractor. Error:", e)
        return fallback_extract(chat_text)

# --------------------------
# Validation Function
# --------------------------
def validate_extracted(obj: Dict[str, Any], schema: Dict[str, Any]) -> None:
    """
    Validate extracted dictionary against JSON schema and print results.
    """
    try:
        validate(instance=obj, schema=schema)
        print(f"Extracted: {obj}")
        print("✅ Valid according to schema\n")
    except ValidationError as ve:
        print(f"Extracted: {obj}")
        print(f"❌ Not valid: {ve.message}\n")

# --------------------------
# DEMO RUN
# --------------------------
sample_chats = [
    "Hi, I'm Rajiv Sharma. My email is rajiv.sharma@example.com and I'm 24. I live in Pune. My phone is +91-9876543210.",
    "Hello, this is Priya. You can reach me at priya_contact@gmail.com. Age: 30. Location: Bengaluru.",
    "Hey, it's Ankit — ankit123@mail.com. Phone 9998887776."
]

for chat in sample_chats:
    print("---- CHAT ----")
    print(chat)
    extracted = fallback_extract(chat)
    validate_extracted(extracted, extract_schema)


---- CHAT ----
Hi, I'm Rajiv Sharma. My email is rajiv.sharma@example.com and I'm 24. I live in Pune. My phone is +91-9876543210.
Extracted: {'name': 'Rajiv Sharma', 'email': 'rajiv.sharma@example.com', 'phone': '+91-9876543210', 'location': 'Pune', 'age': 24}
✅ Valid according to schema

---- CHAT ----
Hello, this is Priya. You can reach me at priya_contact@gmail.com. Age: 30. Location: Bengaluru.
Extracted: {'name': 'Priya', 'email': 'priya_contact@gmail.com', 'phone': 'Unknown', 'location': 'priya_contact', 'age': 30}
✅ Valid according to schema

---- CHAT ----
Hey, it's Ankit — ankit123@mail.com. Phone 9998887776.
Extracted: {'name': 'Ankit', 'email': 'ankit123@mail.com', 'phone': '9998887776', 'location': 'Unknown', 'age': None}
✅ Valid according to schema



In [20]:
import re
import jsonschema

# JSON schema definition
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},
        "phone": {"type": "string"},
        "location": {"type": "string"},
        "age": {"type": "integer", "minimum": 0}
    },
    "required": ["name", "email", "phone", "location", "age"]
}

def extract_fields_from_text(text):
    """Extract structured info from free-form chat using regex + defaults."""

    # Regex patterns
    name_match = re.search(r"(?:I'm|I am|This is|this is|My name is)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)", text)
    email_match = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", text)
    phone_match = re.search(r"(?:\+91[- ]?)?\d{10}", text)
    location_match = re.search(r"(?:live in|from|Location[: ]+)\s*([A-Z][a-zA-Z]+)", text)
    age_match = re.search(r"(?:Age[: ]*|I'?m\s+|I am\s+)?\b(\d{1,2})\b(?=\s?(?:years|yrs|yo|$|\.))", text)

    # Extract values or assign defaults
    name = name_match.group(1) if name_match else "Unknown"
    email = email_match.group(0).rstrip(".") if email_match else "unknown@example.com"
    phone = phone_match.group(0) if phone_match else "Unknown"
    location = location_match.group(1) if location_match else "Unknown"
    age = int(age_match.group(1)) if age_match else 0

    # Build dictionary
    extracted = {
        "name": name,
        "email": email,
        "phone": phone,
        "location": location,
        "age": age
    }

    # Validate against schema
    try:
        jsonschema.validate(instance=extracted, schema=schema)
        print(f"Extracted: {extracted}")
        print("✅ Valid according to schema\n")
    except jsonschema.ValidationError as e:
        print(f"Extracted: {extracted}")
        print(f"❌ Not valid: {e.message}\n")

    return extracted


# === DEMO RUN ===
sample_chats = [
    "Hi, I'm Rajiv Sharma. My email is rajiv.sharma@example.com and I'm 24. I live in Pune. My phone is +91-9876543210.",
    "Hello, this is Priya. You can reach me at priya_contact@gmail.com. Age: 30. Location: Bengaluru.",
    "Hey, it's Ankit — ankit123@mail.com. Phone 9998887776."
]

for chat in sample_chats:
    print("---- CHAT ----")
    print(chat)
    extract_fields_from_text(chat)


---- CHAT ----
Hi, I'm Rajiv Sharma. My email is rajiv.sharma@example.com and I'm 24. I live in Pune. My phone is +91-9876543210.
Extracted: {'name': 'Rajiv Sharma', 'email': 'rajiv.sharma@example.com', 'phone': '+91-9876543210', 'location': 'Pune', 'age': 24}
✅ Valid according to schema

---- CHAT ----
Hello, this is Priya. You can reach me at priya_contact@gmail.com. Age: 30. Location: Bengaluru.
Extracted: {'name': 'Priya', 'email': 'priya_contact@gmail.com', 'phone': 'Unknown', 'location': 'Bengaluru', 'age': 30}
✅ Valid according to schema

---- CHAT ----
Hey, it's Ankit — ankit123@mail.com. Phone 9998887776.
Extracted: {'name': 'Unknown', 'email': 'ankit123@mail.com', 'phone': '9998887776', 'location': 'Unknown', 'age': 0}
✅ Valid according to schema

