<a href="https://colab.research.google.com/github/sriharshamutnuru/AI_Learning/blob/main/Day14_Structured_Extraction_Pipeline_(End_to_End_Project).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# 📘 Day 14 — Structured Extraction Pipeline (End-to-End Project)
# ============================================================

!pip install --quiet openai==1.109.1 jsonschema pandas

from openai import OpenAI
from google.colab import userdata
import re, json
import pandas as pd
from jsonschema import validate, ValidationError

# --- Step 1: Setup ---
# OpenAI API client; the key is fetched from Colab's `userdata` store under
# the name 'OPENAI_API_KEY'.
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
# Model identifier passed as `model=` to the chat completion request.
MODEL = "gpt-4o-mini"

# --- Step 2: Safety Guardrails ---
# Words/phrases that must not appear in the input and are masked in the output.
banned_words = ["password", "confidential", "credit card"]
# Regex sources for PII detection, keyed by the label used in messages.
# NOTE: these are raw strings, so regex escapes take a single backslash
# (`\.`, `\b`, `\d`); a doubled backslash would match a literal backslash
# character and the pattern would never fire on real emails or phone numbers.
pii_patterns = {
    "email": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
    "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
}


def _banned_word_regex(word):
    """Return a whole-word regex source matching *word* literally."""
    # re.escape keeps any regex metacharacters in a banned phrase literal.
    return rf"\b{re.escape(word)}\b"


def validate_input(text):
    """Screen input text against the banned-word and PII guardrails.

    Banned words are checked first (case-insensitively, on word boundaries),
    then each PII pattern. The check stops at the first hit.

    Args:
        text: The document text to screen.

    Returns:
        A ``(ok, message)`` tuple: ``(False, reason)`` naming the first banned
        word or PII kind found, otherwise ``(True, "✅ Input clean.")``.
    """
    for word in banned_words:
        if re.search(_banned_word_regex(word), text, re.IGNORECASE):
            return False, f"❌ Contains banned word '{word}'."
    for k, p in pii_patterns.items():
        if re.search(p, text):
            return False, f"❌ Contains potential {k}."
    return True, "✅ Input clean."


def sanitize_output(text):
    """Mask PII and banned words in text.

    Each PII match is replaced with ``[REDACTED_<KIND>]`` (kind upper-cased),
    then each banned word (case-insensitive, whole word) with ``[REDACTED]``.

    Args:
        text: The text to sanitize.

    Returns:
        The text with every match replaced.
    """
    for k, p in pii_patterns.items():
        text = re.sub(p, f"[REDACTED_{k.upper()}]", text)
    for word in banned_words:
        text = re.sub(_banned_word_regex(word), "[REDACTED]", text, flags=re.IGNORECASE)
    return text

# --- Step 3: Prompt Template ---
def build_prompt(document):
    """Build the user prompt asking for JSON extraction from a document.

    The document is embedded between triple double-quote delimiters and the
    model is told to answer with JSON holding the keys "title", "summary",
    "keywords" and "category".

    Args:
        document: The text to embed in the prompt.

    Returns:
        The full prompt string, with a leading and a trailing newline.
    """
    fence = '"' * 3
    prompt_lines = [
        "",  # leading newline
        "You are a structured data extractor.",
        "",
        "Task:",
        "Extract key information from the following document in JSON with keys:",
        '"title", "summary", "keywords", "category"',
        "",
        "Document:",
        f"{fence}{document}{fence}",
        "",
        "Return ONLY valid JSON.",
        "",  # trailing newline
    ]
    return "\n".join(prompt_lines)

# --- Step 4: LLM Call ---
def extract_structured_data(document, temperature=0.0, top_p=0.9):
    """Send the document to the chat model and return its sanitized reply.

    Builds the prompt with ``build_prompt``, calls the chat completions
    endpoint with ``MODEL``, and passes the first choice's text through
    ``sanitize_output``.

    Args:
        document: The text to extract structured data from.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        The model's reply as a stripped, sanitized string; an empty string
        when the reply carries no text content.
    """
    prompt = build_prompt(document)
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a precise and safe JSON extractor."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        top_p=top_p,
        max_tokens=300
    )
    # The API types `message.content` as optional; fall back to "" so a reply
    # without text does not crash on `.strip()`.
    content = response.choices[0].message.content or ""
    return sanitize_output(content.strip())

# --- Step 5: Schema Validation ---
# JSON Schema for the model's reply: an object that must carry all four keys,
# with string values except "keywords", which must be an array.
schema = dict(
    type="object",
    properties=dict(
        title=dict(type="string"),
        summary=dict(type="string"),
        keywords=dict(type="array"),
        category=dict(type="string"),
    ),
    required=["title", "summary", "keywords", "category"],
)

def validate_json(output):
    """Parse the model output as JSON and validate it against ``schema``.

    A single Markdown code fence wrapping the whole reply (three backticks,
    optionally tagged ``json``) is removed before parsing, since chat models
    may fence JSON even when asked for raw JSON.

    Args:
        output: The model's reply text.

    Returns:
        ``(True, data)`` with the parsed object when it parses and satisfies
        the schema, otherwise ``(False, error_text)``.
    """
    candidate = output
    if isinstance(candidate, str):
        fenced = re.fullmatch(
            r"```(?:json)?\s*(.*?)\s*```",
            candidate.strip(),
            flags=re.DOTALL | re.IGNORECASE,
        )
        if fenced:
            candidate = fenced.group(1)
    try:
        data = json.loads(candidate)
        validate(instance=data, schema=schema)
    except (json.JSONDecodeError, ValidationError) as e:
        return False, str(e)
    return True, data

# --- Step 6: Run Pipeline ---
document = """
Microsoft Azure provides cloud services that help businesses manage data, deploy AI models,
and scale applications efficiently. It also ensures security and compliance across environments.
"""

# Gate on the input guardrails first: the model is only called when they pass.
valid, message = validate_input(document)
if valid:
    print("✅ Input validated.")
    output = extract_structured_data(document)
    print("\n🧠 Model Output:\n", output)

    # `result` is the parsed dict on success, or the error text on failure.
    is_valid, result = validate_json(output)
    if not is_valid:
        print("\n❌ JSON Schema Error:\n", result)
    else:
        print("\n✅ JSON Schema Validated.")
        # One-row table: each extracted key becomes a column.
        df = pd.DataFrame([result])
        display(df)
        df.to_csv("structured_output.csv", index=False)
        print("\n📁 Saved: structured_output.csv")
else:
    print(message)

# Printed on every path, including the failure branches above.
print("\n🏁 Structured Extraction Pipeline complete.")


✅ Input validated.

🧠 Model Output:
 {
  "title": "Microsoft Azure Cloud Services",
  "summary": "Microsoft Azure provides cloud services that help businesses manage data, deploy AI models, and scale applications efficiently. It also ensures security and compliance across environments.",
  "keywords": ["cloud services", "data management", "AI models", "application scaling", "security", "compliance"],
  "category": "Cloud Computing"
}

✅ JSON Schema Validated.


Unnamed: 0,title,summary,keywords,category
0,Microsoft Azure Cloud Services,Microsoft Azure provides cloud services that h...,"[cloud services, data management, AI models, a...",Cloud Computing



📁 Saved: structured_output.csv

🏁 Structured Extraction Pipeline complete.
