<a href="https://colab.research.google.com/github/sriharshamutnuru/AI_Learning/blob/main/Day12_Guardrails.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ============================================================
# 📘 Day 12 — Safety and Guardrails (Intro)
# Objective: Input validation + Output sanitization for LLM pipelines
# ============================================================

!pip install --quiet openai==1.109.1

from openai import OpenAI
from google.colab import userdata
import re
import json

# --- Step 1: Setup ---
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
MODEL = "gpt-4o-mini"

# --- Step 2: Define Safety Guardrails ---
banned_words = [
    "password", "confidential", "credit card", "social security", "kill", "hate"
]

pii_patterns = {
    "email": r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}",
    "phone": r"\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b",
    "ssn": r"\\b\\d{3}-\\d{2}-\\d{4}\\b"
}

def validate_input(text):
    """Rejects input if it contains banned words or PII patterns."""
    for word in banned_words:
        if re.search(rf"\\b{word}\\b", text, re.IGNORECASE):
            return False, f"❌ Input rejected: contains banned word '{word}'."

    for key, pattern in pii_patterns.items():
        if re.search(pattern, text):
            return False, f"❌ Input rejected: contains potential {key}."

    return True, "✅ Input passed validation."

def sanitize_output(output_text):
    """Redacts PII and banned words from model output."""
    sanitized = output_text
    for key, pattern in pii_patterns.items():
        sanitized = re.sub(pattern, f"[REDACTED_{key.upper()}]", sanitized)
    for word in banned_words:
        sanitized = re.sub(rf"\\b{word}\\b", "[REDACTED_WORD]", sanitized, flags=re.IGNORECASE)
    return sanitized

# --- Step 3: Prompt Function ---
def get_model_output(context):
    prompt = f"""
    You are an intelligent assistant.
    Extract the main problem and its solution from this text.
    Return valid JSON with keys 'problem' and 'solution'.
    Text: {context}
    """

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a safe and structured LLM."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
    )
    return response.choices[0].message.content.strip()

# --- Step 4: Test Scenarios ---
test_inputs = [
    "Azure backup failed because the user's password expired.",
    "Data ingestion job failed due to schema mismatch between tables.",
    "Customer email is john.doe@gmail.com and backup job failed due to timeout.",
]

results = []
for text in test_inputs:
    print("\n==============================")
    print(f"🧩 Input: {text}")
    is_valid, message = validate_input(text)
    print(message)

    if is_valid:
        output = get_model_output(text)
        clean_output = sanitize_output(output)
        print("\n🧾 Model Output (Sanitized):\n", clean_output)
    else:
        print("⛔ Model call skipped for unsafe input.")

print("\n✅ Guardrails validation complete.")



🧩 Input: Azure backup failed because the user's password expired.
✅ Input passed validation.

🧾 Model Output (Sanitized):
 ```json
{
  "problem": "Azure backup failed due to the user's password expiration.",
  "solution": "Update the user's password to resolve the issue."
}
```

🧩 Input: Data ingestion job failed due to schema mismatch between tables.
✅ Input passed validation.

🧾 Model Output (Sanitized):
 {
    "problem": "Data ingestion job failed due to schema mismatch between tables.",
    "solution": "Resolve the schema mismatch between the tables."
}

🧩 Input: Customer email is john.doe@gmail.com and backup job failed due to timeout.
✅ Input passed validation.

🧾 Model Output (Sanitized):
 {
  "problem": "Backup job failed due to timeout.",
  "solution": "Investigate the cause of the timeout and implement measures to prevent it from occurring in future backup jobs."
}

✅ Guardrails validation complete.
