# Lab 2: Sensitive Data Detection & AI Cataloguing - SOLUTIONS

**Data Discovery: Harnessing AI, AGI & Vector Databases - Day 2**

| Duration | Difficulty | Framework | Exercises |
|---|---|---|---|
| 90 min | Intermediate | pandas, re, spacy, scikit-learn, chromadb, matplotlib | 5 |

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from collections import Counter

# NLP
import spacy

# ML & Vector DB
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import chromadb

# Settings
%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

print("Libraries loaded successfully!")

In [None]:
np.random.seed(42)

first_names = ['John', 'Jane', 'Robert', 'Maria', 'David', 'Sarah', 'Michael', 'Emily', 'James', 'Lisa']
last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Wilson']
companies = ['Acme Corp', 'GlobalTech', 'MedPlus Health', 'FinanceFirst', 'DataDriven Inc']
cities = ['New York', 'San Francisco', 'Chicago', 'Boston', 'Seattle', 'Austin', 'Denver', 'Atlanta']

def random_ssn():
    return f"{np.random.randint(100,999)}-{np.random.randint(10,99)}-{np.random.randint(1000,9999)}"

def random_cc():
    return f"{np.random.randint(4000,4999)}-{np.random.randint(1000,9999)}-{np.random.randint(1000,9999)}-{np.random.randint(1000,9999)}"

def random_email(first, last):
    domains = ['company.com', 'email.org', 'corp.net', 'enterprise.io']
    return f"{first.lower()}.{last.lower()}@{np.random.choice(domains)}"

def random_phone():
    return f"({np.random.randint(200,999)}) {np.random.randint(200,999)}-{np.random.randint(1000,9999)}"

templates = {
    'hr_memo': [
        "Employee {name} (SSN: {ssn}) has been promoted to Senior Analyst effective March 2024. Contact: {email}, Phone: {phone}. Based in {city}.",
        "Termination notice for {name}, SSN: {ssn}. Final paycheck to be sent to address on file. HR contact: {email}. Processed by {company}.",
        "{name} from {company} submitted a leave request. Employee ID: EMP-{emp_id}. Emergency contact phone: {phone}. Location: {city}.",
        "Salary adjustment memo: {name} (SSN: {ssn}) annual compensation increased to ${salary:,}. Effective date: January 2024. Department: {company}.",
    ],
    'financial_report': [
        "Invoice #INV-{inv_id} for {company}: Payment of ${amount:,.2f} via credit card {cc}. Approved by {name}. Contact: {email}.",
        "Expense report submitted by {name} ({email}) for ${amount:,.2f}. Corporate card ending {cc_last4}. Reimbursement approved by finance team at {company}.",
        "Quarterly financial summary for {company}: Revenue ${amount:,.2f}. Prepared by {name}, CFO. Confidential. Phone: {phone}.",
        "Wire transfer confirmation: ${amount:,.2f} sent to account ending {acct_last4} for {name} at {company}. Reference: TXN-{txn_id}.",
    ],
    'medical_form': [
        "Patient: {name}, DOB: {dob}, SSN: {ssn}. Diagnosis: Type 2 Diabetes. Prescribed Metformin 500mg. Dr. {doctor} at {city} Medical Center.",
        "Insurance claim for {name} (Member ID: MED-{med_id}). Procedure: Annual physical exam. Provider: {company} Health. Phone: {phone}.",
        "Medical records request for {name}, DOB: {dob}. Records to be sent to {doctor} at {city} General Hospital. Patient email: {email}.",
    ],
    'marketing_data': [
        "Campaign analytics report for {company}: {impressions:,} impressions, {clicks:,} clicks, {conversions} conversions. Manager: {name}, {email}.",
        "Customer profile: {name}, {city}. Purchase history includes {purchases} orders. Email: {email}. Phone: {phone}. Loyalty tier: Gold.",
        "Event registration: {name} from {company} registered for AI Summit 2024 in {city}. Contact: {email}. Dietary: vegetarian.",
    ],
    'legal_document': [
        "Non-disclosure agreement between {name} and {company}. Effective date: January 2024. Jurisdiction: {city}. Contact: {email}.",
        "Data processing agreement: {company} processes personal data of EU residents per GDPR Art. 28. DPO: {name}, {email}, {phone}.",
        "Contract #CTR-{ctr_id} between {name} and {company}. Value: ${amount:,.2f}. Signed in {city}. Witness: {witness}.",
    ],
}

documents = []
for i in range(200):
    doc_type = np.random.choice(list(templates.keys()))
    template = np.random.choice(templates[doc_type])
    first = np.random.choice(first_names)
    last = np.random.choice(last_names)
    name = f"{first} {last}"
    
    doc_text = template.format(
        name=name,
        ssn=random_ssn(),
        cc=random_cc(),
        cc_last4=f"{np.random.randint(1000,9999)}",
        email=random_email(first, last),
        phone=random_phone(),
        city=np.random.choice(cities),
        company=np.random.choice(companies),
        salary=np.random.randint(50000, 200000),
        amount=np.random.uniform(100, 500000),
        emp_id=np.random.randint(10000, 99999),
        inv_id=np.random.randint(10000, 99999),
        txn_id=np.random.randint(100000, 999999),
        acct_last4=f"{np.random.randint(1000,9999)}",
        med_id=np.random.randint(100000, 999999),
        dob=f"{np.random.randint(1,12):02d}/{np.random.randint(1,28):02d}/{np.random.randint(1950,2000)}",
        doctor=f"Dr. {np.random.choice(last_names)}",
        impressions=np.random.randint(10000, 1000000),
        clicks=np.random.randint(100, 50000),
        conversions=np.random.randint(10, 1000),
        purchases=np.random.randint(1, 50),
        ctr_id=np.random.randint(10000, 99999),
        witness=f"{np.random.choice(first_names)} {np.random.choice(last_names)}",
    )
    
    documents.append({
        'doc_id': f'DOC-{i+1:04d}',
        'doc_type': doc_type,
        'text': doc_text,
        'department': doc_type.replace('_', ' ').title().split()[0],
    })

docs_df = pd.DataFrame(documents)
print(f"Generated {len(docs_df)} documents")
print(f"\nDocument type distribution:")
print(docs_df['doc_type'].value_counts())
print(f"\nSample document:")
print(docs_df.iloc[0]['text'])

## Exercise 1.1: Regex PII Scanning - SOLUTION

In [None]:
def scan_pii_regex(text):
    """Scan text for PII using regex patterns."""
    patterns = {
        'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
        'credit_card': re.compile(r'\b\d{4}-\d{4}-\d{4}-\d{4}\b'),
        'email': re.compile(r'\b[\w.+-]+@[\w-]+\.[\w.]+\b'),
        'phone': re.compile(r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}'),
    }
    findings = {}
    for name, pat in patterns.items():
        matches = pat.findall(text)
        if matches:
            findings[name] = matches
    return findings

# Apply to all documents
docs_df['pii_findings'] = docs_df['text'].apply(scan_pii_regex)
docs_df['ssn_count'] = docs_df['pii_findings'].apply(lambda x: len(x.get('ssn', [])))
docs_df['cc_count'] = docs_df['pii_findings'].apply(lambda x: len(x.get('credit_card', [])))
docs_df['email_count'] = docs_df['pii_findings'].apply(lambda x: len(x.get('email', [])))
docs_df['phone_count'] = docs_df['pii_findings'].apply(lambda x: len(x.get('phone', [])))
docs_df['total_pii'] = docs_df['ssn_count'] + docs_df['cc_count'] + docs_df['email_count'] + docs_df['phone_count']

print("PII Detection Summary:")
print(f"  Documents with SSN:         {(docs_df['ssn_count'] > 0).sum()}")
print(f"  Documents with Credit Card: {(docs_df['cc_count'] > 0).sum()}")
print(f"  Documents with Email:       {(docs_df['email_count'] > 0).sum()}")
print(f"  Documents with Phone:       {(docs_df['phone_count'] > 0).sum()}")
print(f"  Documents with any PII:     {(docs_df['total_pii'] > 0).sum()}")

## Exercise 1.2: NER with spaCy - SOLUTION

In [None]:
def extract_ner_entities(text):
    """Extract named entities using spaCy."""
    doc = nlp(text)
    entities = {'PERSON': [], 'ORG': [], 'GPE': []}
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
    return entities

# Apply to all documents
docs_df['ner_entities'] = docs_df['text'].apply(extract_ner_entities)
docs_df['person_count'] = docs_df['ner_entities'].apply(lambda x: len(x['PERSON']))
docs_df['org_count'] = docs_df['ner_entities'].apply(lambda x: len(x['ORG']))
docs_df['gpe_count'] = docs_df['ner_entities'].apply(lambda x: len(x['GPE']))

# Combined PII types count
def count_pii_types(row):
    types = 0
    if row['ssn_count'] > 0: types += 1
    if row['cc_count'] > 0: types += 1
    if row['email_count'] > 0: types += 1
    if row['phone_count'] > 0: types += 1
    if row['person_count'] > 0: types += 1
    if row['org_count'] > 0: types += 1
    if row['gpe_count'] > 0: types += 1
    return types

docs_df['total_pii_types'] = docs_df.apply(count_pii_types, axis=1)

print("NER Detection Summary:")
print(f"  Documents with PERSON:  {(docs_df['person_count'] > 0).sum()}")
print(f"  Documents with ORG:     {(docs_df['org_count'] > 0).sum()}")
print(f"  Documents with GPE:     {(docs_df['gpe_count'] > 0).sum()}")
print(f"\nHybrid Detection (regex + NER):")
print(f"  Avg PII types per doc:  {docs_df['total_pii_types'].mean():.1f}")
print(f"  Max PII types in a doc: {docs_df['total_pii_types'].max()}")

## Exercise 2.1: Risk Scoring - SOLUTION

In [None]:
def compute_risk_score(row):
    """Compute a 0-100 risk score for a document."""
    score = 0
    
    # PII type scores
    if row['ssn_count'] > 0:
        score += 30
    if row['cc_count'] > 0:
        score += 25
    if row['email_count'] > 0:
        score += 10
    if row['phone_count'] > 0:
        score += 10
    
    # NER entity scores
    score += min(row['person_count'] * 5, 15)
    
    # Document type bonus
    if row['doc_type'] == 'medical_form':
        score += 15
    elif row['doc_type'] == 'financial_report':
        score += 10
    
    return min(score, 100)

# Apply risk scoring
docs_df['risk_score'] = docs_df.apply(compute_risk_score, axis=1)

# Assign risk tiers
def assign_tier(score):
    if score >= 76:
        return 'Critical'
    elif score >= 51:
        return 'High'
    elif score >= 26:
        return 'Medium'
    else:
        return 'Low'

docs_df['risk_tier'] = docs_df['risk_score'].apply(assign_tier)

print("Risk Tier Distribution:")
print(docs_df['risk_tier'].value_counts())
print(f"\nAverage risk score: {docs_df['risk_score'].mean():.1f}")
print(f"Median risk score:  {docs_df['risk_score'].median():.1f}")

## Exercise 2.2: Compliance Dashboard - SOLUTION

In [None]:
def build_compliance_dashboard(docs_df):
    """Build a 2x2 compliance dashboard."""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. PII type distribution
    pii_counts = {
        'SSN': (docs_df['ssn_count'] > 0).sum(),
        'Credit Card': (docs_df['cc_count'] > 0).sum(),
        'Email': (docs_df['email_count'] > 0).sum(),
        'Phone': (docs_df['phone_count'] > 0).sum(),
        'Person (NER)': (docs_df['person_count'] > 0).sum(),
        'Org (NER)': (docs_df['org_count'] > 0).sum(),
        'Location (NER)': (docs_df['gpe_count'] > 0).sum(),
    }
    colors = ['#ef4444', '#f59e0b', '#3b82f6', '#10b981', '#8b5cf6', '#ec4899', '#06b6d4']
    axes[0, 0].barh(list(pii_counts.keys()), list(pii_counts.values()), color=colors)
    axes[0, 0].set_title('PII Type Distribution (docs containing each type)', fontsize=12)
    axes[0, 0].set_xlabel('Number of Documents')

    # 2. Risk tier distribution
    tier_counts = docs_df['risk_tier'].value_counts()
    tier_order = ['Critical', 'High', 'Medium', 'Low']
    tier_colors = ['#ef4444', '#f59e0b', '#3b82f6', '#10b981']
    tier_vals = [tier_counts.get(t, 0) for t in tier_order]
    axes[0, 1].pie(tier_vals, labels=tier_order, colors=tier_colors,
                   autopct='%1.1f%%', startangle=90)
    axes[0, 1].set_title('Risk Tier Distribution', fontsize=12)

    # 3. Average risk by doc type
    avg_risk = docs_df.groupby('doc_type')['risk_score'].mean().sort_values()
    avg_risk.plot(kind='barh', ax=axes[1, 0], color='#8b5cf6')
    axes[1, 0].set_title('Average Risk Score by Document Type', fontsize=12)
    axes[1, 0].set_xlabel('Average Risk Score')

    # 4. Regulation applicability
    regulations = {
        'GDPR': (docs_df['person_count'] > 0).sum(),
        'HIPAA': (docs_df['doc_type'] == 'medical_form').sum(),
        'PCI-DSS': (docs_df['cc_count'] > 0).sum(),
        'CCPA': ((docs_df['email_count'] > 0) & (docs_df['phone_count'] > 0)).sum(),
    }
    axes[1, 1].bar(regulations.keys(), regulations.values(),
                   color=['#3b82f6', '#10b981', '#f59e0b', '#ef4444'])
    axes[1, 1].set_title('Documents Subject to Regulation', fontsize=12)
    axes[1, 1].set_ylabel('Number of Documents')

    plt.suptitle('Compliance Dashboard', fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()

build_compliance_dashboard(docs_df)

## Exercise 2.3: Catalogue Integration - SOLUTION

In [None]:
def build_risk_catalogue(docs_df):
    """Build a vector catalogue with risk metadata."""
    # Load model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode texts
    texts = docs_df['text'].tolist()
    embeddings = model.encode(texts)

    # Create ChromaDB collection
    client = chromadb.Client()
    collection = client.create_collection("risk_catalogue")

    # Add with metadata
    collection.add(
        embeddings=embeddings.tolist(),
        documents=texts,
        ids=docs_df['doc_id'].tolist(),
        metadatas=[
            {
                'doc_type': row['doc_type'],
                'risk_score': int(row['risk_score']),
                'risk_tier': row['risk_tier'],
            }
            for _, row in docs_df.iterrows()
        ]
    )

    print(f"Risk catalogue built with {collection.count()} documents")
    return collection, model

collection, model = build_risk_catalogue(docs_df)

In [None]:
def filtered_risk_search(collection, query, risk_tier=None, n_results=5):
    """Search the risk catalogue with optional risk tier filter."""
    where_clause = None
    if risk_tier:
        where_clause = {'risk_tier': risk_tier}

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=where_clause
    )

    filter_str = f" [filtered: {risk_tier}]" if risk_tier else ""
    print(f"\nQuery: '{query}'{filter_str}")
    print("-" * 70)
    for i, (doc, dist, meta) in enumerate(zip(
        results['documents'][0],
        results['distances'][0],
        results['metadatas'][0]
    )):
        print(f"  {i+1}. [{meta['risk_tier']:8} | {meta['doc_type']:18} | risk:{meta['risk_score']:3}]")
        print(f"     {doc[:80]}...")
        print(f"     (distance: {dist:.3f})")

# Test queries
filtered_risk_search(collection, "employee personal data", risk_tier="Critical")
filtered_risk_search(collection, "financial transactions and payments")
filtered_risk_search(collection, "medical patient records", risk_tier="Critical")

## Summary

In this lab, you learned how to:

1. **Scan** documents for PII using regex patterns with precision/recall awareness
2. **Extract** named entities with spaCy NER for hybrid PII detection
3. **Score** data assets for compliance risk on a 0-100 scale
4. **Visualise** compliance posture with a multi-chart dashboard
5. **Integrate** risk metadata into a vector catalogue for filtered semantic search

---

*Data Discovery: Harnessing AI, AGI & Vector Databases | AI Elevate*