In [None]:
# =============================
# 1) Setup imports
# =============================
import re, json, subprocess, sys
import spacy
from typing import List, Dict, Any
from transformers import pipeline

# =============================
# 2) spaCy loader with fallback
# =============================
def load_spacy_model():
    try:
        print("🔍 Trying to load en_core_web_trf...")
        return spacy.load("en_core_web_trf")
    except OSError:
        print("⚠️ en_core_web_trf not found. Installing fallback en_core_web_sm...")
        subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        return spacy.load("en_core_web_sm")

nlp = load_spacy_model()

# =============================
# 3) Hugging Face pipelines
# =============================
hf_ner = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple"
)

sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english"
)

# =============================
# 4) Regex PII patterns
# =============================
PII_PATTERNS = {
    "email": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
    "phone": re.compile(r'\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,10}\b'),
    "address_like": re.compile(r'\b\d{1,5}\s+(?:[A-Za-z0-9]+\s){0,4}(?:street|st|road|rd|avenue|ave|lane|ln|boulevard|blvd|drive|dr)\b', re.I),
    "url": re.compile(r'\bhttps?://[^\s]+\b'),
    "credit_card_like": re.compile(r'\b(?:\d[ -]*?){13,16}\b'),
}

def detect_pii_regex(text: str) -> List[Dict[str,str]]:
    hits = []
    for label, patt in PII_PATTERNS.items():
        for m in patt.finditer(text):
            hits.append({"type": label, "text": m.group(0), "span": [m.start(), m.end()]})
    return hits

# =============================
# 5) Entity extractors
# =============================
def run_spacy_ner(text: str):
    doc = nlp(text)
    return [{"source":"spacy", "label": ent.label_, "text": ent.text, "span":[ent.start_char, ent.end_char]} for ent in doc.ents]

def run_hf_ner(text: str):
    out = hf_ner(text)
    merged = []
    current = None

    # Merge sub-tokens (e.g., In + ##fosys → Infosys)
    for e in out:
        label = e["entity_group"]
        word = e["word"].replace("##", "")
        if current and current["label"] == label:
            current["text"] += word
            current["score"] = max(current["score"], e["score"])
        else:
            if current:
                merged.append(current)
            current = {"source":"hf", "label": label, "text": word, "score": e["score"]}
    if current:
        merged.append(current)

    return merged

def extract_evidence(text: str):
    evidence = []
    evidence.extend(detect_pii_regex(text))
    evidence.extend(run_spacy_ner(text))
    evidence.extend(run_hf_ner(text))
    return evidence

# =============================
# 6) Scoring system
# =============================
WEIGHTS = {
    "email": 25, "phone": 25, "credit_card_like": 40, "url": 10, "address_like": 20,
    "PERSON": 15, "GPE": 15, "LOC": 12, "ORG": 12, "DATE": 8, "TIME": 6, "MONEY": 20,
    "default_entity": 5,
}

def compute_exposure_score(evidence):
    total, details = 0, []
    for e in evidence:
        t = e.get("type") or e.get("label") or ""
        w = WEIGHTS.get(t, WEIGHTS.get(t.upper(), WEIGHTS["default_entity"]))
        score_factor = float(e.get("score", 1.0))
        text_len_factor = min(1.0, len(e.get("text","")) / 30.0 + 0.2)
        add = w * score_factor * text_len_factor
        details.append({"evidence": e, "weight": w, "factor": score_factor * text_len_factor, "contribution": add})
        total += add
    return {"raw_score": total, "exposure_score": min(100, round(total, 2)), "details": details}

# =============================
# 7) Analyzer
# =============================
def analyze_text(text: str):
    if not text.strip():
        return {"error":"empty text"}
    evidence = extract_evidence(text)
    score = compute_exposure_score(evidence)
    return {"input_text": text, "sentiment": sentiment(text[:512]), "evidence": evidence, "score": score}

# =============================
# 8) Pretty printing
# =============================
def pretty_print_result(res):
    if "error" in res: print(res["error"]); return
    print("=== Exposure Score:", res["score"]["exposure_score"], "/100 ===\n")
    print("Sentiment:", res["sentiment"])
    print("\nEvidence found:")
    for d in res["score"]["details"]:
        e = d["evidence"]
        print(f"- [{e.get('source','regex')}] {e.get('text')} ({e.get('type') or e.get('label')}) → +{d['contribution']:.2f}")

# =============================
# 9) Test run
# =============================
sample = """
My name is Srinivas, and I have been working at Infosys Technologies in Bangalore.
You can reach me at srinivas.reddy@example.com or call me at +91 98765 12345.
Every morning, I leave my apartment around 8:00 AM to go to the office.
I traveled to New York on 12th August 2023 for a meeting.
Check my site: https://srinivasportfolio.com
"""
res = analyze_text(sample)
pretty_print_result(res)

# =============================
# 10) (Optional) Gradio UI
# =============================
import gradio as gr

def gradio_analyze(text):
    res = analyze_text(text)
    if "error" in res:
        return "<b>No text provided.</b>"
    html = f"<h3>Exposure Score: {res['score']['exposure_score']}/100</h3>"
    html += "<h4>Evidence</h4><ul>"
    for d in res["score"]["details"]:
        e = d["evidence"]
        label = e.get('type') or e.get('label') or 'unknown'
        html += f"<li><b>{label}</b>: {e.get('text')}</li>"
    html += "</ul>"
    html += f"<h4>Sentiment</h4><pre>{res['sentiment']}</pre>"
    return html

# ✅ Launch Gradio properly
demo = gr.Interface(
    fn=gradio_analyze,
    inputs=gr.Textbox(lines=6, placeholder="Enter your text here..."),
    outputs="html",
    title="Digital Shadow — Text Analyzer"
)

# For Colab use share=True, for local just demo.launch()
demo.launch(share=True, debug=True)



🔍 Trying to load en_core_web_trf...
⚠️ en_core_web_trf not found. Installing fallback en_core_web_sm...


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu


=== Exposure Score: 100 /100 ===

Sentiment: [{'label': 'NEGATIVE', 'score': 0.8548497557640076}]

Evidence found:
- [regex] srinivas.reddy@example.com (email) → +25.00
- [regex] https://srinivasportfolio.com (url) → +10.00
- [spacy] Srinivas (PERSON) → +7.00
- [spacy] Infosys Technologies (ORG) → +10.40
- [spacy] Bangalore (PERSON) → +7.50
- [spacy] 98765 12345 (DATE) → +4.53
- [spacy] 8:00 AM (TIME) → +2.60
- [spacy] New York (GPE) → +7.00
- [spacy] 12th August 2023 (DATE) → +5.87
- [hf] Srini (PER) → +1.47
- [hf] Infosys Technologies (ORG) → +10.39
- [hf] BangaloreNew York (LOC) → +9.19
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c84a49be25fb927c81.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/sp