In [None]:
# Generate a chat-style cybersecurity support tickets dataset (raw + sanitized).
# Each ticket includes a realistic conversation transcript between customer and support.
#
# Outputs (in ./data):
# - cyber_tickets_chat_raw.jsonl
# - cyber_tickets_chat_sanitized.jsonl
#
import json, random, re, uuid, datetime
from pathlib import Path

# Seed the `random` module so every random.* draw below repeats run to run.
random.seed(7)

# Directory that receives the generated JSONL files; created if missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Value pools that the generator functions below sample from.
first_names = ["Alice","Bob","Carol","David","Eve","Frank","Grace","Heidi","Ivan","Judy","Mallory","Niaj","Olivia","Peggy","Rupert","Sybil","Trent","Uma","Victor","Wendy"]
last_names  = ["Nguyen","Garcia","Kim","Singh","Novak","Kowalski","Ibrahim","Santos","Khan","Kovač","Janssen","Ivanov","Hansen","Rossi","Silva","Williams","Brown","Taylor","Martin","Lee"]
domains     = ["acme-corp.com","contoso.com","globex.net","initech.io","umbrella-sec.com"]
orgs        = ["BlueShield Bank","Zephyr Telecom","IronPeak Insurance","NovaHealth","Stratos Energy"]
departments = ["HR","Finance","IT","Security","Support"]
clearances  = ["public","internal","confidential"]
severities  = ["low","medium","high","critical"]

# Ticket subjects; convo_for_issue() compares against these strings verbatim
# to pick a transcript.
issue_types = [
    "Phishing email reported",
    "VPN connection failing",
    "Malware alert from EDR",
    "Suspicious login from new location",
    "2FA not working",
    "Data loss prevention (DLP) alert",
    "Ransomware IOC inquiry",
    "SIEM false positive",
    "Unusual outbound traffic",
    "Endpoint isolation follow-up",
    "TLS certificate warning",
    "S3 bucket access denied",
    "Firewall rule change request",
    "Compliance evidence request",
    "Incident post-mortem question",
    "Zero-day patch rollout status",
    "Okta group policy confusion",
    "Email quarantine release",
    "Network DNS issues",
    "Device not showing in EDR"
]

# Sixty asset tags, WS-1000 through WS-1059.
assets = [f"WS-{1000+i}" for i in range(60)]

def random_phone():
    """Return a random phone string shaped like ``+1-NNN-NNN-NNNN``.

    The first two groups are drawn from 200-999 and the last from 1000-9999,
    in that order, using the module-level ``random`` generator.
    """
    area = random.randint(200, 999)
    exchange = random.randint(200, 999)
    subscriber = random.randint(1000, 9999)
    return f"+1-{area}-{exchange}-{subscriber}"

def random_account():
    """Return a random eight-digit account number as a string."""
    number = random.randint(10000000, 99999999)
    return str(number)

def random_ip():
    """Return a dotted-quad string whose four octets are each drawn from 1-254."""
    octets = []
    for _ in range(4):
        octets.append(str(random.randint(1, 254)))
    return ".".join(octets)

def random_datetime():
    """Return a random minute-precision ISO timestamp on or after 2024-11-01.

    The offset from the base date is 0-200 days, 0-23 hours and 0-59 minutes,
    drawn in that order from the module-level ``random`` generator. The
    result is a naive (no timezone) string such as ``2025-01-17T08:42``.
    """
    # Import under a private alias: another cell in this file runs
    # `from datetime import datetime`, which rebinds the bare name `datetime`
    # to the class and would break `datetime.datetime(...)` here.
    import datetime as _dt

    day_offset = random.randint(0, 200)
    hour_offset = random.randint(0, 23)
    minute_offset = random.randint(0, 59)
    moment = _dt.datetime(2024, 11, 1) + _dt.timedelta(
        days=day_offset, hours=hour_offset, minutes=minute_offset
    )
    return moment.isoformat(timespec="minutes")

def make_identity():
    """Create a random (name, email, phone) triple for a ticket contact.

    The name is a random first/last pair, the email is
    ``first.last@<random domain>`` in lower case, and the phone comes from
    random_phone().
    """
    first = random.choice(first_names)
    last = random.choice(last_names)
    name = f"{first} {last}"
    parts = name.split()
    mailbox = f"{parts[0].lower()}.{parts[1].lower()}"
    email = f"{mailbox}@{random.choice(domains)}"
    return name, email, random_phone()

def convo_for_issue(issue, name, asset, ip):
    """Return a canned support-chat transcript for the given issue type.

    Args:
        issue: Issue-type string; compared verbatim against the known issue
            names. Any other value yields the generic fallback transcript.
        name: Contact name, interpolated into the first support reply of the
            phishing and VPN transcripts.
        asset: Asset tag, interpolated into the malware-alert and
            endpoint-isolation transcripts.
        ip: Accepted but not referenced anywhere in this function.

    Returns:
        A list of ``{"role": ..., "message": ...}`` dicts, where role is
        either "customer" or "support", mimicking a resolved support chat.
    """
    c = []
    if issue == "Phishing email reported":
        c = [
            {"role":"customer","message": f"Hi, I received an email that looks suspicious and it asked me to sign in. I'm not sure if it's legit."},
            {"role":"support","message": f"Thanks for contacting support, {name}. Please avoid clicking any links. I'll check your mailbox and the sender reputation."},
            {"role":"support","message": "I've confirmed it's phishing. I've removed the message from your mailbox and blocked the sender across the organization."},
            {"role":"customer","message": "Oh good. I was worried I might have clicked it."},
            {"role":"support","message": "No activity from that link is visible. I've also reset your sessions and you can change your password at your convenience. You're all set."}
        ]
    elif issue == "VPN connection failing":
        c = [
            {"role":"customer","message": "Hello, my VPN keeps failing to connect after I enter my credentials."},
            {"role":"support","message": f"Thanks, {name}. Let’s check your MFA and client config. Please try resyncing your authenticator and confirm your device time is accurate."},
            {"role":"customer","message": "I resynced the app and updated the time. Still failing."},
            {"role":"support","message": "Understood. I’ve refreshed your group membership and pushed a clean profile. Please update the client and try again."},
            {"role":"customer","message": "That worked. I can connect now."},
            {"role":"support","message": "Great. The issue was a stale profile and out-of-sync MFA token. You should be good going forward."}
        ]
    elif issue == "Malware alert from EDR":
        c = [
            {"role":"customer","message":"Hi, my workstation popped up a malware alert from the endpoint agent."},
            {"role":"support","message": f"Thanks for the heads-up. I’m isolating {asset} to prevent any potential spread while we scan."},
            {"role":"support","message":"The scan found and quarantined the malicious file. No persistence was detected."},
            {"role":"customer","message":"Can I start working again?"},
            {"role":"support","message":"Yes. I’ve released isolation and added the indicators to our blocklist. You’re cleared to continue."}
        ]
    elif issue == "Suspicious login from new location":
        c = [
            {"role":"customer","message":"I got an alert about a sign-in from somewhere I’ve never been."},
            {"role":"support","message":"Thanks for reporting. I’ve revoked your active sessions and required a password reset."},
            {"role":"support","message":"I also enabled an additional conditional access rule to reduce risky sign-ins. Please confirm you can log in now."},
            {"role":"customer","message":"I can. Thanks for the quick help."},
            {"role":"support","message":"All set. We’ll monitor for further attempts and tune geo policies if needed."}
        ]
    elif issue == "2FA not working":
        c = [
            {"role":"customer","message":"My authenticator codes are being rejected."},
            {"role":"support","message":"Please check that your device time is synced automatically and update the app to the latest version."},
            {"role":"customer","message":"Time was off. I've adjusted it, but still no luck."},
            {"role":"support","message":"No worries. I’ve reset your 2FA enrollment. Please re-register your device and try again."},
            {"role":"customer","message":"That did it. I’m back in."},
            {"role":"support","message":"Great. Time drift was the cause. You should be good to go."}
        ]
    elif issue == "Data loss prevention (DLP) alert":
        c = [
            {"role":"customer","message":"I got a DLP warning when sending a spreadsheet to a vendor."},
            {"role":"support","message":"The policy flagged potential sensitive content. I’ve reviewed the logs and the file did contain restricted terms."},
            {"role":"support","message":"I’ve shared a secure transfer link and added an exception for the vendor domain for this case. Please resend using the secure link."},
            {"role":"customer","message":"Resent with the link. No warning this time."},
            {"role":"support","message":"Perfect. We’ll keep the exception scoped and time-bound."}
        ]
    elif issue == "Ransomware IOC inquiry":
        c = [
            {"role":"customer","message":"We saw a report about new ransomware indicators. Do we need to do anything on our side?"},
            {"role":"support","message":"I’ve added the indicators to our detections and scanned recent telemetry across endpoints."},
            {"role":"support","message":"No matches were found. Your environment is clear for those indicators."},
            {"role":"customer","message":"Thanks. Please keep us posted if anything changes."},
            {"role":"support","message":"Will do. Continuous monitoring is in place."}
        ]
    elif issue == "SIEM false positive":
        c = [
            {"role":"customer","message":"We keep getting an alert every hour that doesn’t seem to be real."},
            {"role":"support","message":"I’m reviewing the rule and recent event samples. It looks like a benign pattern that the rule didn’t account for."},
            {"role":"support","message":"I’ve tuned the rule to reduce noise without losing coverage. The alert volume should drop now."},
            {"role":"customer","message":"Much quieter already. Thank you."},
            {"role":"support","message":"Glad to hear it. We’ll keep an eye on it today."}
        ]
    elif issue == "Unusual outbound traffic":
        c = [
            {"role":"customer","message":"Our firewall shows unexpected outbound connections."},
            {"role":"support","message":"I’m correlating with DNS and endpoint data now. The traffic maps to a software updater."},
            {"role":"support","message":"I’ve added the domains to the allowlist after verification. The connections are legitimate."},
            {"role":"customer","message":"That explains it. Thanks."},
            {"role":"support","message":"All good. We’ve documented the exception and set a reminder to review it."}
        ]
    elif issue == "Endpoint isolation follow-up":
        c = [
            {"role":"customer","message":"My device was isolated earlier. Can I get it back online?"},
            {"role":"support","message":f"I’m running a final scan on {asset} to be sure."},
            {"role":"support","message":"Scan is clean and patches are up to date. I’m releasing isolation now."},
            {"role":"customer","message":"I’m online again. Thanks."},
            {"role":"support","message":"Great. Let us know if anything unusual pops up."}
        ]
    elif issue == "TLS certificate warning":
        c = [
            {"role":"customer","message":"Browsers are warning about our site’s certificate."},
            {"role":"support","message":"I’ve checked the chain and the hostname coverage. The intermediate was missing and the SAN list needed an update."},
            {"role":"support","message":"I installed the full chain and renewed the cert. Please try again."},
            {"role":"customer","message":"Warning is gone now."},
            {"role":"support","message":"Perfect. Renewal automation is set to prevent this in the future."}
        ]
    elif issue == "S3 bucket access denied":
        c = [
            {"role":"customer","message":"I can’t access an S3 bucket I used yesterday."},
            {"role":"support","message":"Checking the bucket policy and your IAM role… There’s a deny statement that overrides your allow."},
            {"role":"support","message":"I’ve corrected the policy and added least-privilege access. Please try again."},
            {"role":"customer","message":"Access works now."},
            {"role":"support","message":"Great. Logging is enabled for auditing purposes."}
        ]
    elif issue == "Firewall rule change request":
        c = [
            {"role":"customer","message":"We need outbound access to a new service for our app."},
            {"role":"support","message":"Understood. I’ll stage a temporary rule and test connectivity first."},
            {"role":"support","message":"Tests passed. I’ve deployed the change with a rollback plan and documented the owner and review date."},
            {"role":"customer","message":"Connection succeeds now."},
            {"role":"support","message":"Excellent. The rule is time-bound and will be reviewed."}
        ]
    elif issue == "Compliance evidence request":
        c = [
            {"role":"customer","message":"Auditors are asking for evidence of patch compliance."},
            {"role":"support","message":"I’ve exported last month’s patch reports and control summaries."},
            {"role":"support","message":"I’ll deliver them via the secure portal and tag them for your audit case."},
            {"role":"customer","message":"Received—thanks for the quick turnaround."},
            {"role":"support","message":"Any follow-up items, let us know."}
        ]
    elif issue == "Incident post-mortem question":
        c = [
            {"role":"customer","message":"Do we know the root cause from last week’s incident?"},
            {"role":"support","message":"Yes. A configuration drift on a gateway triggered a cascade of errors."},
            {"role":"support","message":"The configuration has been corrected and guardrails added. I’ve shared the summary to the incident record."},
            {"role":"customer","message":"Thanks for closing the loop."},
            {"role":"support","message":"Any more questions, I’m here to help."}
        ]
    elif issue == "Zero-day patch rollout status":
        c = [
            {"role":"customer","message":"What’s the status on the new zero-day patch?"},
            {"role":"support","message":"Pilot is complete with no regressions. Broad deployment is at 60% and finishing today."},
            {"role":"support","message":"I’ll update the dashboard and notify you when it reaches 100%."},
            {"role":"customer","message":"Sounds good. Please keep us posted."},
            {"role":"support","message":"Will do."}
        ]
    elif issue == "Okta group policy confusion":
        c = [
            {"role":"customer","message":"Some users aren’t getting expected access in Okta groups."},
            {"role":"support","message":"I’m reviewing assignments and rules. The dynamic group filter excluded a subset unintentionally."},
            {"role":"support","message":"I corrected the filter and re-evaluated memberships. Access should reflect properly now."},
            {"role":"customer","message":"Confirmed—access is fixed."},
            {"role":"support","message":"Great. I’ll add a validation to catch that condition early."}
        ]
    elif issue == "Email quarantine release":
        c = [
            {"role":"customer","message":"A legitimate email got stuck in quarantine."},
            {"role":"support","message":"I’ve reviewed the message and released it. The score was borderline."},
            {"role":"support","message":"I’ve adjusted the rule slightly and added the sender to a safe list after verification."},
            {"role":"customer","message":"Thanks, I received it now."},
            {"role":"support","message":"Glad it’s resolved."}
        ]
    elif issue == "Network DNS issues":
        c = [
            {"role":"customer","message":"We can’t resolve some domains intermittently."},
            {"role":"support","message":"I’m checking resolver health and upstream latency."},
            {"role":"support","message":"We saw a spike at one upstream. I’ve switched the path and added caching policy tweaks."},
            {"role":"customer","message":"Resolution is stable again."},
            {"role":"support","message":"Good to hear. We’ll keep monitoring."}
        ]
    elif issue == "Device not showing in EDR":
        c = [
            {"role":"customer","message":"One device isn’t appearing in the EDR console."},
            {"role":"support","message":"I’m verifying the sensor status and enrollment key."},
            {"role":"support","message":"The sensor was outdated. I’ve pushed a reinstall and confirmed the device checks in now."},
            {"role":"customer","message":"I can see it on our side as well."},
            {"role":"support","message":"All set. We also added a job to alert on stale sensors."}
        ]
    else:
        # Fallback transcript for any issue string not listed above.
        c = [
            {"role":"customer","message":"We’re seeing an issue in our environment."},
            {"role":"support","message":"I’m investigating the logs and recent changes."},
            {"role":"support","message":"A configuration fix has been applied and monitoring is in place."},
            {"role":"customer","message":"Issue resolved on our end."},
            {"role":"support","message":"Glad to hear it. We’ll keep watch."}
        ]
    return c

def make_ticket(i: int):
    """Build one raw ticket dict with random contact, asset and issue data.

    Args:
        i: Position of the ticket in the batch. Not used by the generator;
            kept so existing callers keep working.

    Returns:
        A dict holding ticket metadata, the contact's PII fields, a free-text
        ``body`` that embeds that PII, and a ``conversation`` list produced
        by convo_for_issue().
    """
    name, email, phone = make_identity()
    account = random_account()
    ip = random_ip()
    org = random.choice(orgs)
    issue = random.choice(issue_types)
    asset = random.choice(assets)
    body = f"Customer {name} from {org} reports: {issue} on asset {asset}. Contact {email}, phone {phone}. Reference account {account}. Source IP seen {ip}."
    convo = convo_for_issue(issue, name, asset, ip)
    # Draw the 8-hex-character id from the seeded `random` stream. The
    # previous uuid.uuid4() call reads OS entropy, so ids changed on every
    # run even though random.seed() is set for reproducibility.
    ticket_id = f"{random.getrandbits(32):08x}"
    doc = {
        "ticket_id": ticket_id,
        "created_at": random_datetime(),
        "org": org,
        "department": random.choice(departments),
        "clearance": random.choice(clearances),
        "severity": random.choice(severities),
        "issue_type": issue,
        "customer_id": f"CUST-{random.randint(1000,9999)}",
        "tags": random.sample(["phishing","vpn","edr","siem","okta","network","dlp","s3","tls","compliance","2fa","firewall","ransomware","ioc","dns"], k=3),
        "contact_name": name,
        "contact_email": email,
        "contact_phone": phone,
        "account_number": account,
        "source_ip": ip,
        "asset": asset,
        "body": body,
        "conversation": convo
    }
    return doc

# Generate the batch of 20 raw tickets.
tickets = list(map(make_ticket, range(20)))

# Persist them as JSON Lines: one ticket object per line, UTF-8, non-ASCII kept as-is.
raw_path = DATA_DIR / "cyber_tickets_chat_raw.jsonl"
with raw_path.open("w", encoding="utf-8") as f:
    for t in tickets:
        print(json.dumps(t, ensure_ascii=False), file=f)

# --- Sanitization ---
# Patterns used by anonymize_text() to locate PII inside free text.

# Local part and domain accept any Unicode word character so addresses built
# from names with diacritics (e.g. "kovač") are matched too; an ASCII-only
# class left those addresses untouched.
EMAIL_RE = re.compile(r"[\w.%+-]+@[\w.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"\+?\d{1,3}[-.\s]?\d{2,3}[-.\s]?\d{3}[-.\s]?\d{4}")
ACCOUNT_RE = re.compile(r"\b\d{8,12}\b")
# Each octet is 0-255. The earlier `2[0-5]{2}` alternative rejected values
# such as 206 or 249, so addresses containing them were never replaced.
IP_RE = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d{1,2})\.){3}(?:25[0-5]|2[0-4]\d|1?\d{1,2})\b")
# Two consecutive capitalised words; the second may contain Latin diacritics.
NAME_RE = re.compile(r"\b([A-Z][a-z]+)\s([A-Z][a-zÁ-ž]+)\b")

def anonymize_text(s: str, mapping: dict, ticket_id: str) -> str:
    """Replace emails, phones, account numbers, IPs and names in ``s`` with placeholders.

    Args:
        s: Text to scrub.
        mapping: Shared dict, mutated in place. Keys look like
            ``"<ticket_id>:<kind>:<matched text>"`` and values are the
            placeholder assigned to that match, so a repeated value within
            one ticket always receives the same placeholder.
        ticket_id: Namespace prefix for the mapping keys.

    Returns:
        The text with every match replaced. Placeholders are numbered by the
        size of ``mapping`` at the moment they are created, so the counter is
        shared across all kinds.
    """
    # (pattern, key namespace, placeholder prefix) in the order they must run.
    rules = (
        (EMAIL_RE, "email", "EMAIL"),
        (PHONE_RE, "phone", "PHONE"),
        (ACCOUNT_RE, "acct", "ACCT"),
        (IP_RE, "ip", "IP"),
        (NAME_RE, "name", "CUSTOMER"),
    )

    def make_replacer(kind, label):
        def replace(match):
            key = f"{ticket_id}:{kind}:{match.group(0)}"
            if key not in mapping:
                mapping[key] = f"{label}-{len(mapping)+1}"
            return mapping[key]
        return replace

    for pattern, kind, label in rules:
        s = pattern.sub(make_replacer(kind, label), s)
    return s

# Build sanitized copies of every ticket. `mapping` records which placeholder
# replaced which original value (keyed per ticket).
sanitized = []
mapping = {}
for t in tickets:
    tid = t["ticket_id"]
    t2 = dict(t)

    # Replace the known contact name literally before the regex pass.
    # NAME_RE alone matches the first two capitalised words it meets, so in
    # "Customer Alice Nguyen ..." it consumed "Customer Alice" and left the
    # surname in the sanitized output.
    known_name = str(t["contact_name"]).strip()
    name_token = None
    if known_name:  # str.replace("") would inject the token between every character
        name_token = mapping.setdefault(f"{tid}:name:{known_name}", f"CUSTOMER-{len(mapping)+1}")

    for field in ["contact_name","contact_email","contact_phone","account_number","source_ip","body"]:
        text = str(t2[field])
        if name_token is not None:
            text = text.replace(known_name, name_token)
        t2[field] = anonymize_text(text, mapping, tid)

    conv2 = []
    for turn in t["conversation"]:
        m = dict(turn)
        text = m["message"]
        if name_token is not None:
            text = text.replace(known_name, name_token)
        m["message"] = anonymize_text(text, mapping, tid)
        conv2.append(m)
    t2["conversation"] = conv2
    sanitized.append(t2)

# Write the sanitized tickets as JSON Lines next to the raw file.
san_path = DATA_DIR / "cyber_tickets_chat_sanitized.jsonl"
with san_path.open("w", encoding="utf-8") as f:
    for t in sanitized:
        print(json.dumps(t, ensure_ascii=False), file=f)

print("Wrote:", raw_path, "and", san_path)


In [None]:
# Re-declare the data paths so this cell can run without re-running the generator cell.
DATA_DIR = Path("data")
raw_path = DATA_DIR / "cyber_tickets_chat_raw.jsonl"
san_path = DATA_DIR / "cyber_tickets_chat_sanitized.jsonl"

def load_jsonl(p):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        p: ``pathlib.Path`` of a UTF-8 file holding one JSON document per line.

    Returns:
        One decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Iterate the file rather than str.splitlines(): splitlines() also breaks
    # on U+2028/U+2029 and similar, which json.dumps(ensure_ascii=False)
    # leaves unescaped inside strings, and that would cut a record in half.
    # Blank lines (e.g. a trailing empty line) are skipped instead of being
    # fed to json.loads.
    with p.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]

# Raw (unsanitized) tickets, one dict per JSONL line.
raw_docs = load_jsonl(raw_path)

In [None]:
import re, ipaddress, hashlib
from datetime import datetime, timezone

# --- helpers ---------------------------------------------------------------
def org_to_domain(org: str) -> str:
    """Derive a mock ``.com`` domain from an organisation name.

    The name is lower-cased and every character outside ``a-z0-9`` is
    removed, e.g. ``"BlueShield Bank"`` -> ``"blueshieldbank.com"``.
    """
    base = re.sub(r'[^a-z0-9]+', '', org.lower())
    # Always ".com": the previous conditional picked "com" on both branches,
    # so it has been dropped. Kept simple and predictable for mock data.
    return f"{base}.com"

def ensure_iso(ts: str) -> str:
    """Normalise a timestamp string to UTC ISO-8601 with a trailing ``Z``.

    Args:
        ts: An ISO-like timestamp such as ``YYYY-MM-DDTHH:MM``,
            ``YYYY-MM-DDTHH:MM:SS``, optionally followed by ``Z`` or a
            numeric UTC offset.

    Returns:
        The instant in UTC, e.g. ``"2024-11-01T10:00:00Z"``. Naive inputs are
        taken to already be UTC; inputs carrying an offset are converted to
        UTC. If the value cannot be parsed (or is not a string), the current
        UTC time is returned as a best-effort fallback.
    """
    # Imported locally: another cell in this file runs `import datetime`,
    # which rebinds the bare name `datetime` to the module.
    from datetime import datetime, timezone

    text = ts.strip() if isinstance(ts, str) else ""
    # datetime.fromisoformat only accepts a literal "Z" from Python 3.11 on.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    else:
        # Convert rather than overwrite tzinfo, otherwise "10:00+02:00"
        # would be reported as 10:00Z instead of 08:00Z.
        parsed = parsed.astimezone(timezone.utc)
    return parsed.isoformat().replace("+00:00", "Z")

def is_e164(phone: str) -> bool:
    """Tell whether ``phone`` is a ``+`` followed by 2-15 digits, the first non-zero."""
    matched = re.fullmatch(r'^\+[1-9]\d{1,14}$', phone)
    return matched is not None

def redact(text: str) -> str:
    """Mask emails, phone numbers, account numbers and IPv4 addresses in ``text``.

    Matches are replaced with the fixed tokens ``[EMAIL]``, ``[PHONE]``,
    ``[ACCT]`` and ``[IP]``, applied in that order.

    A phone number is a ``+`` followed by 8-15 digits, optionally separated
    by ``-``, ``.`` or a space, so both ``+15551234567`` and
    ``+1-555-123-4567`` are masked. Any remaining standalone run of six or
    more digits is treated as an account number.
    """
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '[EMAIL]', text)
    # The leading "+" is required: with an optional "+" and no separators the
    # pattern skipped dashed phone numbers entirely and instead labelled bare
    # 8-digit account numbers as [PHONE].
    text = re.sub(r'\+[1-9](?:[-. ]?\d){7,14}', '[PHONE]', text)
    text = re.sub(r'\b\d{6,}\b', '[ACCT]', text)
    text = re.sub(r'\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\b', '[IP]', text)
    return text

def valid_ipv4(ip: str) -> bool:
    """Return True when ``ip`` is accepted by ``ipaddress.IPv4Address``."""
    try:
        ipaddress.IPv4Address(ip)
    except ValueError:
        # ipaddress.AddressValueError subclasses ValueError. Catching only
        # that keeps KeyboardInterrupt/SystemExit from being swallowed, which
        # the previous bare `except:` did.
        return False
    return True

def normalize_tags(issue_type: str, tags: list) -> list:
    """Build the normalised, sorted tag list for a ticket.

    Args:
        issue_type: Ticket issue description; keywords in it add inferred tags.
        tags: Tags already attached to the ticket. ``None`` is treated as empty
            and non-string entries are ignored.

    Returns:
        A sorted list containing:
        * tags inferred from ``issue_type``,
        * supplied tags that belong to the controlled vocabulary, and
        * supplied tags with an ``ioc:``, ``mitre:`` or ``product:`` prefix,
          passed through unchanged.
    """
    allowed = {"edr","malware","identity","mfa","compliance","evidence","isolation","vpn","phishing","firewall"}

    s = issue_type.lower()
    inferred = set()
    if "edr" in s or "malware" in s: inferred |= {"edr","malware"}
    if "login" in s or "mfa" in s or "2fa" in s: inferred |= {"identity","mfa"}
    if "compliance" in s or "audit" in s: inferred |= {"compliance","evidence"}
    if "isolation" in s: inferred |= {"edr","isolation"}

    supplied = {t for t in (tags or []) if isinstance(t, str)}
    passthrough = {t for t in supplied if t.startswith(("ioc:", "mitre:", "product:"))}
    # Supplied tags go through the vocabulary filter too. Previously only the
    # inferred set was filtered, so "vpn", "phishing" and "firewall" could
    # never appear in the result even though they are in the vocabulary.
    return sorted(((inferred | supplied) & allowed) | passthrough)

def clearance_to_tlp(clearance: str) -> tuple[str, bool]:
    """Map a clearance level to a ``(TLP label, customer_visible)`` pair.

    ``public`` -> ``CLEAR``, ``internal`` -> ``AMBER+STRICT`` and anything
    else -> ``AMBER``; matching is case-insensitive. The visibility flag is
    True in every case.
    """
    labels = {
        "public": "CLEAR",
        "internal": "AMBER+STRICT",  # visible to customer but controlled
    }
    label = labels.get(clearance.lower(), "AMBER")
    return (label, True)

def infer_category(issue_type: str) -> str:
    """Pick a coarse category for a ticket from keywords in its issue type.

    Rules are checked in order (edr, identity, compliance) and the first hit
    wins; ``"other"`` is returned when nothing matches. Matching is
    case-insensitive substring search.
    """
    lowered = issue_type.lower()
    keyword_rules = (
        ("edr", ("edr", "malware")),
        ("identity", ("login", "mfa", "2fa")),
        ("compliance", ("compliance", "evidence")),
    )
    for category, keywords in keyword_rules:
        if any(word in lowered for word in keywords):
            return category
    return "other"

def guess_detection(issue_type: str) -> dict:
    """Return a stub detection record inferred from the issue type.

    Issue types mentioning malware or EDR get a generic EDR detection, those
    mentioning a login get an identity-provider detection, and everything
    else gets an empty dict. Matching is case-insensitive.
    """
    lowered = issue_type.lower()
    endpoint_related = any(word in lowered for word in ("malware", "edr"))
    if endpoint_related:
        return {"product":"GenericEDR","rule_id":"EDR-9001","signature":"Suspicious PowerShell","mitre":["T1059.001"]}
    if "login" not in lowered:
        return {}
    return {"product":"IdP","rule_id":"IDP-4012","signature":"Risky sign-in (impossible travel)","mitre":["T1078"]}

# --- main fixer ------------------------------------------------------------
def fix_tickets(tickets: list[dict]) -> list[dict]:
    """Return restructured copies of raw tickets for downstream indexing.

    For each ticket this normalises the timestamp, rebuilds the reporter
    email on the organisation's derived domain, best-effort normalises the
    phone towards E.164, maps clearance to a TLP label, derives category and
    tags, validates the source IP, adds a redacted body, fills in
    status/resolution/detection/IoC stubs, and stamps each conversation turn
    with a timestamp (one minute apart, starting one minute after
    ``created_at``) and an author.

    Args:
        tickets: Raw ticket dicts. Neither the list, the dicts nor their
            nested ``iocs`` dicts are modified.

    Returns:
        A new list of ticket dicts. The legacy ``contact_*`` and ``clearance``
        keys are removed; all other original keys are kept.
    """
    # Imported locally: another cell in this file runs `import datetime`,
    # which rebinds the bare name `datetime` to the module.
    from datetime import datetime, timedelta, timezone

    out = []
    for t in tickets:
        t = dict(t)  # shallow copy
        issue_type = t.get("issue_type") or ""

        # 1) normalize timestamps
        t["created_at"] = ensure_iso(t.get("created_at") or "")
        # 2) email domains: reporter should match org domain
        reporter_email = t.get("contact_email") or ""
        reporter_name = t.get("contact_name") or ""
        cust_domain = org_to_domain(t.get("org") or "")
        # use left part from given email if present, else derive from name
        if "@" in reporter_email:
            local = reporter_email.split("@")[0]
        else:
            local = re.sub(r'\s+','.', reporter_name.lower())
        # The "user" fallback covers both branches; previously it applied
        # only to the name-derived one, so "@x.com" produced an empty local part.
        local = local or "user"
        t_reporter_email = f"{local}@{cust_domain}"
        t_phone = t.get("contact_phone") or ""
        if not is_e164(t_phone):
            # best-effort normalize: strip non-digits, prepend '+' if plausible
            digits = re.sub(r'\D','', t_phone)
            if digits and not digits.startswith("0"): t_phone = f"+{digits}"
        # 3) clearance→TLP + visibility
        tlp, cust_visible = clearance_to_tlp(t.get("clearance") or "internal")
        # 4) category + tags
        t_tags = normalize_tags(issue_type, t.get("tags") or [])
        category = infer_category(issue_type)
        # 5) network/IP sanity
        ip = t.get("source_ip","")
        t_ip = ip if valid_ipv4(ip) else None
        # 6) move PII out of body, create redacted body
        body = t.get("body") or ""
        redacted_body = redact(body)
        # 7) add status/resolution scaffold
        status = t.get("status") or "resolved"
        resolution = t.get("resolution") or {"summary":"Auto-fixed per mock-data pass."}
        # 8) detection + IoCs (stub if missing)
        detection = t.get("detection") or guess_detection(issue_type)
        # Copy before adding keys: `t` is only a shallow copy, so writing into
        # the original dict would modify the caller's ticket.
        iocs = dict(t.get("iocs") or {})
        if "sha256" not in iocs and category=="edr":
            # derive a deterministic fake hash from ticket_id
            sha = hashlib.sha256((t.get("ticket_id","") or "seed").encode()).hexdigest()
            iocs["sha256"] = [sha]
        # 9) conversation: add timestamps & authors
        conv = []
        start_ts = datetime.fromisoformat(t["created_at"].replace("Z","+00:00"))
        for i, m in enumerate(t.get("conversation") or []):
            # +1 minute per message; timedelta arithmetic avoids a float round-trip
            ts_iso = (start_ts + timedelta(minutes=i+1)).isoformat().replace("+00:00","Z")
            role = m.get("role")
            author = t_reporter_email if role=="customer" else "t1.analyst@mssp.example"
            conv.append({
                "ts": ts_iso,
                "role": role,
                "author": author,
                "message": (m.get("message") or "").strip()
            })

        # 10) build new structure (keep your original keys where harmless)
        fixed = {
            **t,
            "category": category,
            "tags": t_tags,
            "tlp": tlp,
            "customer_visible": cust_visible,
            "status": status,
            "reporter": {
                "name": reporter_name,
                "email": t_reporter_email,
                "phone": t_phone,
                "role": "end_user"
            },
            "handler": {"tier": "T1", "email": "t1.analyst@mssp.example"},
            "network": {"source_ip": t_ip, "vpn": "vpn" in t_tags},
            "detection": detection,
            "iocs": iocs,
            "redacted_body": redacted_body,
            "pii_fields": ["reporter.phone","account_number"],
            "resolution": resolution,
            "conversation": conv,
        }

        # clean up legacy fields you don’t want exposed to retriever
        # (optional) comment the next lines if you want to keep them:
        for k in ["contact_name","contact_email","contact_phone","clearance"]:
            fixed.pop(k, None)

        out.append(fixed)
    return out

# --- example usage ---------------------------------------------------------
fixed = fix_tickets(raw_docs)  # raw_docs is the list of ticket dicts loaded from the raw JSONL file

In [None]:
# Make the free-text body agree with the normalised reporter contact details,
# then recompute the redacted body from the result.
for t in fixed:
    reporter = t.get("reporter") or {}
    email = reporter.get("email", "")
    phone = reporter.get("phone", "")
    body = t.get("body") or ""
    # Replacements are passed as callables so the values are inserted
    # literally; a plain string would be parsed as a regex template
    # (backslashes, group references). Empty values are skipped so the
    # contact detail is not silently deleted from the body.
    if email:
        body = re.sub(r'[\w\.-]+@[\w\.-]+\.\w+', lambda _m: email, body)
    if phone:
        # Require the leading "+" and end on a digit. With an optional "+"
        # the pattern also matched the 8-digit account number and replaced
        # it with the phone number.
        body = re.sub(r'\+[1-9][\d\-\s]{6,}\d', lambda _m: phone, body)
    t["body"] = body
    t["redacted_body"] = redact(body)

In [None]:
import json

# Write the fixed tickets (not the raw ones) as JSON Lines.
san_path = DATA_DIR / "support_tickets.jsonl"
with san_path.open("w", encoding="utf-8") as f:
    for t in fixed:
        print(json.dumps(t, ensure_ascii=False), file=f)


In [None]:
# ====== RAW TICKET GENERATOR (customer-visible body) ======
import json, random, re, uuid, datetime
from pathlib import Path

# Seed the `random` module so every random.* draw below repeats run to run.
random.seed(7)

# Directory that receives the generated JSONL files; created if missing.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Value pools that the generator functions below sample from.
first_names = ["Alice","Bob","Carol","David","Eve","Frank","Grace","Heidi","Ivan","Judy","Mallory","Niaj","Olivia","Peggy","Rupert","Sybil","Trent","Uma","Victor","Wendy"]
last_names  = ["Nguyen","Garcia","Kim","Singh","Novak","Kowalski","Ibrahim","Santos","Khan","Kovač","Janssen","Ivanov","Hansen","Rossi","Silva","Williams","Brown","Taylor","Martin","Lee"]
orgs        = ["BlueShield Bank","Zephyr Telecom","IronPeak Insurance","NovaHealth","Stratos Energy"]
departments = ["HR","Finance","IT","Security","Support"]
severities  = ["low","medium","high","critical"]

# Ticket subjects; convo_for_issue() compares against these strings verbatim
# to pick a transcript.
issue_types = [
    "Phishing email reported",
    "VPN connection failing",
    "Malware alert from EDR",
    "Suspicious login from new location",
    "2FA not working",
    "Data loss prevention (DLP) alert",
    "Ransomware IOC inquiry",
    "SIEM false positive",
    "Unusual outbound traffic",
    "Endpoint isolation follow-up",
    "TLS certificate warning",
    "S3 bucket access denied",
    "Firewall rule change request",
    "Compliance evidence request",
    "Incident post-mortem question",
    "Zero-day patch rollout status",
    "Okta group policy confusion",
    "Email quarantine release",
    "Network DNS issues",
    "Device not showing in EDR"
]

# Sixty asset tags, WS-1000 through WS-1059.
assets = [f"WS-{1000+i}" for i in range(60)]

def random_phone():
    """Return a random phone string shaped like ``+1-NNN-NNN-NNNN``.

    This is the human-readable form used in raw data; it is not E.164. The
    first two groups are drawn from 200-999 and the last from 1000-9999.
    """
    area = random.randint(200, 999)
    exchange = random.randint(200, 999)
    subscriber = random.randint(1000, 9999)
    return f"+1-{area}-{exchange}-{subscriber}"

def random_account():
    """Return a random eight-digit account number as a string."""
    number = random.randint(10000000, 99999999)
    return str(number)

def random_ip():
    """Return a dotted-quad string whose four octets are each drawn from 1-254."""
    octets = []
    for _ in range(4):
        octets.append(str(random.randint(1, 254)))
    return ".".join(octets)

def random_datetime():
    """Return a random minute-precision ISO timestamp on or after 2024-11-01.

    The offset from the base date is 0-200 days, 0-23 hours and 0-59 minutes,
    drawn in that order from the module-level ``random`` generator. The
    result is a naive (no timezone) string such as ``2025-01-17T08:42``.
    """
    # Import under a private alias: another cell in this file runs
    # `from datetime import datetime`, which rebinds the bare name `datetime`
    # to the class and would break `datetime.datetime(...)` here.
    import datetime as _dt

    day_offset = random.randint(0, 200)
    hour_offset = random.randint(0, 23)
    minute_offset = random.randint(0, 59)
    moment = _dt.datetime(2024, 11, 1) + _dt.timedelta(
        days=day_offset, hours=hour_offset, minutes=minute_offset
    )
    return moment.isoformat(timespec="minutes")

def org_to_domain(org: str) -> str:
    """Turn an organisation name into a simple brand-style ``.com`` domain.

    The name is lower-cased and every run of characters outside ``a-z0-9``
    is dropped, e.g. ``"Zephyr Telecom"`` -> ``"zephyrtelecom.com"``.
    """
    kept_pieces = re.split(r'[^a-z0-9]+', org.lower())
    slug = "".join(kept_pieces)
    return f"{slug}.com"

def make_identity(org: str):
    """Create a random (name, email, phone) triple for a contact at ``org``.

    The email is ``first.last`` in lower case on the domain derived from the
    organisation name, so the address lines up with the customer org.
    """
    first = random.choice(first_names)
    last = random.choice(last_names)
    name = f"{first} {last}"
    parts = name.split()
    mailbox = f"{parts[0].lower()}.{parts[1].lower()}"
    email = f"{mailbox}@{org_to_domain(org)}"   # align to customer org
    return name, email, random_phone()

# ----- Conversations: remove names; keep neutral language -----
def convo_for_issue(issue, asset):
    """Return the scripted chat transcript for an issue type.

    Args:
        issue: Issue label; expected to be one of the known issue-type
            strings. Any other value yields a generic transcript.
        asset: Asset identifier interpolated into the transcripts that
            mention a specific device.

    Returns:
        A new list of ``{"role": ..., "message": ...}`` dicts, where role is
        ``"customer"` or ``"support"``. The wording is neutral and contains
        no names, emails or phone numbers.
    """
    customer, support = "customer", "support"

    # One (role, message) script per issue type; built per call so the
    # asset can be interpolated and callers always get fresh objects.
    scripts = {
        "Phishing email reported": (
            (customer, "Hi, I received an email that looks suspicious and it asked me to sign in. I'm not sure if it's legit."),
            (support, "Thanks for contacting support. Please avoid clicking any links. I’ll check the mailbox and sender reputation."),
            (support, "It’s confirmed phishing. The message was removed and the sender blocked across the organization."),
            (customer, "Oh good. I was worried I might have clicked it."),
            (support, "No activity from that link is visible. Sessions were reset; you can change your password at your convenience."),
        ),
        "VPN connection failing": (
            (customer, "Hello, my VPN keeps failing to connect after I enter my credentials."),
            (support, "Let’s check MFA and client configuration. Please resync your authenticator and confirm device time is accurate."),
            (customer, "I resynced the app and updated the time. Still failing."),
            (support, "Group membership was refreshed and a clean profile pushed. Please update the client and try again."),
            (customer, "That worked. I can connect now."),
            (support, "Great. The issue was a stale profile and out-of-sync MFA token."),
        ),
        "Malware alert from EDR": (
            (customer, "Hi, my workstation popped up a malware alert from the endpoint agent."),
            (support, f"Isolating {asset} to prevent potential spread while scans run."),
            (support, "Scan quarantined a malicious file. No persistence detected."),
            (customer, "Can I start working again?"),
            (support, "Isolation released and indicators added to the blocklist. You’re cleared to continue."),
        ),
        "Suspicious login from new location": (
            (customer, "I got an alert about a sign-in from somewhere I’ve never been."),
            (support, "Active sessions revoked and a password reset required."),
            (support, "An additional conditional access rule was enabled to reduce risky sign-ins. Please confirm access now."),
            (customer, "I can log in. Thanks for the quick help."),
            (support, "We’ll monitor for further attempts and tune geo policies if needed."),
        ),
        "2FA not working": (
            (customer, "My authenticator codes are being rejected."),
            (support, "Ensure device time is synced automatically and update the app to the latest version."),
            (customer, "Time was off. I adjusted it, but still no luck."),
            (support, "Enrollment was reset. Please re-register your device and try again."),
            (customer, "That did it. I’m back in."),
            (support, "Great. Time drift was the cause."),
        ),
        "Data loss prevention (DLP) alert": (
            (customer, "I got a DLP warning when sending a spreadsheet to a vendor."),
            (support, "The policy flagged potential sensitive content. Logs indicate restricted terms."),
            (support, "A secure transfer link was shared; a scoped exception was added for this case. Please resend via the link."),
            (customer, "Resent with the link. No warning this time."),
            (support, "Perfect. The exception is time-bound."),
        ),
        "Ransomware IOC inquiry": (
            (customer, "We saw a report about new ransomware indicators. Do we need to do anything on our side?"),
            (support, "Indicators were added to detections and recent telemetry scanned across endpoints."),
            (support, "No matches were found. Environment is clear for those indicators."),
            (customer, "Thanks. Please keep us posted if anything changes."),
            (support, "Will do. Continuous monitoring is in place."),
        ),
        "SIEM false positive": (
            (customer, "We keep getting an alert every hour that doesn’t seem to be real."),
            (support, "Rule and samples reviewed. Pattern appears benign and not accounted for by the rule."),
            (support, "Rule tuned to reduce noise without losing coverage. Alert volume should drop."),
            (customer, "Much quieter already. Thank you."),
            (support, "We’ll keep an eye on it today."),
        ),
        "Unusual outbound traffic": (
            (customer, "Our firewall shows unexpected outbound connections."),
            (support, "Correlating with DNS and endpoint data. Traffic maps to a software updater."),
            (support, "Domains added to allowlist after verification. Connections are legitimate."),
            (customer, "That explains it. Thanks."),
            (support, "Exception documented with a review date."),
        ),
        "Endpoint isolation follow-up": (
            (customer, "My device was isolated earlier. Can I get it back online?"),
            (support, f"Running a final scan on {asset} to be sure."),
            (support, "Scan is clean and patches are up to date. Releasing isolation now."),
            (customer, "I’m online again. Thanks."),
            (support, "Great. Let us know if anything unusual pops up."),
        ),
        "TLS certificate warning": (
            (customer, "Browsers are warning about our site’s certificate."),
            (support, "Chain and hostname coverage checked. Intermediate was missing; SAN list needed an update."),
            (support, "Installed full chain and renewed the cert. Please try again."),
            (customer, "Warning is gone now."),
            (support, "Renewal automation configured to prevent recurrence."),
        ),
        "S3 bucket access denied": (
            (customer, "I can’t access an S3 bucket I used yesterday."),
            (support, "Bucket policy and IAM role reviewed. A deny statement overrode allow."),
            (support, "Policy corrected and least-privilege access applied. Please try again."),
            (customer, "Access works now."),
            (support, "Logging enabled for auditing."),
        ),
        "Firewall rule change request": (
            (customer, "We need outbound access to a new service for our app."),
            (support, "A temporary rule will be staged and connectivity tested first."),
            (support, "Tests passed. Change deployed with rollback plan and documented owner + review date."),
            (customer, "Connection succeeds now."),
            (support, "The rule is time-bound and will be reviewed."),
        ),
        "Compliance evidence request": (
            (customer, "Auditors are asking for evidence of patch compliance."),
            (support, "Last month’s patch reports and control summaries exported."),
            (support, "They’ll be delivered via the secure portal and tagged to the audit case."),
            (customer, "Received—thanks for the quick turnaround."),
            (support, "Any follow-up items, let us know."),
        ),
        "Incident post-mortem question": (
            (customer, "Do we know the root cause from last week’s incident?"),
            (support, "A configuration drift on a gateway triggered a cascade of errors."),
            (support, "Configuration corrected and guardrails added. Summary attached to the incident record."),
            (customer, "Thanks for closing the loop."),
            (support, "Happy to help."),
        ),
        "Zero-day patch rollout status": (
            (customer, "What’s the status on the new zero-day patch?"),
            (support, "Pilot complete with no regressions. Broad deployment at ~60% and finishing today."),
            (support, "Dashboard will be updated; we’ll notify at 100%."),
            (customer, "Sounds good. Please keep us posted."),
            (support, "Will do."),
        ),
        "Okta group policy confusion": (
            (customer, "Some users aren’t getting expected access in Okta groups."),
            (support, "Assignments and rules reviewed. Dynamic group filter excluded a subset unintentionally."),
            (support, "Filter corrected and memberships re-evaluated. Access should reflect properly now."),
            (customer, "Confirmed—access is fixed."),
            (support, "A validation will be added to catch this condition early."),
        ),
        "Email quarantine release": (
            (customer, "A legitimate email got stuck in quarantine."),
            (support, "Message reviewed and released; score was borderline."),
            (support, "Rule adjusted slightly; sender added to a safe list after verification."),
            (customer, "Thanks, I received it now."),
            (support, "Glad it’s resolved."),
        ),
        "Network DNS issues": (
            (customer, "We can’t resolve some domains intermittently."),
            (support, "Resolver health and upstream latency checked."),
            (support, "A spike at one upstream observed; path switched and caching tweaked."),
            (customer, "Resolution is stable again."),
            (support, "We’ll keep monitoring."),
        ),
        "Device not showing in EDR": (
            (customer, "One device isn’t appearing in the EDR console."),
            (support, "Sensor status and enrollment key verified."),
            (support, "Sensor was outdated; reinstall pushed and device checks in now."),
            (customer, "I can see it on our side as well."),
            (support, "A job was added to alert on stale sensors."),
        ),
    }

    # Generic transcript for any issue label without a dedicated script.
    generic = (
        (customer, "We’re seeing an issue in our environment."),
        (support, "Investigating logs and recent changes."),
        (support, "A configuration fix has been applied and monitoring is in place."),
        (customer, "Issue resolved on our end."),
        (support, "We’ll keep watch."),
    )

    return [
        {"role": speaker, "message": text}
        for speaker, text in scripts.get(issue, generic)
    ]

# ----- Customer-visible body: concise, PII-free narrative -----
def body_customer_visible(issue, asset):
    """Return the short, PII-free summary shown to the customer.

    Args:
        issue: Issue label used to select the summary.
        asset: Asset identifier interpolated into the summaries that
            mention a specific device.

    Returns:
        The summary string for ``issue``, or a generic one-line summary
        when the label has no dedicated text.
    """
    fallback = "Issue investigated; configuration corrected; monitoring in place."

    summaries = {
        "Phishing email reported": (
            "Investigated a suspicious email. Confirmed phishing, removed the message, and blocked the sender. Sessions were reset as a precaution."
        ),
        "VPN connection failing": (
            "Troubleshot VPN authentication and profile. Resynced MFA, refreshed profile, and confirmed successful connection."
        ),
        "Malware alert from EDR": (
            f"Investigated endpoint alert. {asset} was isolated during scanning; malicious file quarantined; no persistence observed. Isolation released."
        ),
        "Suspicious login from new location": (
            "Addressed risky sign-in. Revoked sessions, required password reset, and tightened conditional access. Access restored."
        ),
        "2FA not working": (
            "Resolved 2FA failures by correcting device time and re-enrolling the authenticator. Login successful."
        ),
        "Data loss prevention (DLP) alert": (
            "Reviewed DLP event; content matched restricted terms. Provided secure transfer workflow and scoped exception for this case."
        ),
        "Ransomware IOC inquiry": (
            "Loaded new ransomware IOCs and scanned recent telemetry. No matches found; monitoring continues."
        ),
        "SIEM false positive": (
            "Analyzed recurring alert; tuned rule to reduce noise without losing coverage. Alert volume decreased."
        ),
        "Unusual outbound traffic": (
            "Correlated outbound connections with DNS/endpoint data; identified legitimate updater. Documented a scoped allowlist."
        ),
        "Endpoint isolation follow-up": (
            f"Final scan completed on {asset}; patches current. Isolation released and monitoring in place."
        ),
        "TLS certificate warning": (
            "Validated certificate chain and SANs; installed full chain and renewed cert. Warnings cleared."
        ),
        "S3 bucket access denied": (
            "Reviewed IAM and bucket policy; corrected deny condition. Access restored and auditing enabled."
        ),
        "Firewall rule change request": (
            "Staged and tested outbound rule; deployed with rollback plan and review date. Connectivity established."
        ),
        "Compliance evidence request": (
            "Prepared patch evidence and control summaries; delivered via secure portal with case tagging."
        ),
        "Incident post-mortem question": (
            "Identified root cause as configuration drift; corrected and added guardrails. Shared summary in incident record."
        ),
        "Zero-day patch rollout status": (
            "Zero-day rollout progressing; pilot complete, broad deployment underway. Dashboard to be updated at completion."
        ),
        "Okta group policy confusion": (
            "Investigated group membership mismatch; corrected dynamic filter and re-evaluated memberships. Access reflects expected roles."
        ),
        "Email quarantine release": (
            "Reviewed quarantined message; released after verification and adjusted scoring. Sender added to safe list."
        ),
        "Network DNS issues": (
            "Investigated intermittent resolution; rerouted upstream and tweaked caching. Resolution stabilized."
        ),
        "Device not showing in EDR": (
            "Verified sensor enrollment; pushed reinstall and confirmed check-in. Added alerting for stale sensors."
        ),
    }

    if issue in summaries:
        return summaries[issue]
    return fallback

def make_ticket(i: int):
    """Assemble one synthetic support-ticket document.

    Args:
        i: Position of the ticket in the batch. Accepted for the caller's
            convenience; the body does not use it.

    Returns:
        A dict with ticket metadata, the structured contact/PII fields, the
        asset, the customer-visible ``body`` and the ``conversation``.
        Identifying details live only in the structured contact fields; the
        body and conversation text are written without names, emails or
        phone numbers.
    """
    # Every helper below consumes the shared ``random`` stream, so the call
    # order is kept fixed to keep seeded output stable.
    org = random.choice(orgs)
    contact_name, contact_email, contact_phone = make_identity(org)
    account_number = random_account()
    source_ip = random_ip()
    issue = random.choice(issue_types)
    asset = random.choice(assets)

    summary = body_customer_visible(issue, asset)
    transcript = convo_for_issue(issue, asset)

    tag_pool = [
        "phishing", "vpn", "edr", "siem", "okta",
        "network", "dlp", "s3", "tls", "compliance",
        "2fa", "firewall", "ransomware", "ioc", "dns",
    ]

    # Keyword arguments are evaluated left to right and dict() keeps that
    # order, so both the random draws and the output key order are fixed.
    return dict(
        # First 8 hex digits of a UUID4; drawn from uuid, not from ``random``.
        ticket_id=uuid.uuid4().hex[:8],
        created_at=random_datetime(),
        org=org,
        department=random.choice(departments),
        severity=random.choice(severities),
        issue_type=issue,
        customer_id="CUST-%d" % random.randint(1000, 9999),
        tags=random.sample(tag_pool, k=3),
        # Structured PII is confined to the next five fields.
        contact_name=contact_name,
        contact_email=contact_email,
        contact_phone=contact_phone,
        account_number=account_number,
        source_ip=source_ip,
        asset=asset,
        body=summary,
        conversation=transcript,
    )

# Generate the batch of 20 tickets (indices 0..19 are passed to make_ticket).
tickets = list(map(make_ticket, range(20)))

# Persist as JSON Lines: one ticket per line, non-ASCII characters kept as-is.
raw_path = DATA_DIR / "support_tickets_raw.jsonl"
with open(raw_path, "w", encoding="utf-8") as f:
    for t in tickets:
        print(json.dumps(t, ensure_ascii=False), file=f)

print("Wrote:", raw_path)
# ====== END RAW GENERATOR ======


In [None]:
import json
from pathlib import Path

# Drop the "clearance" field from every in-memory ticket. pop() with a
# default makes this a no-op for tickets that never carried the field.
for t in tickets:
    t.pop("clearance", None)

# Serialize the cleaned `tickets` (the list modified above) BEFORE opening
# the output file: mode "w" truncates on open, so any failure while building
# the lines must not leave an empty dataset behind.
lines = [json.dumps(t, ensure_ascii=False) + "\n" for t in tickets]

# Save back to JSONL, overwriting the raw file in place.
out_path = Path("data/support_tickets_raw.jsonl")
with out_path.open("w", encoding="utf-8") as f:
    f.writelines(lines)

print("Wrote:", out_path)