In [1]:
import os
from pathlib import Path


# Folder (relative to the notebook's working directory) that receives the
# generated sample emails.
OUTPUT_DIR = Path("emails")

# Create the folder unconditionally. With exist_ok=True this is a no-op for an
# existing directory, but it raises FileExistsError when "emails" exists as a
# regular file, so a broken setup is reported here instead of much later when
# the first email is written.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Start from a clean slate: remove files left over from a previous run.
# Sub-directories are intentionally left alone.
for path in OUTPUT_DIR.glob("*"):
    if path.is_file():
        path.unlink()

print(f"Email output folder ready at: {OUTPUT_DIR.resolve()}")


Email output folder ready at: /Users/samueleko/Library/Mobile Documents/com~apple~CloudDocs/UNT/CSCE 5412/projects/spamharm/machine_learning_section/emails


In [2]:
import random

# Phrases typical of phishing emails. The list is assembled from themed
# sub-lists; the resulting order is the same as a single flat literal.
phishing_keywords = (
    # Account-status scares and urgency cues
    [
        "your account has been suspended",
        "your account has been locked",
        "unusual activity on your account",
        "verify your account",
        "verify your identity",
        "confirm your account details",
        "update your account information",
        "your account will be closed",
        "your account will be deactivated",
        "unauthorized login attempt",
        "we detected a login from a new device",
        "immediate action required",
        "urgent: action required",
        "security alert",
        "final notice",
    ]
    # Requests for credentials, codes and personal data
    + [
        "login to your account",
        "sign in to your account",
        "re-enter your password",
        "confirm your password",
        "enter your PIN",
        "enter your one-time code",
        "enter your verification code",
        "2-step verification code",
        "two factor authentication code",
        "provide your social security number",
        "provide your SSN",
        "confirm your card number",
        "confirm your CVV",
        "update your billing information",
        "upload a copy of your ID",
    ]
    # Impersonated brands and service notices
    + [
        "paypal support",
        "bank of america support",
        "chase online banking",
        "wells fargo online",
        "apple id account",
        "icloud account",
        "google account team",
        "microsoft account security",
        "office 365 admin",
        "dropbox support",
        "onedrive security",
        "amazon account support",
        "delivery failed",
    ]
    # Link and attachment lures
    + [
        "click here to verify",
        "click here to login",
        "click here to restore access",
        "login securely here",
        "follow this link to confirm",
        "access your account using the link below",
        "download secure attachment",
        "open the attached form and complete it",
    ]
)

# Phrases typical of emails that push malware or pirated software. Built from
# themed sub-lists; the resulting order matches a single flat literal.
malicious_keywords = (
    # Instructions to run or install something
    [
        "download and run",
        "run the attached file",
        "open the attachment and execute",
        "install the attached program",
    ]
    # Executable / script file types
    + [
        ".exe",
        ".scr",
        ".bat",
        ".cmd",
        ".vbs",
        ".js",
        "self-extracting archive",
        "portable executable",
    ]
    # Cracks and licence bypasses
    + [
        "full crack",
        "keygen",
        "serial key",
        "license key generator",
        "patch your software",
        "no activation required",
        "nulled version",
        "bypass activation",
        "bypass license",
    ]
    # Fake utilities and fake updates
    + [
        "system optimizer",
        "registry cleaner",
        "pc booster",
        "performance enhancer",
        "virus remover",
        "anti-malware tool",
        "install this update now",
        "critical security update",
    ]
)

# Phrases used to label the "harmful" category (threats, fraud, hateful
# language). Built from themed sub-lists; order matches a flat literal.
harmful_keywords = (
    # Threats and violence
    [
        "bomb threat",
        "shoot",
        "shooting",
        "kill them",
        "eliminate them",
        "attack them",
        "knife attack",
        "massacre",
        "terror attack",
        "commit violence",
        "burn down",
        "blow up",
    ]
    # Forgery, stolen data and money laundering
    + [
        "fake passport",
        "forged passport",
        "fake id card",
        "forged documents",
        "counterfeit money",
        "counterfeit currency",
        "buy stolen credit cards",
        "stolen bank account",
        "fullz",
        "sell your bank login",
        "money laundering",
        "clean your money",
        "drop address",
    ]
    # Hateful / dehumanising language (placeholders, not actual slurs)
    + [
        "explicit racial slur",
        "explicit religious slur",
        "explicit homophobic slur",
        "they are not human",
        "they should be wiped out",
        "get rid of all of them",
        "they don’t belong here",
        "they should be removed",
    ]
)

# Phrases typical of bulk advertising / spam. Built from themed sub-lists;
# the resulting order matches a single flat literal.
spam_keywords = (
    # Urgency and discount hype
    [
        "limited time offer",
        "act now",
        "don’t miss out",
        "while supplies last",
        "today only",
        "exclusive deal",
        "special promotion",
        "best price guaranteed",
        "lowest price ever",
        "incredible savings",
        "unbelievable offer",
    ]
    # Prizes and lotteries
    + [
        "you have been selected",
        "you have won",
        "winner of our lottery",
        "congratulations, you are a winner",
        "claim your prize",
        "claim your reward",
    ]
    # Easy-money promises
    + [
        "get rich quick",
        "earn money from home",
        "work from home and earn",
        "no experience necessary",
        "risk-free trial",
        "guaranteed profit",
    ]
    # Gambling, pills, diets and trading schemes
    + [
        "online casino",
        "sports betting",
        "viagra / cialis / enhancement pills",
        "weight loss miracle",
        "fat burner",
        "detox cleanse",
        "crypto trading bot",
        "forex signal",
        "binary options",
    ]
)

# Phrases typical of legitimate, expected ("solicited") service emails.
# Built from themed sub-lists; the resulting order matches a flat literal.
solicited_keywords = (
    # Billing and order references
    [
        "invoice",
        "receipt",
        "order confirmation",
        "payment received",
        "payment reminder",
        "billing statement",
        "monthly statement",
        "transaction summary",
        "order number",
        "ticket number",
        "reference number",
    ]
    # Appointments and scheduling
    + [
        "appointment reminder",
        "appointment confirmation",
        "meeting scheduled",
        "calendar invite",
        "follow-up appointment",
        "lab results available",
    ]
    # Shipping and delivery
    + [
        "shipping confirmation",
        "your package has shipped",
        "delivery scheduled",
        "tracking number",
    ]
    # Customer-support replies
    + [
        "thank you for contacting support",
        "we have received your request",
        "your case number is",
        "we are working on your request",
        "ticket has been updated",
        "please let us know if you have questions",
    ]
    # Polite sign-offs
    + [
        "sincerely",
        "best regards",
        "kind regards",
    ]
)


In [3]:
# Cell 3 – Subject and body template helpers (updated)

import random
from pathlib import Path

# Placeholder URLs for the "Training link" line of each generated body;
# make_body() picks one of these at random per email.
IMAGE_LINKS = [
    "https://unsplash.com/photos/security-training-example?w=200",
    "https://www.pexels.com/photo/12345/?w=200",
    "https://pixabay.com/photos/id-12345/?w=200",
]

# Subject-line templates, keyed by email category.
# Each template has exactly one {keyword} placeholder that make_subject()
# fills with a phrase from the matching keyword list.
SUBJECT_TEMPLATES = {
    "phishing": [
        "Security alert: {keyword}",
        "Important notice about your account – {keyword}",
        "Action required on your account: {keyword}",
        "Account notification: {keyword}",
        "Review needed: {keyword}",
    ],
    "malicious": [
        "Software update details – {keyword}",
        "Technical file notice: {keyword}",
        "System maintenance information – {keyword}",
        "Attached tool information – {keyword}",
        "Update package overview – {keyword}",
    ],
    "harmful": [
        "Training sample with threat phrase: {keyword}",
        "Content warning – example threat text: {keyword}",
        "Abuse-report sample containing phrase: {keyword}",
        "Safety filter training message – {keyword}",
        "Example of flagged language: {keyword}",
    ],
    "solicited": [
        "Your {keyword}",
        "Notification regarding your recent {keyword}",
        "Update on your {keyword}",
        "Service message: {keyword}",
        "Confirmation regarding your {keyword}",
    ],
    "spam": [
        "Special offer inside – {keyword}",
        "Limited promotion: {keyword}",
        "Marketing message: {keyword}",
        "Promo alert – {keyword}",
        "Deal notification: {keyword}",
    ],
}


def make_subject(category: str, keyword: str, index: int) -> str:
    """
    Compose the subject line for one simulated email.

    A template for ``category`` is chosen at random, ``keyword`` is inserted
    into it, and a suffix of the form `` (Sample XXX-00000)`` is appended,
    where ``XXX`` is the upper-cased first three letters of the category and
    the number is ``index`` zero-padded to five digits. The suffix marks the
    text as training data.

    Raises:
        ValueError: if ``category`` has no entry in ``SUBJECT_TEMPLATES``.
    """
    candidates = SUBJECT_TEMPLATES.get(category)
    if candidates is None:
        raise ValueError(f"Unknown category for subject: {category}")

    headline = random.choice(candidates).format(keyword=keyword)
    prefix = category[:3].upper()

    # The trailing tag keeps the subject recognisable as a sample.
    return f"{headline} (Sample {prefix}-{index:05d})"


def make_body(category: str, keyword: str, index: int) -> str:
    """
    Build the body text for one simulated email example.

    The body is the concatenation of four parts: a disclaimer intro that names
    the category, the sample number and the keyword; a category-specific
    middle section; a "training link" line pointing at a randomly chosen entry
    of ``IMAGE_LINKS``; and a closing reminder.

    Args:
        category: One of "phishing", "malicious", "harmful", "solicited",
            "spam".
        keyword: Phrase to embed in the intro and the middle section.
        index: Sample number shown in the intro.

    Returns:
        The complete body text.

    Raises:
        ValueError: if ``category`` is not one of the five known categories.
            Unknown values are rejected rather than being written out with the
            spam text, matching how ``make_subject`` treats them.
    """
    intro = (
        f"This is a simulated {category} email example (#{index}) created only for "
        "security education and spam-filter research in 2024–2025. "
        "It is not a real message and must never be used to deceive anyone.\n\n"
        f"In real communication, people may misuse phrases like “{keyword}” to pressure, "
        "confuse, or mislead the reader. The text below mimics that style so detection "
        "systems can learn to recognize and filter it.\n\n"
    )

    if category == "phishing":
        middle = (
            "Dear Customer,\n\n"
            "We are writing to show how a suspicious sign-in or account notice can look. "
            f"A real scammer might include wording such as “{keyword}”, pretend to be a "
            "trusted brand, and try to push you into clicking a link or sharing private data.\n\n"
            "For this training sample, everything here is fake and for practice only. "
            "Do not treat it as a real request and never send similar text to anyone.\n\n"
        )
    elif category == "malicious":
        middle = (
            "Hello,\n\n"
            "This sample demonstrates how a message pushing harmful software might sound. "
            f"It could tell someone to “{keyword}” or to trust an attachment that claims to "
            "be a system tool or security update. In real life, those files can hide malware "
            "designed to steal data or damage a computer.\n\n"
            "Use this text only for defensive testing, never as a real instruction.\n\n"
        )
    elif category == "harmful":
        middle = (
            "Important Safety Notice (Training Sample Only)\n\n"
            "The lines in this example show how threatening or hateful language can appear in a message. "
            f"A genuine harmful email might include a phrase like “{keyword}”. "
            "This is shown here so filters can learn to detect and report such content quickly.\n\n"
            "Any real message with this kind of language should be taken seriously and reported to "
            "the proper authorities and support channels.\n\n"
        )
    elif category == "solicited":
        middle = (
            "Dear Customer,\n\n"
            "This is an example of a normal, solicited service email. "
            f"It may use a term such as “{keyword}” because the recipient started a purchase, "
            "booking, or support ticket. A legitimate notice explains the situation calmly and "
            "does not demand secret information or pressure you into rushed choices.\n\n"
        )
    elif category == "spam":
        middle = (
            "Hi there,\n\n"
            "This text shows the tone of a bulk advertising or spam message. "
            f"It leans on phrases like “{keyword}” to encourage quick clicks and sign-ups. "
            "Even when these offers look friendly, they often lead to low-quality sites or clutter "
            "a user’s inbox. Training on samples like this helps filters keep inboxes cleaner.\n\n"
        )
    else:
        # A typo such as "spamm" must not silently produce a spam-styled body
        # under the wrong category name.
        raise ValueError(f"Unknown category for body: {category}")

    # Chosen only after the category is validated; this is the function's
    # single random draw.
    link = random.choice(IMAGE_LINKS)
    link_line = (
        "Training link (safe placeholder to a stock-photo site):\n"
        f"[Open illustration]({link})\n\n"
    )

    closing = (
        "Reminder: this entire email is a controlled example for research and education. "
        "If you ever receive a similar real message, verify it through official channels and "
        "avoid clicking unexpected links or opening unknown attachments.\n"
    )

    return intro + middle + link_line + closing


In [4]:
# Cell 4 – Functions to generate and save the example emails

import random
from pathlib import Path

# Reuse OUTPUT_DIR from Cell 1, the keyword lists from Cell 2 and the
# make_subject / make_body helpers from Cell 3. All of them are touched inside
# the try block so that a cell run out of order yields one clear message
# instead of a bare NameError further down.
try:
    OUTPUT_DIR
    make_subject, make_body

    # Maps each category name to the keyword list its emails draw from.
    CATEGORY_KEYWORDS = {
        "phishing": phishing_keywords,
        "malicious": malicious_keywords,
        "harmful": harmful_keywords,
        "solicited": solicited_keywords,
        "spam": spam_keywords,
    }
except NameError as exc:  # pragma: no cover
    # The original NameError text (which names the missing variable) is
    # included in the message and kept as the exception cause.
    raise RuntimeError(f"Run Cells 1-3 before this cell: {exc}") from exc


def generate_emails_for_category(
    category: str,
    n_min: int,
    n_max: int,
    output_dir: Path | None = None,
) -> int:
    """
    Generate a random number of simulated emails for a single category.

    Returns the number of files created.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    if category not in CATEGORY_KEYWORDS:
        raise ValueError(f"Unknown category: {category}")

    count = random.randint(n_min, n_max)
    keywords = CATEGORY_KEYWORDS[category]

    for idx in range(1, count + 1):
        keyword = random.choice(keywords)
        subject = make_subject(category, keyword, idx)
        body = make_body(category, keyword, idx)

        filename = output_dir / f"{category}_{idx:05d}.txt"
        with filename.open("w", encoding="utf-8") as f:
            f.write(f"Subject: {subject}\n\n")
            f.write(body)

    return count


def generate_all_email_examples(
    phishing_range=(10, 12),
    malicious_range=(10, 12),
    harmful_range=(10, 12),
    solicited_range=(10, 12),
    spam_range=(10, 12),
    output_dir: Path | None = None,
    seed: int | None = None,
) -> dict:
    """
    Generate simulated emails for all categories.

    Default ranges are small for quick tests. For a big dataset you can
    pass ranges like (1675, 2000) for phishing, etc.

    Args:
        phishing_range, malicious_range, harmful_range, solicited_range,
        spam_range: ``(n_min, n_max)`` pairs passed to
            ``generate_emails_for_category`` for the respective category.
        output_dir: Target folder (``Path`` or ``str``). Defaults to
            ``OUTPUT_DIR``. Created if missing; existing ``*.txt`` files in it
            are deleted before generation.
        seed: Optional seed for the global ``random`` generator. When given,
            repeated calls with the same arguments produce the same dataset;
            when ``None`` (default) the generator state is left untouched.

    Returns:
        Dict mapping each category name to the number of files created, in the
        order phishing, malicious, harmful, solicited, spam.
    """
    target_dir = Path(OUTPUT_DIR if output_dir is None else output_dir)

    if seed is not None:
        random.seed(seed)

    # Ensure the folder exists and holds no emails from a previous run.
    target_dir.mkdir(parents=True, exist_ok=True)
    for stale in target_dir.glob("*.txt"):
        if stale.is_file():  # skip a directory that happens to match *.txt
            stale.unlink()

    # Insertion order fixes the generation order (and therefore the order in
    # which random numbers are consumed).
    ranges = {
        "phishing": phishing_range,
        "malicious": malicious_range,
        "harmful": harmful_range,
        "solicited": solicited_range,
        "spam": spam_range,
    }

    return {
        category: generate_emails_for_category(category, *count_range, target_dir)
        for category, count_range in ranges.items()
    }


In [5]:

from pathlib import Path

# Quick smoke run using the small default ranges.
summary = generate_all_email_examples()
saved_to = Path(OUTPUT_DIR).resolve()

print("Files created per category:", summary)
print("All emails saved in:", saved_to)

Files created per category: {'phishing': 11, 'malicious': 12, 'harmful': 12, 'solicited': 11, 'spam': 11}
All emails saved in: /Users/samueleko/Library/Mobile Documents/com~apple~CloudDocs/UNT/CSCE 5412/projects/spamharm/machine_learning_section/emails


In [None]:
# Full-size dataset run. For a quick trial, swap in the small ranges instead:
#   phishing (16, 20), malicious (20, 30), harmful (13, 20),
#   solicited (18, 20), spam (23, 30)
# NOTE(review): the output saved under this cell falls inside the small trial
# ranges, not the ranges below -- re-run the cell before relying on it.
full_run_ranges = {
    "phishing_range": (1675, 2000),
    "malicious_range": (2098, 3000),
    "harmful_range": (1345, 2000),
    "solicited_range": (1876, 2000),
    "spam_range": (2312, 3000),
}

summary = generate_all_email_examples(**full_run_ranges)
print(summary)


{'phishing': 20, 'malicious': 28, 'harmful': 14, 'solicited': 19, 'spam': 26}
