In [1]:
### Cell 1 – Install and import core libraries

import sys

# Install only once; if already installed you can skip / comment this
!{sys.executable} -m pip install datasets pandas --quiet

import pandas as pd
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Cell 2 – Finance keyword filter

FINANCE_KEYWORDS = [
    "bank", "account", "credit card", "debit card", "card ending",
    "card number", "atm", "statement", "transaction", "transfer",
    "wire transfer", "wire", "refund", "chargeback", "payment",
    "invoice", "bill", "billing", "loan", "mortgage", "overdraft",
    "investment", "returns", "dividend", "tax", "irs", "revenue service",
    "crypto", "bitcoin", "wallet", "exchange", "balance", "overdrawn",
    "fee", "interest", "limit", "cashback", "cash advance"
]

def is_financial_text(text: str) -> bool:
    """Rough heuristic: does the text mention any finance-related word?"""
    t = str(text).lower()
    return any(kw in t for kw in FINANCE_KEYWORDS)


In [3]:
### Cell 3 – Real FinTech emails from zefang-liu/phishing-email-dataset

# Load the dataset and turn its "train" split into a pandas DataFrame.
hf_phish_ds = load_dataset("zefang-liu/phishing-email-dataset")
raw_df = hf_phish_ds["train"].to_pandas()
print("Original HF phishing dataset:", raw_df.shape)

# Dataset label text -> binary target (1 = phishing, 0 = safe).
label_map = {"Phishing Email": 1, "Safe Email": 0}

# Shared column names, binary label attached, rows whose label is not in
# label_map removed, then the finance heuristic recorded per row.
hf_df = raw_df.rename(columns={"Email Text": "text", "Email Type": "email_type"})
hf_df = hf_df.assign(label_bin=hf_df["email_type"].map(label_map)).dropna(subset=["label_bin"])
hf_df = hf_df.assign(is_financial=hf_df["text"].apply(is_financial_text))

# Keep only the rows flagged as financial and add the bookkeeping columns.
financial_mask = hf_df["is_financial"]
fin_hf_df = hf_df.loc[financial_mask].copy()
fin_hf_df = fin_hf_df.assign(
    label_bin=fin_hf_df["label_bin"].astype(int),
    label_str=lambda frame: frame["label_bin"].map({0: "Safe", 1: "Phishing"}),
    source="zefang_phishing_email_dataset",
    synthetic=0,
    category="email",  # generic email
)

print("FinTech subset from HF phishing dataset:", fin_hf_df.shape)
fin_hf_df[["text", "label_str"]].head(10)


Original HF phishing dataset: (18650, 3)
FinTech subset from HF phishing dataset: (8733, 9)


Unnamed: 0,text,label_str
3,\nHello I am your hot lil horny toy.\n I am...,Phishing
4,software at incredibly low prices ( 86 % lower...,Phishing
5,global risk management operations sally congra...,Safe
7,"entourage , stockmogul newsletter ralph velez ...",Phishing
8,"we owe you lots of money dear applicant , afte...",Phishing
9,re : coastal deal - with exxon participation u...,Safe
10,make her beg you to give it to her everynight ...,Phishing
12,"begin forwarded text Date: Wed, 25 Sep 2002 13...",Safe
14,rmmla / ads * * * * * * * * papers solicited f...,Safe
17,"re : 3 . 402 queries : language - speakers , s...",Safe


In [4]:
### Cell 4 – FinTech-related social engineering conversations

# Repeated from Cell 1 so this cell also runs on its own.
from datasets import load_dataset

# Load the dataset and turn its "train" split into a pandas DataFrame.
se_ds = load_dataset("Ngadou/social-engineering-convo")
se_df = se_ds["train"].to_pandas()
print("Original social-engineering-convo:", se_df.shape)

# Classification text -> binary target; both scam variants collapse to 1.
se_label_map = {"Scam": 1, "Likely a Scam": 1, "Not a Scam": 0}

# Shared column names, binary label attached, rows whose classification is not
# in se_label_map removed, then the finance heuristic recorded per row.
se_df = se_df.rename(columns={"Conversation": "text", "Classification": "classification"})
se_df = se_df.assign(label_bin=se_df["classification"].map(se_label_map)).dropna(subset=["label_bin"])
se_df = se_df.assign(is_financial=se_df["text"].apply(is_financial_text))

# Only conversations that mention money / finance, plus bookkeeping columns.
financial_mask = se_df["is_financial"]
fin_se_df = se_df.loc[financial_mask].copy()
fin_se_df = fin_se_df.assign(
    label_bin=fin_se_df["label_bin"].astype(int),
    label_str=lambda frame: frame["label_bin"].map({0: "Safe", 1: "Phishing"}),
    source="social_engineering_convo",
    synthetic=0,
    category="chat",
)

print("FinTech subset from social-engineering-convo:", fin_se_df.shape)
fin_se_df[["text", "label_str"]].head(10)


Original social-engineering-convo: (10, 3)
FinTech subset from social-engineering-convo: (4, 9)


Unnamed: 0,text,label_str
0,"Person 1: 'Hello, we noticed unusual activity ...",Phishing
2,Person 1: 'Congratulations! You have won a lot...,Phishing
4,"Person 1: 'Hello, we're calling from the Tax O...",Phishing
6,"Person 1: 'Hey, it's me. I'm stuck overseas an...",Phishing


In [5]:
### Cell 5 – Synthetic AI-generated FinTech emails (templates, not API)

import random

# Fixed seed so the generated emails are the same on every fresh run.
random.seed(42)

# Vocabulary the templates draw from.
BANK_NAMES = [
    "GlobalTrust Bank",
    "UnionFirst Credit",
    "Metro Savings",
    "BlueRiver Bank",
    "SecureLine Financial",
    "NorthPoint Credit Union",
]

INVEST_PRODUCTS = [
    "crypto index fund",
    "high-yield bond portfolio",
    "AI-driven trading account",
    "real-estate token offering",
]

CURRENCIES = ["USD", "EUR", "GBP"]
CARD_TYPES = ["credit card", "debit card", "business card"]


def make_scam_email(kind: str) -> str:
    """Build one templated phishing email of the requested kind.

    Recognised kinds are "card_lock", "investment", "tax_refund",
    "loan_approval" and "crypto_wallet"; any other value yields the generic
    "irregular activity" template.

    Bank, amount and currency are drawn from the module-level `random` state
    for every kind, in that order, even when the chosen template does not use
    them, so the random stream advances the same way for each call.
    """
    bank = random.choice(BANK_NAMES)
    amount = random.randint(120, 4800)
    curr = random.choice(CURRENCIES)

    if kind == "card_lock":
        card = random.choice(CARD_TYPES)
        return (
            f"URGENT: Your {card} at {bank} has been temporarily locked "
            f"due to suspicious transactions of {amount} {curr}. "
            "Verify your card number and CVV immediately using the secure link below "
            "to avoid permanent suspension."
        )

    if kind == "investment":
        product = random.choice(INVEST_PRODUCTS)
        return (
            f"Our {bank} private desk is offering a limited {product} "
            "promising up to 35% monthly returns with zero risk. "
            "To reserve your allocation, confirm your bank account and send a small verification transfer today."
        )

    if kind == "tax_refund":
        return (
            f"Internal Revenue Service: Our records show an unclaimed tax refund of {amount} {curr}. "
            "To receive your payment, submit your bank login and routing information in the attached secure form."
        )

    if kind == "loan_approval":
        loan_total = amount * 10
        return (
            f"Good news! Your instant loan for {loan_total} {curr} has been approved with a special 0.9% rate. "
            "To release the funds today, pay the one-time processing fee using the bank details in the attached invoice."
        )

    if kind == "crypto_wallet":
        return (
            "Security Alert: Unauthorised withdrawal attempt detected from your crypto wallet. "
            "To restore your balance, reconnect your wallet and re-enter your seed phrase on our verification portal."
        )

    # Fallback for "generic" and any unrecognised kind.
    return (
        f"Dear customer, we noticed irregular online banking activity on your {bank} account. "
        "For your protection, confirm your identity by replying with your full name, date of birth, "
        "and the 3-digit security code from the back of your card."
    )


def make_safe_email(kind: str) -> str:
    """Build one templated legitimate bank email of the requested kind.

    Recognised kinds are "statement" and "spend_summary"; any other value
    yields the savings-rate reminder. One bank name is drawn from the
    module-level `random` state per call.
    """
    templates = {
        "statement": (
            "Your {bank} monthly account statement is now available. "
            "Please log in using the official mobile app or bookmarked website. "
            "For your security, we will never ask for your PIN or password by email."
        ),
        "spend_summary": (
            "Hi, here is your weekly spending summary from {bank}. "
            "You can review categories and set budgeting goals in the app. "
            "You do not need to reply or share any personal details."
        ),
    }
    fallback = (
        "This is a reminder that your savings account at {bank} will earn a higher interest rate next month. "
        "No action is required from you. If you have questions, call the number printed on the back of your card."
    )
    bank = random.choice(BANK_NAMES)
    return templates.get(kind, fallback).format(bank=bank)

def _synthetic_row(text, label_bin, label_str, category):
    """Return one record in the shared schema for a template-generated email."""
    return {
        "text": text,
        "label_bin": label_bin,
        "label_str": label_str,
        "source": "synthetic_ai",
        "synthetic": 1,
        "category": category,
        "is_financial": True,
    }


scam_kinds = ["card_lock", "investment", "tax_refund", "loan_approval", "crypto_wallet", "generic"]
safe_kinds = ["statement", "spend_summary", "generic"]

synthetic_rows = []

# 150 synthetic phishing emails: pick the kind first, then render it, so the
# random draws happen in the same order on every run.
for _ in range(150):
    scam_kind = random.choice(scam_kinds)
    scam_text = make_scam_email(scam_kind)
    synthetic_rows.append(_synthetic_row(scam_text, 1, "Phishing", scam_kind))

# 100 synthetic safe financial emails; the category is prefixed with "safe_".
for _ in range(100):
    safe_kind = random.choice(safe_kinds)
    safe_text = make_safe_email(safe_kind)
    synthetic_rows.append(_synthetic_row(safe_text, 0, "Safe", "safe_" + safe_kind))

# Build the frame once from the collected records.
synthetic_df = pd.DataFrame(synthetic_rows)
print("Synthetic dataset shape:", synthetic_df.shape)
synthetic_df.head()


Synthetic dataset shape: (250, 7)


Unnamed: 0,text,label_bin,label_str,source,synthetic,category,is_financial
0,"Dear customer, we noticed irregular online ban...",1,Phishing,synthetic_ai,1,generic,True
1,Internal Revenue Service: Our records show an ...,1,Phishing,synthetic_ai,1,tax_refund,True
2,"Dear customer, we noticed irregular online ban...",1,Phishing,synthetic_ai,1,generic,True
3,Security Alert: Unauthorised withdrawal attemp...,1,Phishing,synthetic_ai,1,crypto_wallet,True
4,URGENT: Your business card at UnionFirst Credi...,1,Phishing,synthetic_ai,1,card_lock,True


In [6]:
### Cell 6 – Combine everything, keep FULL dataset, plus a 500-row sample

# Columns every source must provide, in the order they are written out.
cols_common = ["text", "label_bin", "label_str", "source", "synthetic", "category", "is_financial"]


def _with_default_columns(frame, default_category):
    """Copy `frame`, adding `category` / `is_financial` only when they are absent."""
    filled = frame.copy()
    for column, default in (("category", default_category), ("is_financial", True)):
        if column not in filled.columns:
            filled[column] = default
    return filled


fin_hf_small = _with_default_columns(fin_hf_df, "email")
fin_se_small = _with_default_columns(fin_se_df, "chat")

# Stack the three sources on the shared columns with a fresh 0..n-1 index.
combined_df = pd.concat(
    [source_frame[cols_common] for source_frame in (fin_hf_small, fin_se_small, synthetic_df)],
    ignore_index=True,
)

print("FULL combined dataset:", combined_df.shape)
print("Label distribution (0=safe, 1=phishing):")
print(combined_df["label_bin"].value_counts())

# Save FULL dataset
combined_df.to_csv("fintech_scam_full.csv", index=False)
print("\nSaved fintech_scam_full.csv")

# Optional 500-row sample for quick experiments; when fewer than 500 rows
# exist, the whole combined frame is used instead.
fintech_500 = (
    combined_df.sample(n=500, random_state=42) if len(combined_df) >= 500 else combined_df
).reset_index(drop=True)

print("\nSampled dataset shape:", fintech_500.shape)
print("Sample label distribution:")
print(fintech_500["label_bin"].value_counts())

fintech_500.to_csv("fintech_scam_500.csv", index=False)
print("Saved fintech_scam_500.csv")

fintech_500.head()


FULL combined dataset: (8987, 7)
Label distribution (0=safe, 1=phishing):
label_bin
0    5312
1    3675
Name: count, dtype: int64

Saved fintech_scam_full.csv

Sampled dataset shape: (500, 7)
Sample label distribution:
label_bin
0    313
1    187
Name: count, dtype: int64
Saved fintech_scam_500.csv


Unnamed: 0,text,label_bin,label_str,source,synthetic,category,is_financial
0,summary of ' bubbler ' a few months ago i post...,0,Safe,zefang_phishing_email_dataset,0,email,True
1,lectureship in linguistics s c h o o l o f e n...,0,Safe,zefang_phishing_email_dataset,0,email,True
2,coling - acl ' 98 workshops cfps below are two...,0,Safe,zefang_phishing_email_dataset,0,email,True
3,"paliourg , multiple lenders for best home loan...",1,Phishing,zefang_phishing_email_dataset,0,email,True
4,hot penny pick fueled by high demand energy te...,1,Phishing,zefang_phishing_email_dataset,0,email,True
