In [11]:
# Cell 1 – Imports

import sys
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import joblib


In [12]:
# Cell 2 – Load FinTech scam dataset

df = pd.read_csv("fintech_scam_full.csv")

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nLabel distribution (0=safe, 1=phishing):")
print(df["label_bin"].value_counts())

df.head()


Shape: (8987, 7)

Columns: ['text', 'label_bin', 'label_str', 'source', 'synthetic', 'category', 'is_financial']

Label distribution (0=safe, 1=phishing):
label_bin
0    5312
1    3675
Name: count, dtype: int64


Unnamed: 0,text,label_bin,label_str,source,synthetic,category,is_financial
0,\nHello I am your hot lil horny toy.\n I am...,1,Phishing,zefang_phishing_email_dataset,0,email,True
1,software at incredibly low prices ( 86 % lower...,1,Phishing,zefang_phishing_email_dataset,0,email,True
2,global risk management operations sally congra...,0,Safe,zefang_phishing_email_dataset,0,email,True
3,"entourage , stockmogul newsletter ralph velez ...",1,Phishing,zefang_phishing_email_dataset,0,email,True
4,"we owe you lots of money dear applicant , afte...",1,Phishing,zefang_phishing_email_dataset,0,email,True


In [13]:
# Cell 3 – Define behavioural lexicons & helpers

import re

URGENCY_WORDS = [
    "immediately", "urgent", "urgently", "now", "today", "right away",
    "act fast", "limited time", "last chance", "expire", "within 24 hours",
    "before the window closes"
]

AUTHORITY_WORDS = [
    "bank", "account department", "security team", "support team",
    "fraud department", "irs", "tax office", "government", "regulator",
    "card issuer", "administrator", "compliance", "verification team"
]

REWARD_WORDS = [
    "prize", "winner", "winnings", "jackpot", "bonus", "reward",
    "cash out", "payout", "profit", "high return", "guaranteed return",
    "investment opportunity", "zero risk returns"
]

FEAR_WORDS = [
    "suspend", "suspended", "blocked", "freeze", "closed",
    "unauthorized", "unusual activity", "breach", "compromised",
    "penalty", "fine", "legal action", "lawsuit", "debt", "overdue",
    "locked", "permanent suspension"
]

def score_keyword_group(text, keywords):
    """Return normalized score in [0,1] based on how many keywords appear."""
    t = str(text).lower()
    hits = sum(1 for w in keywords if w in t)
    return min(hits, 3) / 3.0  # cap at 3, then normalize

def extract_behavior_features(text):
    return {
        "urgency_score":   score_keyword_group(text, URGENCY_WORDS),
        "authority_score": score_keyword_group(text, AUTHORITY_WORDS),
        "reward_score":    score_keyword_group(text, REWARD_WORDS),
        "fear_score":      score_keyword_group(text, FEAR_WORDS),
    }

beh_df = df["text"].apply(extract_behavior_features).apply(pd.Series)
df_feat = pd.concat([df.reset_index(drop=True), beh_df.reset_index(drop=True)], axis=1)

print("After adding behavioural scores:", df_feat.shape)
df_feat.head()


After adding behavioural scores: (8987, 11)


Unnamed: 0,text,label_bin,label_str,source,synthetic,category,is_financial,urgency_score,authority_score,reward_score,fear_score
0,\nHello I am your hot lil horny toy.\n I am...,1,Phishing,zefang_phishing_email_dataset,0,email,True,0.0,0.0,0.0,0.0
1,software at incredibly low prices ( 86 % lower...,1,Phishing,zefang_phishing_email_dataset,0,email,True,0.0,0.0,0.0,0.0
2,global risk management operations sally congra...,0,Safe,zefang_phishing_email_dataset,0,email,True,0.333333,0.333333,0.0,0.0
3,"entourage , stockmogul newsletter ralph velez ...",1,Phishing,zefang_phishing_email_dataset,0,email,True,0.666667,1.0,0.333333,0.333333
4,"we owe you lots of money dear applicant , afte...",1,Phishing,zefang_phishing_email_dataset,0,email,True,0.0,0.0,0.0,0.0


In [14]:
# Cell 4 – Train/test split

X = df_feat["text"]
y = df_feat["label_bin"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])


Train size: 7189
Test size: 1798


In [15]:
# Cell 5 – Vectorizer + classifier training

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=20000,
    ngram_range=(1, 2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec  = vectorizer.transform(X_test)

clf = LogisticRegression(
    max_iter=2000,
    n_jobs=-1
)

clf.fit(X_train_vec, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [16]:
# Cell 6 – Evaluation report

y_pred = clf.predict(X_test_vec)

print(
    classification_report(
        y_test,
        y_pred,
        target_names=["Safe", "Phishing"]
    )
)


              precision    recall  f1-score   support

        Safe       0.98      0.99      0.98      1063
    Phishing       0.98      0.97      0.97       735

    accuracy                           0.98      1798
   macro avg       0.98      0.98      0.98      1798
weighted avg       0.98      0.98      0.98      1798



In [17]:
# Cell 7 – Build SERI helper table

# Probabilities for each class [prob_safe, prob_phishing]
y_proba_test = clf.predict_proba(X_test_vec)
p_scam = y_proba_test[:, 1]
max_prob = y_proba_test.max(axis=1)

seri_helper = pd.DataFrame({
    "text": X_test.values,
    "true_label": y_test.values,
    "p_scam": p_scam,
    "max_class_prob": max_prob
})

# merge behaviour scores for these same rows
seri_helper = seri_helper.merge(
    df_feat[["text", "urgency_score", "authority_score", "reward_score", "fear_score"]],
    on="text",
    how="left"
)

seri_helper.head()


Unnamed: 0,text,true_label,p_scam,max_class_prob,urgency_score,authority_score,reward_score,fear_score
0,re : thank you vince j kaminski @ ect 03 / 27 ...,0,0.01437,0.98563,0.0,0.333333,0.333333,0.0
1,"a practical matter, I doubt the value sold in ...",0,0.057166,0.942834,0.333333,0.333333,0.0,0.333333
2,fw : fsp milestone complete jeez this gets old...,0,0.10631,0.89369,0.333333,0.0,0.0,0.333333
3,"very urgent dear friend , firstly , not to cau...",1,0.917318,0.917318,1.0,0.333333,0.0,0.333333
4,enron research and ebs engineering and operati...,0,0.029767,0.970233,0.333333,0.0,0.0,0.0


In [18]:
# Cell 8 – SERI formula and risk bands

ALPHA = 0.35  # behavioural
BETA  = 0.00  # technical/network (0 for now)
GAMMA = 0.45  # classifier risk
DELTA = 0.20  # uncertainty

def compute_seri(row):
    R_b = (
        0.25 * row["urgency_score"] +
        0.25 * row["authority_score"] +
        0.25 * row["reward_score"] +
        0.25 * row["fear_score"]
    )
    R_t = 0.0              # no technical features yet
    R_c = row["p_scam"]
    U   = 1.0 - row["max_class_prob"]

    seri_raw = ALPHA * R_b + BETA * R_t + GAMMA * R_c + DELTA * U
    seri = 100 * min(1.0, max(0.0, seri_raw))
    return seri

seri_helper["SERI"] = seri_helper.apply(compute_seri, axis=1)

def risk_band(seri):
    if seri < 25:
        return "Low"
    elif seri < 50:
        return "Medium"
    elif seri < 75:
        return "High"
    else:
        return "Critical"

seri_helper["risk_band"] = seri_helper["SERI"].apply(risk_band)

print(seri_helper[["SERI", "risk_band"]].head())
print("\nRisk band counts:")
print(seri_helper["risk_band"].value_counts())


        SERI risk_band
0   6.767395       Low
1  12.465821       Low
2  12.743467       Low
3  57.516287      High
4   4.851501       Low

Risk band counts:
risk_band
Low       1128
Medium     764
High       229
Name: count, dtype: int64


In [19]:
# Cell 9 – Save model, vectorizer, lexicons & weights

# 1) Model + vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(clf, "phishing_classifier.joblib")

# 2) Behavioural lexicons + weights for SERI
lexicons = {
    "URGENCY_WORDS": URGENCY_WORDS,
    "AUTHORITY_WORDS": AUTHORITY_WORDS,
    "REWARD_WORDS": REWARD_WORDS,
    "FEAR_WORDS": FEAR_WORDS,
}

weights = {
    "ALPHA": ALPHA,
    "BETA": BETA,
    "GAMMA": GAMMA,
    "DELTA": DELTA,
}

with open("seri_lexicons.json", "w") as f:
    json.dump(lexicons, f, indent=2)

with open("seri_weights.json", "w") as f:
    json.dump(weights, f, indent=2)

print("Saved tfidf_vectorizer.joblib, phishing_classifier.joblib, seri_lexicons.json, seri_weights.json")


Saved tfidf_vectorizer.joblib, phishing_classifier.joblib, seri_lexicons.json, seri_weights.json


In [20]:
# Cell 10 – Reload everything to verify

v = joblib.load("tfidf_vectorizer.joblib")
m = joblib.load("phishing_classifier.joblib")

with open("seri_lexicons.json") as f:
    lx = json.load(f)

print("Vectorizer type:", type(v))
print("Model type:", type(m))
print("Lexicon keys:", list(lx.keys()))


Vectorizer type: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Model type: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Lexicon keys: ['URGENCY_WORDS', 'AUTHORITY_WORDS', 'REWARD_WORDS', 'FEAR_WORDS']


In [21]:
import joblib

v = joblib.load("tfidf_vectorizer.joblib")
print("Has idf_ attribute?", hasattr(v, "idf_"))


Has idf_ attribute? True
