In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight



In [None]:
from google.colab import files

# Open a file upload dialog; the returned mapping is keyed by file name.
uploaded = files.upload()

# If nothing came back (presumably when the dialog is dismissed -- TODO confirm),
# stop with a clear message instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Load the first uploaded CSV into a pandas DataFrame.
# Only the first key is used; any additional uploaded files are ignored.
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)

Saving mail_data.csv to mail_data (2).csv


In [None]:
def _adopt_column(frame, target, aliases):
    """Return `frame` with one alias column renamed to `target`.

    If `target` is already a column, `frame` is returned unchanged. Otherwise
    the first column whose lower-cased name is in `aliases` is renamed.
    Only the first match is renamed so that two alias columns (for example
    both 'Message' and 'Text') can never end up sharing the same label.
    If nothing matches, `frame` is returned unchanged.
    """
    if target in frame.columns:
        return frame
    for col in frame.columns:
        # str() guards against non-string column labels, which have no .lower()
        if str(col).lower() in aliases:
            return frame.rename(columns={col: target})
    return frame


# Map common header variants onto the canonical names used below.
df = _adopt_column(df, 'subject', ['subject'])
df = _adopt_column(df, 'body', ['body', 'message', 'text'])
df = _adopt_column(df, 'label', ['label', 'class', 'category', 'target'])

# The label column is required by the cells that follow; fail early with a
# message that says what was looked for.
if 'label' not in df.columns:
    raise KeyError(
        "No label column found; expected one of: label, class, category, target"
    )

# Fill NaNs. A missing text column becomes an empty-string column; assigning
# the scalar "" broadcasts to every row regardless of the frame's index.
for text_col in ('subject', 'body'):
    if text_col in df.columns:
        df[text_col] = df[text_col].fillna("").astype(str)
    else:
        df[text_col] = ""

# Model input: subject and body joined by a space, with outer whitespace trimmed.
df['text'] = (df['subject'].str.strip() + " " + df['body'].str.strip()).str.strip()

# Inspect labels
print("Label values sample:", df['label'].unique()[:20])

Label values sample: ['ham' 'spam']


In [None]:
# Encode the string labels as integer class ids; `le` is kept so the ids can
# be mapped back to the original label names later.
le = LabelEncoder()
label_strings = df['label'].astype(str)
le.fit(label_strings)
y = le.transform(label_strings)
print("Classes (label encoder):", le.classes_)

Classes (label encoder): ['ham' 'spam']


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(
    df['text'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# TF-IDF settings: unigrams and bigrams, lower-cased input, vocabulary capped
# at 40000 features.
tfidf_settings = dict(
    max_features=40000,
    ngram_range=(1, 2),
    lowercase=True,
    stop_words='english',  # optional, remove or change as needed
)
tfidf = TfidfVectorizer(**tfidf_settings)

In [None]:
# Candidate models, each a TF-IDF vectorizer followed by a classifier.
# Every pipeline gets its own unfitted copy of `tfidf` via clone(): a Pipeline
# fits the step objects it is given, so passing the same `tfidf` instance to
# all three would make them (and the model saved later) share one vectorizer
# whose fitted state is overwritten by whichever pipeline is fitted last.
pipelines = {
    "nb": Pipeline([("tfidf", clone(tfidf)), ("clf", MultinomialNB())]),
    "lr": Pipeline([
        ("tfidf", clone(tfidf)),
        ("clf", LogisticRegression(max_iter=2000, n_jobs=-1)),
    ]),
    "rf": Pipeline([
        ("tfidf", clone(tfidf)),
        ("clf", RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)),
    ]),
}

In [None]:
# Fit every candidate pipeline on the training split and score it on the test
# split. `results` maps the pipeline name to the fitted pipeline and its accuracy.
results = {}
class_names = list(le.classes_)
for name in pipelines:
    pipe = pipelines[name]
    print(f"\nTraining: {name}")
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy ({name}): {acc:.4f}")

    # Per-class precision / recall / F1, labelled with the original class names
    report = classification_report(y_test, y_pred, target_names=class_names)
    print(report)

    results[name] = dict(pipe=pipe, acc=acc)



Training: nb
Accuracy (nb): 0.9614
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.71      0.83       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.90      1115
weighted avg       0.96      0.96      0.96      1115


Training: lr
Accuracy (lr): 0.9596
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.70      0.82       149

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115


Training: rf
Accuracy (rf): 0.9713
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0

In [None]:
# Pick the entry of `results` with the highest stored "acc" value; on a tie the
# first such entry in insertion order wins.
best_name, best_entry = max(results.items(), key=lambda item: item[1]["acc"])
best_pipe = best_entry["pipe"]
print(f"\nBest pipeline: {best_name} with accuracy {best_entry['acc']:.4f}")


Best pipeline: rf with accuracy 0.9713


In [None]:
# Persist the selected pipeline together with the label encoder in one file,
# so predictions can be mapped back to label names after loading.
model_bundle = {"pipeline": best_pipe, "label_encoder": le}
joblib.dump(model_bundle, "email_phishing_model.joblib")
print("Saved model as: email_phishing_model.joblib")

Saved model as: email_phishing_model.joblib


In [None]:
from google.colab import files

# Hand the saved model file to the browser as a download.
download_target = "email_phishing_model.joblib"
files.download(download_target)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>