In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load data
df = pd.read_csv('C:/Users/D.Sathiya Pandi/Downloads/CEAS_08.csv')
print(df.head())

                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to be a loser   
1  

In [7]:
# Basic cleaning
df = df.drop_duplicates()
df = df.dropna(subset=['body', 'label'])  # Keep only rows with both text and label

# Simple text preprocessing function
def clean_text(text):
    return str(text).lower()

# Apply cleaning to email body
df['body_clean'] = df['body'].apply(clean_text)

# Vectorization for Naive Bayes (Bag-of-Words)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['body_clean'])

# --- Label Encoding ---
# 1. Normalize to lowercase strings first
df['label'] = df['label'].astype(str).str.strip().str.lower()

# 2. Mapping that handles text and numeric strings
label_mapping = {
    '0': 0, 'legitimate': 0, 'ham': 0,
    '1': 1, 'spam': 1,
    '2': 2, 'phishing': 2
}

y = df['label'].map(label_mapping)

# 3. Drop any unmapped labels
mask = ~y.isna()
X = X[mask]
y = y[mask].astype(int)  # Ensure integer type

print("Unique encoded labels:", y.unique())
print("Any NaN in y?", y.isna().any())

Unique encoded labels: [1 0]
Any NaN in y? False


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import joblib

# Assuming X, y, and vectorizer are already created from preprocessing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# Save both model and vectorizer
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("✅ Model & Vectorizer saved as model.pkl and vectorizer.pkl")


✅ Model & Vectorizer saved as model.pkl and vectorizer.pkl


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test set
y_pred = model.predict(X_test)

# Print evaluation metrics
print("📊 Model Evaluation Results:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall   :", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score :", f1_score(y_test, y_pred, average='weighted'))

📊 Model Evaluation Results:
Accuracy : 0.960413740263057
Precision: 0.9635261052157661
Recall   : 0.960413740263057
F1 Score : 0.9605216088741081


In [18]:
%%writefile detection.py
import streamlit as st
import joblib

# Load saved model & vectorizer
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

label_map = {0: 'Legitimate', 1: 'Spam', 2: 'Phishing'}

st.title("📧 Email Spam & Phishing Detector")

email_text = st.text_area("Paste your email content here:")
if st.button("Predict"):
    X_input = vectorizer.transform([email_text.lower()])
    pred = model.predict(X_input)[0]
    st.success(f"Prediction: {label_map[pred]}")


Writing detection.py
