In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [11]:
# Load CEAS_08 dataset CSV
df = pd.read_csv('C:/Users/D.Sathiya Pandi/Downloads/CEAS_08.csv')
print(df.head())

                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to be a loser   
1  

In [12]:
# Drop duplicates & missing values in key columns (body and label)
df = df.drop_duplicates()
df = df.dropna(subset=['body', 'label'])

In [13]:
# Basic text cleaning function (can be enhanced for CEAS_08 specifics)
def clean_text(text):
    # Lowercase, strip, remove extra spaces, could add URL extraction if needed
    return str(text).lower().strip()

df['body_clean'] = df['body'].apply(clean_text)

# TF-IDF vectorizer with parameters suitable for CEAS_08 text distributions
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,        # limit features for efficiency
    ngram_range=(1, 2),        # consider unigrams and bigrams for better context
    max_df=0.9,                # ignore terms in more than 90% documents (common words)
    min_df=5                   # ignore very rare terms (appearing < 5 docs)
)

X = vectorizer.fit_transform(df['body_clean'])

# Map dataset labels: CEAS_08 may have '0'/'1'/'2' or text; adjust accordingly
label_mapping = {
    '0': 0, 'legitimate': 0, 'ham': 0,
    '1': 1, 'spam': 1,
    '2': 2, 'phishing': 2
}

df['label'] = df['label'].astype(str).str.strip().str.lower()
y = df['label'].map(label_mapping)

# Filter out unlabeled or unknown mask
mask = ~y.isna()
X = X[mask]
y = y[mask].astype(int)


In [15]:
# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the Naive Bayes model (MultinomialNB suited for discrete features)
model = MultinomialNB(alpha=1.0)  # smoothing parameter alpha can be tuned

model.fit(X_train, y_train)

# Save model and vectorizer for deployment
joblib.dump(model, 'model_ceas08.pkl')
joblib.dump(vectorizer, 'vectorizer_ceas08.pkl')

print("✅ CEAS_08 Model and Vectorizer saved!")


✅ CEAS_08 Model and Vectorizer saved!


In [16]:
# Evaluate model performance on test set
y_pred = model.predict(X_test)

print("=== CEAS_08 Model Evaluation ===")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall   : {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score : {f1_score(y_test, y_pred, average='weighted'):.4f}")

=== CEAS_08 Model Evaluation ===
Accuracy : 0.9780
Precision: 0.9789
Recall   : 0.9780
F1 Score : 0.9781


In [None]:
%%writefile .py
import streamlit as st
import joblib

# Load CEAS_08-trained model & vectorizer
model = joblib.load('model_ceas08.pkl')
vectorizer = joblib.load('vectorizer_ceas08.pkl')

label_map = {0: "Legitimate", 1: "Spam", 2: "Phishing"}

st.title("📧 CEAS_08 Email Spam & Phishing Detector")

email_text = st.text_area("Paste your email content here:")

if st.button("Predict"):
    if not email_text.strip():
        st.warning("Please enter email content to predict.")
    else:
        # Preprocess & vectorize input
        X_input = vectorizer.transform([email_text.lower().strip()])

        # Predict with the loaded model
        pred = model.predict(X_input)[0]

        # Display result
        st.success(f"Prediction: {label_map[pred]}")
