In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # Use TF-IDF instead of CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [4]:
# Load data
df = pd.read_csv('C:/Users/D.Sathiya Pandi/Downloads/CEAS_08.csv')
print(df.head())

                                              sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   

                              date  \
0  Tue, 05 Aug 2008 16:31:02 -0700   
1  Tue, 05 Aug 2008 18:31:03 -0500   
2  Tue, 05 Aug 2008 20:28:00 -1200   
3  Tue, 05 Aug 2008 17:31:20 -0600   
4  Tue, 05 Aug 2008 19:31:21 -0400   

                                             subject  \
0                          Never agree to be a loser   
1  

In [5]:
df = df.drop_duplicates()
df = df.dropna(subset=['body', 'label'])

In [8]:
# Cleaning text function
def clean_text(text):
    return str(text).lower()

df['body_clean'] = df['body'].apply(clean_text)

# Replace CountVectorizer with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(df['body_clean'])

# Label encoding
df['label'] = df['label'].astype(str).str.strip().str.lower()
label_mapping = {
    '0': 0, 'legitimate': 0, 'ham': 0,
    '1': 1, 'spam': 1,
    '2': 2, 'phishing': 2
}
y = df['label'].map(label_mapping)

mask = ~y.isna()
X = X[mask]
y = y[mask].astype(int)



In [9]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Naive Bayes with TF-IDF features
model = MultinomialNB()
model.fit(X_train, y_train)

# Save model and vectorizer for deployment
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("✅ Model and TF-IDF Vectorizer saved successfully!")

✅ Model and TF-IDF Vectorizer saved successfully!


In [10]:
# Evaluate
y_pred = model.predict(X_test)
print("Model evaluation results:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall   :", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score :", f1_score(y_test, y_pred, average='weighted'))

Model evaluation results:
Accuracy : 0.9916996552164474
Precision: 0.9917820998465352
Recall   : 0.9916996552164474
F1 Score : 0.9917048243498922
