In [None]:
import pandas as pd
import streamlit as st
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os
import re



In [65]:
# Load the Enron spam/ham corpus (path is relative to the notebook's working directory).
df = pd.read_csv("data/enron_spam_data.csv")
print("------------------------------first 5 rows------------------------------")
print(df.head())
print(df.tail())
print("------------------------------info about data------------------------------")
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

print("------------------------------check null values------------------------------")
print(df.isnull().sum())

# Replace missing message bodies with an empty string so every message is a str
# before it reaches the text vectorizer.
df['Message'] = df['Message'].fillna('')

print("------------------------------check duplicated rows------------------------------")
# Report how many rows are duplicates; printing the raw boolean Series only
# shows a truncated True/False column and no usable number.
print(df.duplicated().sum())
# NOTE(review): 'Message ID' looks like a per-row identifier. If it is unique,
# whole-row de-duplication removes nothing -- consider
# drop_duplicates(subset=['Subject', 'Message']) after confirming with the data.
df = df.drop_duplicates()

# Encode the label as ham -> 1, spam -> 0 (the rest of the notebook relies on
# exactly this encoding).
df['Spam/Ham_numeric'] = df['Spam/Ham'].map({'ham': 1, 'spam': 0})

# Series.map yields NaN for any value missing from the mapping; fail here with a
# clear message instead of passing NaN labels on to the split and the model.
_unmapped_labels = df['Spam/Ham_numeric'].isna()
if _unmapped_labels.any():
    raise ValueError(
        "Unexpected values in 'Spam/Ham': "
        f"{sorted(df.loc[_unmapped_labels, 'Spam/Ham'].astype(str).unique())}"
    )

X = df['Message']
y = df['Spam/Ham_numeric']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



------------------------------first 5 rows------------------------------
   Message ID                       Subject  \
0           0  christmas tree farm pictures   
1           1      vastar resources , inc .   
2           2  calpine daily gas nomination   
3           3                    re : issue   
4           4     meter 7268 nov allocation   

                                             Message Spam/Ham        Date  
0                                                NaN      ham  1999-12-10  
1  gary , production from the high island larger ...      ham  1999-12-13  
2             - calpine daily gas nomination 1 . doc      ham  1999-12-14  
3  fyi - see note below - already done .\nstella\...      ham  1999-12-14  
4  fyi .\n- - - - - - - - - - - - - - - - - - - -...      ham  1999-12-14  
       Message ID                                            Subject  \
33711       33711  = ? iso - 8859 - 1 ? q ? good _ news _ c = eda...   
33712       33712  all prescript medicines a

In [56]:
# Feature extraction: turn raw message text into TF-IDF vectors.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto that fitted vocabulary.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

print("Training features shape:", X_train_features.shape)
print("Testing features shape:", X_test_features.shape)

Training features shape: (26972, 139803)
Testing features shape: (6744, 139803)


In [57]:
# Train the model: multinomial Naive Bayes on the TF-IDF features.
model = MultinomialNB()
model.fit(X_train_features, y_train)

# Predict on the held-out split and evaluate.
y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy = ", accuracy)

# Print the classification report.
# The labels are encoded spam=0 / ham=1 in this notebook, which is the reverse
# of the usual convention, so the report rows are named explicitly instead of
# being shown as bare 0/1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['spam', 'ham']))

Accuracy =  0.9862099644128114

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3468
           1       0.99      0.98      0.99      3276

    accuracy                           0.99      6744
   macro avg       0.99      0.99      0.99      6744
weighted avg       0.99      0.99      0.99      6744



In [None]:
def check_email(email_text):
    """Classify one email text with the trained model and print the verdict.

    Relies on the notebook-level ``feature_extraction`` (fitted vectorizer) and
    ``model`` (fitted classifier).

    Parameters
    ----------
    email_text : str
        Raw text of the email to classify.

    Returns
    -------
    None
        The verdict and its confidence are printed, not returned.
    """
    # Convert the text into the same feature representation used for training.
    email_features = feature_extraction.transform([email_text])

    # No stored feature means nothing in the text matched the fitted vocabulary
    # (empty input, only stop words, or only unseen tokens). A verdict for such
    # input would not be based on the message content, so report that instead.
    if email_features.nnz == 0:
        print("Cannot classify: the text contains no words known to the model.")
        return

    # A single predict_proba call gives both the verdict and its confidence.
    # The winning column is mapped back to its label through model.classes_
    # rather than assuming that column i corresponds to label i.
    probability = model.predict_proba(email_features)[0]
    best_column = probability.argmax()
    prediction = model.classes_[best_column]
    confidence = probability[best_column]

    # Print the result (label 1 is ham, anything else is spam).
    if prediction == 1:
        print("This email address: regular mail (Ham)")
    else:
        print("This email: Spam (Spam)")
    print(f"Confidence percentage: {confidence*100:.2f}%")

# Sample messages for a quick sanity check of check_email.
normal_email = "Hi, Can we schedule a meeting tomorrow at 2 PM to discuss the project progress? Thanks"
spam_email = "CONGRATULATIONS! You've won $1,000,000! Click here to claim your prize now! Limited time offer!!!"

print("=== An example of regular mail  ===")
check_email(normal_email)

print("\n=== Spam example ===")
check_email(spam_email)

# You can try your own email text here.
# input() blocks until something is typed, so this step needs an interactive session.
print("\n=== Test your own mail ===")
your_email = input("Enter the text of the email: ")
check_email(your_email)

=== An example of regular mail  ===
This email address: regular mail (Ham)
Confidence percentage: 99.88%

=== Spam example ===
This email: Spam (Spam)
Confidence percentage: 95.31%

=== Test your own mail ===


This email: Spam (Spam)
Confidence percentage: 97.76%


In [None]:
# Persist both fitted objects: the classifier alone cannot score raw text
# without the vectorizer that produced its input features.
print("Saving the model and feature extractor...")

# Trained classifier.
joblib.dump(value=model, filename='spam_classifier_model.pkl')
print("Model saved as 'spam_classifier_model.pkl'")

# Fitted TF-IDF vectorizer.
joblib.dump(value=feature_extraction, filename='feature_extractor.pkl')
print("Feature extractor saved as 'feature_extractor.pkl'")



Saving the model and feature extractor...
Model saved as 'spam_classifier_model.pkl'
Feature extractor saved as 'feature_extractor.pkl'

To load the model and make predictions in another script, use:

import joblib

# Load the model and feature extractor written by the training notebook.
# joblib.load unpickles the file, which can execute arbitrary code: only load
# files you created yourself or obtained from a trusted source.
model = joblib.load('spam_classifier_model.pkl')
feature_extraction = joblib.load('feature_extractor.pkl')


# Make predictions
def predict_spam(email_text):
    """Return "Ham" or "Spam" for a single email text.

    Uses the module-level ``feature_extraction`` and ``model`` loaded above.
    The model's label 1 is reported as "Ham"; any other label as "Spam".
    """
    email_features = feature_extraction.transform([email_text])
    prediction = model.predict(email_features)[0]
    return "Ham" if prediction == 1 else "Spam"

