In [38]:
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [39]:
# Fetch NLTK's English stopword corpus; preprocess_email reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samardeepbhamra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# Load the labelled emails. Later cells read the 'text' column (raw email
# body) and the 'label_num' column (numeric class label).
df = pd.read_csv('spam_ham_dataset.csv')

In [41]:
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily populated cache: the stopword corpus is read and the stemmer is built
# once per kernel session instead of once per email.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stopword_set, stemmer)``, creating both on first use."""
    if not _TEXT_RESOURCES:
        # Build both objects before storing either, so a failure (e.g. the
        # corpus has not been downloaded) never leaves a half-filled cache.
        stopword_set = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _TEXT_RESOURCES.update(stopwords=stopword_set, stemmer=stemmer)
    return _TEXT_RESOURCES['stopwords'], _TEXT_RESOURCES['stemmer']


def preprocess_email(text):
    """Normalise raw email text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete ``string.punctuation`` characters,
    split on whitespace, drop English stopwords, Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw email text. ``None`` and float NaN are treated as an empty email;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; ``''`` when
        nothing survives.
    """
    # Treat missing values as an empty email rather than crashing on `.lower()`.
    # `text != text` is the dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Lowercase the text
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    words = text.split()  # Tokenize into words
    if not words:
        # Nothing to filter or stem, so skip touching the NLTK resources.
        return ''

    stopwords_set, stemmer = _get_text_resources()
    # Remove stopwords and apply stemming
    return ' '.join(stemmer.stem(word) for word in words if word not in stopwords_set)


In [42]:
# Run preprocess_email over every raw email and keep the result in a new
# 'cleaned_text' column next to the original 'text'.
df['cleaned_text'] = df['text'].apply(preprocess_email)

In [43]:
# Bag-of-words count features over the cleaned emails.
# NOTE(review): the vocabulary is fitted on every row here, before the
# train/test split in the next cell, so held-out emails influence which terms
# are kept. Consider fitting the vectorizer on the training portion only.
vectorizer = CountVectorizer(max_features=1000)  # Limit features for simplicity
X = vectorizer.fit_transform(df['cleaned_text'])  # Vectorized emails
y = df['label_num']  # Labels: 0 for ham, 1 for spam


In [44]:
# Hold out 30% of the emails for evaluation; the fixed random_state keeps the
# split identical across runs.
# NOTE(review): the split is not stratified — if the classes are imbalanced,
# consider passing stratify=y so both parts keep the same spam/ham ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [46]:
# Seed the forest so that Restart & Run All rebuilds the same model and the
# same metrics; without a seed every run trains a different forest. The value
# matches the seed already used for the train/test split. Scores recorded from
# earlier unseeded runs may differ slightly from what this produces.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [47]:
# Mean accuracy of the fitted classifier on the held-out emails.
clf.score(X_test, y_test)

0.9742268041237113

In [48]:
# Predicted labels for the held-out emails; the evaluation cell below compares
# them with y_test.
y_pred = clf.predict(X_test)

In [49]:
# Per-class precision / recall / F1 on the held-out set, followed by the raw
# confusion matrix (rows: true label, columns: predicted label).
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1121
           1       0.96      0.95      0.95       431

    accuracy                           0.97      1552
   macro avg       0.97      0.97      0.97      1552
weighted avg       0.97      0.97      0.97      1552

Confusion Matrix:
 [[1103   18]
 [  22  409]]


In [57]:
# Step 9: Predict on an example from the dataset
def predict_existing_email(index):
    """Classify one email that is already stored in ``df``.

    Relies on the notebook-level ``df``, ``vectorizer``, ``clf`` and
    ``preprocess_email`` defined in earlier cells.

    Parameters
    ----------
    index : int
        Positional (``iloc``) row number of the email in ``df``.

    Returns
    -------
    tuple of (str, str, str)
        The original email text, its preprocessed form, and the predicted
        class name: ``"spam"`` when the model predicts label 1, else ``"ham"``.
    """
    # Extract the original email text from the dataset
    original_email = df['text'].iloc[index]

    # Apply the same preprocessing and vectorization used for training
    email_text_processed = preprocess_email(original_email)
    email_text_vectorized = vectorizer.transform([email_text_processed])

    # predict() returns an array with one label per input row. Take the single
    # label explicitly rather than comparing the whole array to 1 and relying
    # on the truthiness of a one-element array.
    predicted_label = clf.predict(email_text_vectorized)[0]

    result = "spam" if predicted_label == 1 else "ham"
    return original_email, email_text_processed, result

# Example: check the first email in the dataset (positional index 0)
index = 0  # You can change this index to check other emails
original_email, processed_email, result = predict_existing_email(index)
print(f"Original Email:\n{original_email}\n")
print(f"Processed Email:\n{processed_email}\n")
print(f"Prediction: {result}")



Original Email:
Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .

Processed Email:
subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos

Prediction: ham


In [56]:
# Display the frame, which now includes the derived 'cleaned_text' column.
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num,cleaned_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter 988291 follow not...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom januari 9 2001 see attach file...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonder ti...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop window offic cheap main tren...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian spring deal book teco pvr reven...
...,...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0,subject put 10 ft transport volum decreas 2500...
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,subject 3 4 2000 follow nom hpl take extra 15 ...
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0,subject calpin daili ga nomin juli mention ear...
5169,1409,ham,Subject: industrial worksheets for august 2000...,0,subject industri worksheet august 2000 activ a...
