In [1]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Function to load the dataset
def load_dataset(folder_path):
    ham_folder = os.path.join(folder_path, "ham")
    spam_folder = os.path.join(folder_path, "spam")
    ham_files = [os.path.join(ham_folder, filename) for filename in os.listdir(ham_folder)]
    spam_files = [os.path.join(spam_folder, filename) for filename in os.listdir(spam_folder)]
    emails = []
    labels = []
    for file_path in ham_files:
        with open(file_path, 'r', encoding='latin1') as file:
            emails.append(file.read())
            labels.append(0)  # 0 for ham
    for file_path in spam_files:
        with open(file_path, 'r', encoding='latin1') as file:
            emails.append(file.read())
            labels.append(1)  # 1 for spam
    return emails, labels


In [5]:
import os
import zipfile

def extract_enron_zip(zip_file_path, extract_to):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

def load_dataset(folder_path):
    ham_folder = os.path.join(folder_path, "ham")
    spam_folder = os.path.join(folder_path, "spam")
    ham_files = [os.path.join(ham_folder, filename) for filename in os.listdir(ham_folder)]
    spam_files = [os.path.join(spam_folder, filename) for filename in os.listdir(spam_folder)]
    emails = []
    labels = []
    for filename in ham_files:
        with open(filename, "r", encoding="latin-1") as file:
            emails.append(file.read())
            labels.append(0)  # 0 represents "ham"
    for filename in spam_files:
        with open(filename, "r", encoding="latin-1") as file:
            emails.append(file.read())
            labels.append(1)  # 1 represents "spam"
    return emails, labels

# Step 1: Extract the Enron dataset from the zip file
zip_file_path = r"C:\Users\Shalma\Downloads\enron1.zip"
extract_to = r"C:\Users\Shalma\project documents"
extract_enron_zip(zip_file_path, extract_to)

# Step 2: Load the Dataset
folder_path = os.path.join(extract_to, "enron1")
emails, labels = load_dataset(folder_path)


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 3: Preprocess the Dataset
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(emails)
y = labels

# Step 4: Split the Dataset into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Select the Appropriate Classifier
classifier = MultinomialNB()

# Step 6: Train the Model
classifier.fit(X_train, y_train)

# Step 7: Evaluate the Model
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9826086956521739
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       749
           1       0.97      0.97      0.97       286

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



In [13]:
# Step 1: Extract the Enron dataset from the ZIP file
zipfile_path = r"C:\Users\Shalma\Downloads\enron1.zip"
extract_to = r"C:\Users\Shalma\project documents"
with zipfile.ZipFile(zipfile_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

In [11]:
# Step 3: Train your model using scikit-learn
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(emails)
y = labels
classifier = LogisticRegression()
classifier.fit(X, y)

In [12]:
# Step 4: Evaluate the model
y_pred = classifier.predict(X)
accuracy = accuracy_score(y, y_pred)
classification_rep = classification_report(y, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.9947826086956522
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      3672
           1       0.98      1.00      0.99      1503

    accuracy                           0.99      5175
   macro avg       0.99      1.00      0.99      5175
weighted avg       0.99      0.99      0.99      5175



In [15]:
# Step 5: Extract keywords
feature_names = vectorizer.get_feature_names_out()
coefficients = classifier.coef_[0]
word_coefficient_map = dict(zip(feature_names, coefficients))


In [20]:
# Sort the word_coefficient_map by coefficient value in descending order
sorted_words_ham = sorted(word_coefficient_map.items(), key=lambda x: x[1], reverse=True)

# Print the top words for Ham
top_n = 20  # Define the number of top words to print
print("\nTop", top_n, "words for Ham:")
for word, coef in sorted_words_ham[:top_n]:
    print(word, ":", coef)

# Sort the word_coefficient_map by coefficient value in ascending order for Spam
sorted_words_spam = sorted(word_coefficient_map.items(), key=lambda x: x[1])

# Print the top words for Spam
print("\nTop", top_n, "words for Spam:")
for word, coef in sorted_words_spam[:top_n]:
    print(word, ":", coef)



Top 20 words for Ham:
http : 2.8266655270364582
no : 2.567478170859084
here : 2.2969382174771686
money : 2.0255666164712136
more : 1.9600673808352733
your : 1.9215537272647256
2004 : 1.881919169330377
of : 1.7797683568363354
only : 1.6207817108935287
best : 1.4846800922002688
statements : 1.4543507751061133
paliourg : 1.432340836945703
prices : 1.4310019824998903
our : 1.3842984736487485
removed : 1.3564166089538328
meds : 1.3492785408094747
computron : 1.3491413890963218
online : 1.3476543834018864
remove : 1.2973984460906642
click : 1.2915937945680953

Top 20 words for Spam:
enron : -4.778174685305152
xls : -3.654623462715288
meter : -3.5281700716851154
2000 : -3.4505621892562304
deal : -3.4359823341758893
the : -3.372180479620564
attached : -3.2939555134113125
hpl : -3.1136032200581
thanks : -3.035153676943212
2001 : -2.962600124488794
gas : -2.960563403620033
please : -2.8982346283814864
will : -2.8900786616019443
daren : -2.837583052911683
to : -2.8099462345341153
ect : -2.762489