In [2]:
import os
import email
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import precision, recall, f_measure, ConfusionMatrix
from sklearn.model_selection import KFold
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# Step 1: Data Processing and Tokenization

def read_emails(folder_path):
    emails = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), "r", encoding="latin1") as file:
            emails.append(file.read())
    return emails

ham_folder = "/Users/pavan/Downloads/FinalProjectData/EmailSpamCorpora/corpus/ham"
spam_folder = "/Users/pavan/Downloads/FinalProjectData/EmailSpamCorpora/corpus/spam"

ham_emails = read_emails(ham_folder)
spam_emails = read_emails(spam_folder)

In [3]:
# Step 2: Feature Extraction
def extract_features(emails):
    features = []
    for email_text in emails:
        tokens = word_tokenize(email_text)
        filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stopwords.words('english')]
        features.extend(filtered_tokens)
    return features

# Extract features from ham and spam emails
ham_features = extract_features(ham_emails)
spam_features = extract_features(spam_emails)

# Select top N most frequent words as unigram features
N = 1000
all_features = FreqDist(ham_features + spam_features)
word_features = [feature for feature, _ in all_features.most_common(N)]

def email_features(email_text):
    email_words = set(word_tokenize(email_text))
    features = {}
    for word in word_features:
        features[word] = (word in email_words)
    return features

# Extract bigram features using collocations
bigram_measures = BigramAssocMeasures()
ham_bigram_finder = BigramCollocationFinder.from_words(ham_features)
ham_bigram_scores = ham_bigram_finder.score_ngrams(bigram_measures.raw_freq)
spam_bigram_finder = BigramCollocationFinder.from_words(spam_features)
spam_bigram_scores = spam_bigram_finder.score_ngrams(bigram_measures.raw_freq)
bigram_features = [bigram for bigram, _ in (ham_bigram_scores + spam_bigram_scores)[:N]]

def email_features_with_bigrams(email_text):
    email_words = set(word_tokenize(email_text))
    features = {}
    for word in word_features:
        features[word] = (word in email_words)
    for bigram in bigram_features:
        features[bigram] = (bigram in email_words)
    return features

# Define new feature function using POS tag counts
def pos_features(email_text):
    tokens = word_tokenize(email_text)
    tagged_tokens = nltk.pos_tag(tokens)
    features = {}
    for _, tag in tagged_tokens:
        features[tag] = features.get(tag, 0) + 1
    return features

# Convert emails into feature sets
ham_feature_sets = [(email_features(email), 'ham') for email in ham_emails]
spam_feature_sets = [(email_features(email), 'spam') for email in spam_emails]
unigram_feature_sets = [(email_features(email), 'ham') for email in ham_emails] + [(email_features(email), 'spam') for email in spam_emails]
bigram_feature_sets = [(email_features_with_bigrams(email), 'ham') for email in ham_emails] + [(email_features_with_bigrams(email), 'spam') for email in spam_emails]
pos_feature_sets = [(pos_features(email), 'ham') for email in ham_emails] + [(pos_features(email), 'spam') for email in spam_emails]
all_feature_sets = [(email_features(email), 'ham') for email in ham_emails] + [(email_features(email), 'spam') for email in spam_emails]


In [3]:
from sklearn.metrics import classification_report
from nltk.metrics import ConfusionMatrix
from sklearn.model_selection import KFold

k = 5  # Number of folds for cross-validation
kf = KFold(n_splits=k)

# Function to train and evaluate the classifier
def train_and_evaluate(classifier, train_set, test_set):
    classifier = classifier.train(train_set)

    # Evaluate the classifier on the test set
    true_labels = [label for _, label in test_set]
    predicted_labels = [classifier.classify(features) for features, _ in test_set]

    # Calculate evaluation measures
    accuracy = nltk.classify.accuracy(classifier, test_set)
    confusion_matrix = ConfusionMatrix(true_labels, predicted_labels)
    report = classification_report(true_labels, predicted_labels, output_dict=True)

    return accuracy, report['accuracy'], report

# Perform cross-validation and evaluation for Naive Bayes classifier with unigram features
print("Results for Naive Bayes Classifier with unigram features:")
nb_classifier_unigram = NaiveBayesClassifier.train(unigram_feature_sets)

total_accuracy_unigram = 0
total_report_unigram = None

for train_index, test_index in kf.split(unigram_feature_sets):
    train_set = [unigram_feature_sets[i] for i in train_index]
    test_set = [unigram_feature_sets[i] for i in test_index]
    accuracy, _, report = train_and_evaluate(nb_classifier_unigram, train_set, test_set)
    total_accuracy_unigram += accuracy

    if total_report_unigram is None:
        total_report_unigram = report
    else:
        for label, metrics in report.items():
            if label != 'accuracy':
                total_report_unigram[label]['precision'] += metrics['precision']
                total_report_unigram[label]['recall'] += metrics['recall']
                total_report_unigram[label]['f1-score'] += metrics['f1-score']
    
num_iterations = kf.get_n_splits()
average_accuracy_unigram = total_accuracy_unigram / num_iterations

print("Average Accuracy (Unigram):", average_accuracy_unigram)

for label, metrics in total_report_unigram.items():
    if label != 'accuracy':
        metrics['precision'] /= num_iterations
        metrics['recall'] /= num_iterations
        metrics['f1-score'] /= num_iterations
        print(f"\nAverage metrics for '{label}':")
        print("Precision:", metrics['precision'])
        print("Recall:", metrics['recall'])
        print("F1-score:", metrics['f1-score'])

# Perform cross-validation and evaluation for Naive Bayes classifier with bigram features
print("\nResults for Naive Bayes Classifier with bigram features:")
nb_classifier_bigram = NaiveBayesClassifier.train(bigram_feature_sets)

total_accuracy_bigram = 0
total_report_bigram = None

for train_index, test_index in kf.split(bigram_feature_sets):
    train_set = [bigram_feature_sets[i] for i in train_index]
    test_set = [bigram_feature_sets[i] for i in test_index]
    accuracy, _, report = train_and_evaluate(nb_classifier_bigram, train_set, test_set)
    total_accuracy_bigram += accuracy

    if total_report_bigram is None:
        total_report_bigram = report
    else:
        for label, metrics in report.items():
            if label != 'accuracy':
                total_report_bigram[label]['precision'] += metrics['precision']
                total_report_bigram[label]['recall'] += metrics['recall']
                total_report_bigram[label]['f1-score'] += metrics['f1-score']
    
average_accuracy_bigram = total_accuracy_bigram / num_iterations

print("Average Accuracy (Bigram):", average_accuracy_bigram)

for label, metrics in total_report_bigram.items():
    if label != 'accuracy':
        metrics['precision'] /= num_iterations
        metrics['recall'] /= num_iterations
        metrics['f1-score'] /= num_iterations
        print(f"\nAverage metrics for '{label}':")
        print("Precision:", metrics['precision'])
        print("Recall:", metrics['recall'])
        print("F1-score:", metrics['f1-score'])

# Perform cross-validation and evaluation for Naive Bayes classifier with POS features
print("\nResults for Naive Bayes Classifier with POS features:")
nb_classifier_pos = NaiveBayesClassifier.train(pos_feature_sets)

total_accuracy_pos = 0
total_report_pos = None

for train_index, test_index in kf.split(pos_feature_sets):
    train_set = [pos_feature_sets[i] for i in train_index]
    test_set = [pos_feature_sets[i] for i in test_index]
    accuracy, _, report = train_and_evaluate(nb_classifier_pos, train_set, test_set)
    total_accuracy_pos += accuracy

    if total_report_pos is None:
        total_report_pos = report
    else:
        for label, metrics in report.items():
            if label != 'accuracy':
                total_report_pos[label]['precision'] += metrics['precision']
                total_report_pos[label]['recall'] += metrics['recall']
                total_report_pos[label]['f1-score'] += metrics['f1-score']
    
average_accuracy_pos = total_accuracy_pos / num_iterations

print("Average Accuracy (POS):", average_accuracy_pos)

for label, metrics in total_report_pos.items():
    if label != 'accuracy':
        metrics['precision'] /= num_iterations
        metrics['recall'] /= num_iterations
        metrics['f1-score'] /= num_iterations
        print(f"\nAverage metrics for '{label}':")
        print("Precision:", metrics['precision'])
        print("Recall:", metrics['recall'])
        print("F1-score:", metrics['f1-score'])

# Perform cross-validation and evaluation for Naive Bayes classifier with all features
print("\nResults for Naive Bayes Classifier with all features:")
nb_classifier_all = NaiveBayesClassifier.train(all_feature_sets)

total_accuracy_all = 0
total_report_all = None

for train_index, test_index in kf.split(all_feature_sets):
    train_set = [all_feature_sets[i] for i in train_index]
    test_set = [all_feature_sets[i] for i in test_index]
    accuracy, _, report = train_and_evaluate(nb_classifier_all, train_set, test_set)
    total_accuracy_all += accuracy

    if total_report_all is None:
        total_report_all = report
    else:
        for label, metrics in report.items():
            if label != 'accuracy':
                total_report_all[label]['precision'] += metrics['precision']
                total_report_all[label]['recall'] += metrics['recall']
                total_report_all[label]['f1-score'] += metrics['f1-score']
    
average_accuracy_all = total_accuracy_all / num_iterations

print("Average Accuracy (All Features):", average_accuracy_all)

for label, metrics in total_report_all.items():
    if label != 'accuracy':
        metrics['precision'] /= num_iterations
        metrics['recall'] /= num_iterations
        metrics['f1-score'] /= num_iterations
        print(f"\nAverage metrics for '{label}':")
        print("Precision:", metrics['precision'])
        print("Recall:", metrics['recall'])
        print("F1-score:", metrics['f1-score'])



Results for Naive Bayes Classifier with unigram features:


  _warn_prf(average, modifier, msg_start, len(result))


Average Accuracy (Unigram): 0.9346587054635158

Average metrics for 'ham':
Precision: 0.7981167608286253
Recall: 0.7308951181368479
F1-score: 0.7629977183081735

Average metrics for 'spam':
Precision: 0.38330019880715704
Recall: 0.3959198412764297
F1-score: 0.38932746030743126

Average metrics for 'macro avg':
Precision: 0.5907084798178912
Recall: 0.5634074797066388
F1-score: 0.5761625893078024

Average metrics for 'weighted avg':
Precision: 0.9914392773644046
Recall: 0.9346587054635158
F1-score: 0.9615087526943693

Results for Naive Bayes Classifier with bigram features:
Average Accuracy (Bigram): 0.9346587054635158

Average metrics for 'ham':
Precision: 0.7981167608286253
Recall: 0.7308951181368479
F1-score: 0.7629977183081735

Average metrics for 'spam':
Precision: 0.38330019880715704
Recall: 0.3959198412764297
F1-score: 0.38932746030743126

Average metrics for 'macro avg':
Precision: 0.5907084798178912
Recall: 0.5634074797066388
F1-score: 0.5761625893078024

Average metrics for 'we

In [7]:
# Step 2: Experiment with Stop Words
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

# Step 1: Extract features with stopwords
stop_words = set(stopwords.words('english'))

def extract_features_with_stopwords(emails):
    features = []
    for email_text in emails:
        tokens = word_tokenize(email_text)
        filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
        features.extend(filtered_tokens)
    return features

ham_features_with_stopwords = extract_features_with_stopwords(ham_emails)
spam_features_with_stopwords = extract_features_with_stopwords(spam_emails)
all_features_with_stopwords = FreqDist(ham_features_with_stopwords + spam_features_with_stopwords)
word_features_with_stopwords = [feature for feature, _ in all_features_with_stopwords.most_common(N)]

def email_features_with_stopwords(email_text):
    email_words = set(word_tokenize(email_text))
    features = {}
    for word in word_features_with_stopwords:
        features[word] = (word in email_words)
    return features

unigram_feature_sets_with_stopwords = [(email_features_with_stopwords(email), 'ham') for email in ham_emails] + [(email_features_with_stopwords(email), 'spam') for email in spam_emails]

print("Results for Unigram Features with Stopword Filtering:")
kf = KFold(n_splits=3, shuffle=True)

for train_index, test_index in kf.split(unigram_feature_sets_with_stopwords):
    train_set = [unigram_feature_sets_with_stopwords[i] for i in train_index]
    test_set = [unigram_feature_sets_with_stopwords[i] for i in test_index]
    nb_classifier_with_stopwords = nltk.NaiveBayesClassifier.train(train_set)    
    true_labels = [label for _, label in test_set]
    predicted_labels = [nb_classifier_with_stopwords.classify(features) for features, _ in test_set]    
    report = classification_report(true_labels, predicted_labels, output_dict=True)
    accuracy = report['accuracy']
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score = report['macro avg']['f1-score']    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels))


    

Results for Unigram Features with Stopword Filtering:
Accuracy: 0.9385150812064965
Precision: 0.9097772642196424
Recall: 0.9561160235798499
F1 Score: 0.9278812533248721
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.92      0.96      1244
        spam       0.82      1.00      0.90       480

    accuracy                           0.94      1724
   macro avg       0.91      0.96      0.93      1724
weighted avg       0.95      0.94      0.94      1724

Accuracy: 0.9373549883990719
Precision: 0.9148902258079473
Recall: 0.9543916667195643
F1 Score: 0.9297870896214675
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.91      0.95      1198
        spam       0.83      1.00      0.91       526

    accuracy                           0.94      1724
   macro avg       0.91      0.95      0.93      1724
weighted avg       0.95      0.94      0.94      1724

Accuracy: 0.9373

In [8]:
# Step 2: Experiment with Negation Representation
negation_words = set(["not", "no", "n't"])

def email_features_with_negation(email_text):
    email_words = set(word_tokenize(email_text))
    features = {}
    for word in word_features_with_stopwords:
        features[word] = (word in email_words)
        features["not_" + word] = ("not" in email_words and word in email_words)
    return features

unigram_feature_sets_with_negation = [(email_features_with_negation(email), 'ham') for email in ham_emails] + [(email_features_with_negation(email), 'spam') for email in spam_emails]

print("Results for Unigram Features with Negation Representation:")
for train_index, test_index in kf.split(unigram_feature_sets_with_negation):
    train_set = [unigram_feature_sets_with_negation[i] for i in train_index]
    test_set = [unigram_feature_sets_with_negation[i] for i in test_index]

    nb_classifier_with_negation = nltk.NaiveBayesClassifier.train(train_set)

    true_labels = [label for _, label in test_set]
    predicted_labels = [nb_classifier_with_negation.classify(features) for features, _ in test_set]

    report = classification_report(true_labels, predicted_labels, output_dict=True)
    accuracy = report['accuracy']
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1_score = report['macro avg']['f1-score']

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1_score)
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels))

Results for Unigram Features with Negation Representation:
Accuracy: 0.9425754060324826
Precision: 0.9198262330926397
Recall: 0.9562932115638165
F1 Score: 0.9344493489524823
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.92      0.96      1210
        spam       0.84      0.99      0.91       514

    accuracy                           0.94      1724
   macro avg       0.92      0.96      0.93      1724
weighted avg       0.95      0.94      0.94      1724

Accuracy: 0.9431554524361949
Precision: 0.9184416206068675
Recall: 0.9576246370291603
F1 Score: 0.9342005234297108
Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.92      0.96      1225
        spam       0.84      0.99      0.91       499

    accuracy                           0.94      1724
   macro avg       0.92      0.96      0.93      1724
weighted avg       0.95      0.94      0.94      1724

Accuracy: 0