In [1]:
'''
  This program shell reads email data for the spam classification problem.
  The input to the program is the path to the Email directory "corpus" and a limit number.
  The program reads the first limit number of ham emails and the first limit number of spam.
  It creates an "emaildocs" variable with a list of emails consisting of a pair
    with the list of tokenized words from the email and the label either spam or ham.
  It prints a few example emails.
  Your task is to generate feature sets and train and test a classifier.

  Usage:  python classifySPAM.py  <corpus directory path> <limit number>
'''
# open python and nltk packages needed for processing
import os
import sys
import random
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# define a feature definition function here
def document_features(document, word_features):
  """Map each vocabulary word to whether it occurs in the document.

  Args:
    document: iterable of tokens for one email.
    word_features: iterable of vocabulary words to test for.

  Returns:
    dict keyed by 'contains(<word>)' with a boolean value that is True
    when <word> appears among the document's tokens.
  """
  # Deduplicate the tokens once so each lookup below is constant time.
  present = set(document)
  return {
    'contains({})'.format(candidate): candidate in present
    for candidate in word_features
  }

In [3]:
# define a feature definition function here
def document_features_ps(document, word_features):
  """Build boolean 'contains(word)' features from a Porter-stemmed document.

  English stop words are dropped from the document (case-insensitively) and
  the remaining tokens are stemmed before the membership test.

  Args:
    document: iterable of tokens for one email.
    word_features: iterable of vocabulary words to test for.

  Returns:
    dict keyed by 'contains(<word>)' whose value is True when <word> is equal
    to the stem of some non-stop-word token in the document.
  """
  porter_stemmer = PorterStemmer()
  stop_words = set(stopwords.words('english'))

  # A set (instead of a list) makes every lookup below constant time; the
  # resulting feature values are identical.
  document_words = {porter_stemmer.stem(word) for word in document if word.lower() not in stop_words}

  # NOTE(review): the vocabulary words are compared as given, while the
  # document tokens are stemmed, so a vocabulary word only matches when it
  # equals a stem. Confirm whether word_features should be stemmed as well.
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in document_words)
  return features

In [4]:
# define a feature definition function here
def document_features_wc(document, word_features):
  """Build 'count(word)' features from a Porter-stemmed document.

  English stop words are dropped from the document (case-insensitively), the
  remaining tokens are stemmed, and each vocabulary word is mapped to the
  number of times it occurs among those stems.

  Args:
    document: iterable of tokens for one email.
    word_features: iterable of vocabulary words to count.

  Returns:
    dict keyed by 'count(<word>)' with an integer occurrence count
    (0 when the word does not occur).
  """
  porter_stemmer = PorterStemmer()
  stop_words = set(stopwords.words('english'))

  document_words = [porter_stemmer.stem(word) for word in document if word.lower() not in stop_words]

  # Tally the document once. The previous code evaluated
  # `word_features.count(word) in document_words`, i.e. it looked for an
  # integer inside a list of strings, so every feature was always False.
  word_counts = nltk.FreqDist(document_words)

  # NOTE(review): the vocabulary words are compared as given, while the
  # document tokens are stemmed, so a vocabulary word is only counted when it
  # equals a stem. Confirm whether word_features should be stemmed as well.
  features = {}
  for word in word_features:
    features['count({})'.format(word)] = word_counts[word]
  return features

In [5]:
# define a feature definition function here
def document_features_lem(document, word_features):
  """Build boolean 'contains(word)' features from a lemmatized document.

  English stop words are dropped from the document (case-insensitively) and
  the remaining tokens are passed through the WordNet lemmatizer before the
  membership test.

  Args:
    document: iterable of tokens for one email.
    word_features: iterable of vocabulary words to test for.

  Returns:
    dict keyed by 'contains(<word>)' whose value is True when <word> is equal
    to the lemma of some non-stop-word token in the document.
  """
  lemmatizer = WordNetLemmatizer()
  stop_words = set(stopwords.words('english'))

  # A set (instead of a list) makes every lookup below constant time; the
  # resulting feature values are identical.
  document_words = {lemmatizer.lemmatize(word) for word in document if word.lower() not in stop_words}

  # NOTE(review): the vocabulary words are compared as given, while the
  # document tokens are lemmatized. Confirm whether word_features should be
  # lemmatized as well.
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in document_words)
  return features

In [6]:
def classifier_train_result(emaildocs, word_features, ps=False, wc=False, lem=False, svm=False, rf=False):
  """Train and evaluate spam classifiers on a shuffled 80/20 split.

  Two evaluations may be printed:

  * When neither ``svm`` nor ``rf`` is set, an NLTK Naive Bayes classifier is
    trained on dictionary features built from ``word_features`` and its
    accuracy is printed.
  * A scikit-learn classifier is always trained on TF-IDF vectors of the
    email tokens, and its accuracy, precision, recall and F-measure (with
    'spam' as the positive label) are printed.

  Args:
    emaildocs: list of (tokens, label) pairs, label being 'spam' or 'ham'.
    word_features: vocabulary used by the NLTK feature functions.
    ps: use Porter-stemmed 'contains' features for the NLTK classifier.
    wc: use Porter-stemmed 'count' features (takes precedence over ``ps``).
    lem: use lemmatized 'contains' features (takes precedence over both).
    svm: use a linear-kernel SVC as the scikit-learn classifier.
    rf: use a RandomForestClassifier (takes precedence over ``svm``).

  Returns:
    None; results are printed.

  Note:
    ``ps``, ``wc`` and ``lem`` only select the NLTK feature function. The
    TF-IDF model is built from the email tokens as given.
  """
  # Choose the NLTK feature function: lem first, then wc, then ps, then plain.
  if lem:
    extract_features = document_features_lem
  elif wc:
    extract_features = document_features_wc
  elif ps:
    extract_features = document_features_ps
  else:
    extract_features = document_features

  # Shuffle a copy so the caller's list is left untouched, then split 80/20.
  shuffled_docs = list(emaildocs)
  random.shuffle(shuffled_docs)
  split_at = int(len(shuffled_docs) * 0.8)
  train_docs, test_docs = shuffled_docs[:split_at], shuffled_docs[split_at:]

  if not rf and not svm:
    # The dictionary feature sets are only consumed by the NLTK classifier,
    # so they are only built on this path.
    train_set = [(extract_features(doc, word_features), label) for (doc, label) in train_docs]
    test_set = [(extract_features(doc, word_features), label) for (doc, label) in test_docs]

    classifier_simple = nltk.NaiveBayesClassifier.train(train_set)

    # print the accuracy of the classifier on the test set
    print("\nSimple Classifier accuracy:", nltk.classify.accuracy(classifier_simple, test_set))

  # Convert email texts to a list of strings. The tokens themselves are
  # joined here: joining the feature dictionaries instead would concatenate
  # their keys, which are the same for every email.
  training_emails = [' '.join(tokens) for tokens, _ in train_docs]
  testing_emails = [' '.join(tokens) for tokens, _ in test_docs]

  # Use TF-IDF vectorizer to convert emails to feature vectors
  vectorizer = TfidfVectorizer()
  X_train = vectorizer.fit_transform(training_emails)
  X_test = vectorizer.transform(testing_emails)

  # Get the corresponding labels
  y_train = [label for _, label in train_docs]
  y_test = [label for _, label in test_docs]

  # Train classifier
  if rf:
    classifier = RandomForestClassifier()
  elif svm:
    classifier = SVC(kernel='linear')
  else:
    classifier = MultinomialNB()
  classifier.fit(X_train, y_train)

  # Predict labels for testing set
  y_pred = classifier.predict(X_test)

  print("\nAccuracy:", classifier.score(X_test, y_test))
  print("Precision:", precision_score(y_test, y_pred, pos_label='spam'))
  print("Recall:", recall_score(y_test, y_pred, pos_label='spam'))
  print("F-measure:", f1_score(y_test, y_pred, pos_label='spam'))

In [7]:
# function to read spam and ham files, train and test a classifier 
def _read_email_texts(folder, limit):
  """Return the contents of at most ``limit`` '.txt' files found in ``folder``."""
  texts = []
  # os.listdir returns names in arbitrary order; sorting makes the selection
  # reproducible from run to run.
  for name in sorted(os.listdir(folder)):
    if len(texts) >= limit:
      break
    if name.endswith(".txt"):
      # latin-1 decodes every byte, so reading never raises a decode error
      with open(os.path.join(folder, name), 'r', encoding="latin-1") as f:
        texts.append(f.read())
  return texts


# function to read spam and ham files and build the word features
def processspamham(dirPath,limitStr):
  """Load spam and ham emails, tokenize them and build the vocabulary.

  Args:
    dirPath: directory containing 'spam' and 'ham' subdirectories of .txt
      files. The process working directory is not changed.
    limitStr: maximum number of emails to read from each subdirectory
      (int, or a string that int() accepts).

  Returns:
    (emaildocs, word_features) where emaildocs is a shuffled list of
    (tokens, label) pairs with label 'spam' or 'ham', and word_features is
    the list of up to 2000 most frequent non-stop-word tokens.

  Raises:
    ValueError: if limitStr cannot be converted to an int.
    OSError: if a subdirectory or file cannot be read.
  """
  # convert the limit argument from a string to an int
  limit = int(limitStr)

  # read the email texts without changing the current working directory
  spamtexts = _read_email_texts(os.path.join(dirPath, "spam"), limit)
  hamtexts = _read_email_texts(os.path.join(dirPath, "ham"), limit)

  # print number emails read
  print("Number of spam files:", len(spamtexts))
  print("Number of ham files:", len(hamtexts))
  print()

  # create list of mixed spam and ham email documents as (list of words, label)
  emaildocs = [(nltk.word_tokenize(spam), 'spam') for spam in spamtexts]
  emaildocs.extend((nltk.word_tokenize(ham), 'ham') for ham in hamtexts)

  # randomize the list
  random.shuffle(emaildocs)

  # print a few token lists
  for email in emaildocs[:4]:
    print(email)
    print()

  # filter stop words (case-insensitively) before counting
  stop_words = set(stopwords.words('english'))
  filtered_tokens = [word for email in emaildocs for word in email[0] if word.lower() not in stop_words]

  # Keep the 2000 most frequent words. Slicing list(all_words.keys()) would
  # keep the first 2000 words encountered, regardless of frequency.
  all_words = nltk.FreqDist(filtered_tokens)
  word_features = [word for word, _ in all_words.most_common(2000)]

  return emaildocs, word_features

In [8]:
def multi_nb(corpus_dir, limit):
  """Run the baseline experiment: load the corpus and evaluate with default flags.

  Args:
    corpus_dir: corpus directory handed to processspamham.
    limit: per-class email limit handed to processspamham.
  """
  print("\nMultinomial Naive Bayes\n")
  # processspamham returns (emaildocs, word_features), in the order the
  # trainer expects its first two positional arguments.
  loaded = processspamham(corpus_dir, limit)
  classifier_train_result(*loaded)

In [9]:
def multi_nb_ps(corpus_dir, limit):
  """Run the experiment with the Porter-stemmer feature flag (ps=True).

  Args:
    corpus_dir: corpus directory handed to processspamham.
    limit: per-class email limit handed to processspamham.
  """
  print("\nMultinomial Naive Bayes with Porter Stemmer\n")
  # processspamham returns (emaildocs, word_features), in the order the
  # trainer expects its first two positional arguments.
  loaded = processspamham(corpus_dir, limit)
  classifier_train_result(*loaded, ps=True)

In [10]:
def multi_nb_wc(corpus_dir, limit):
  """Run the experiment with the word-count feature flag (wc=True).

  Args:
    corpus_dir: corpus directory handed to processspamham.
    limit: per-class email limit handed to processspamham.
  """
  print("\nMultinomial Naive Bayes (word count) with Porter Stemmer\n")
  # processspamham returns (emaildocs, word_features), in the order the
  # trainer expects its first two positional arguments.
  loaded = processspamham(corpus_dir, limit)
  classifier_train_result(*loaded, wc=True)

In [11]:
def multi_nb_lem(corpus_dir, limit):
  """Run the experiment with the lemmatizer feature flag (lem=True).

  Args:
    corpus_dir: corpus directory handed to processspamham.
    limit: per-class email limit handed to processspamham.
  """
  print("\nMultinomial Naive Bayes with Lemmatizer\n")
  # processspamham returns (emaildocs, word_features), in the order the
  # trainer expects its first two positional arguments.
  loaded = processspamham(corpus_dir, limit)
  classifier_train_result(*loaded, lem=True)

In [12]:
def svm_ps(corpus_dir, limit):
  """Run the experiment with the SVM classifier and the Porter-stemmer flag.

  Args:
    corpus_dir: corpus directory handed to processspamham.
    limit: per-class email limit handed to processspamham.
  """
  print("\nSVM with Porter Stemmer\n")
  # processspamham returns (emaildocs, word_features), in the order the
  # trainer expects its first two positional arguments.
  loaded = processspamham(corpus_dir, limit)
  classifier_train_result(*loaded, ps=True, svm=True)

In [13]:
def rf(corpus_dir, limit):
  """Run the experiment with the random-forest classifier (rf=True).

  Args:
    corpus_dir: corpus directory handed to processspamham.
    limit: per-class email limit handed to processspamham.
  """
  print("\nRandom Forest Classifier\n")
  # processspamham returns (emaildocs, word_features), in the order the
  # trainer expects its first two positional arguments.
  loaded = processspamham(corpus_dir, limit)
  classifier_train_result(*loaded, rf=True)

In [15]:
"""
commandline interface takes a directory name with ham and spam subdirectories
   and a limit on the number of emails read from each of ham and spam
It then processes the files and trains a spam detection classifier.

"""
if __name__ == '__main__':
    # Corpus location: set the SPAM_CORPUS_DIR environment variable to run
    # on another machine; otherwise the original local path is used.
    corpus_dir = os.environ.get(
        'SPAM_CORPUS_DIR',
        '/Users/prasunabhishek/Desktop/Syracuse Course Content - MS in Business Analytics/Syracuse Winter sem - NLP - Winter 2023-24/Project/EmailSpamCorpora/corpus',
    )
    # Maximum number of emails read from each of the spam and ham folders.
    limit = 1000
    multi_nb(corpus_dir, limit)


Multinomial Naive Bayes

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 'enron', 'actuals', 'for', 'dec', '.', '27', ',', '2000', 'dec', '.', '27', ',', '2000', 'teco', 'tap', '30', '.', '000', '/', 'enron', ';', '120', '.', '000', '/', 'hpl', 'gas', 'daily', 'lsp', 'hpl', 'katy', 'i', '/', 'c', '30', '.', '000', '/', 'enron'], 'ham')
(['Subject', ':', 'select', 'eshopping', 'for', 'medicines', 'and', 'take', 'advantage', 'of', 'the', 'specials', 'at', 'our', 'cyberstore', '.', 'if', 'you', 'have', 'a', 'tight', 'budget', 'and', 'still', 'prefer', 'quality', 'tablets', 'to', 'alleviate', 'the', 'pain', ',', 'seek', 'a', 'better', 'resolution', '.', 'we', 'provide', 'the', 'pricing', 'that', 'reduce', 'your', 'expenses', 'on', 'dr', '.', 'prescribed', 'rememdies', 'for', 'pain', ',', 'swelling', ',', 'dysfunction', 'of', 'the', 'erectile', 'member', ',', 'stress', ',', 'raised', 'cholesterol', ',', 'man', "'", 's', 'care', ',', 'muscle', 'relaxing', 'and', 'slee

In [28]:
if __name__ == '__main__':
    # Corpus location: set the SPAM_CORPUS_DIR environment variable to run
    # on another machine; otherwise the original local path is used.
    corpus_dir = os.environ.get(
        'SPAM_CORPUS_DIR',
        '/Users/prasunabhishek/Desktop/Syracuse Course Content - MS in Business Analytics/Syracuse Winter sem - NLP - Winter 2023-24/Project/EmailSpamCorpora/corpus',
    )
    # Maximum number of emails read from each of the spam and ham folders.
    limit = 1000
    multi_nb_ps(corpus_dir, limit)


Multinomial Naive Bayes with Porter Stemmer

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 'sterling', 'balance', 'sheet', 'strengthens', 'underpriced', 'stock', 'now', 'that', 'oil', 'and', 'gas', 'has', 'entered', 'a', 'long', '-', 'term', 'bull', 'market', ',', 'our', 'speciaity', 'in', 'pinpointing', 'the', 'hottest', 'companies', 'of', 'the', 'few', 'remaining', 'undervalued', 'energy', 'plays', 'has', 'produced', 'soaring', 'returns', '.', 'montana', 'oil', 'and', 'gas', ',', 'inc', '.', '(', 'mogi', ')', 'to', 'expiore', 'further', 'opportunities', 'in', 'aiberta', 'canada', ',', 'a', 'is', 'an', 'energy', 'deveioper', 'in', 'canada', "'", 's', 'most', 'highly', 'coveted', 'reservoirs', 'with', 'generating', 'potential', 'of', 'miliions', 'per', 'week', 'symbol', '-', 'mogi', 'price', '-', '.', '47', 'increased', '11', '%', 'last', 'three', 'day', ',', 'rating', '-', 'strongbuy', 'how', 'much', 'it', 'wi', '|', '|', 'up', 'again', '?', 'the', 'vaiue', '

In [18]:
if __name__ == '__main__':
    # Corpus location: set the SPAM_CORPUS_DIR environment variable to run
    # on another machine; otherwise the original local path is used.
    corpus_dir = os.environ.get(
        'SPAM_CORPUS_DIR',
        '/Users/prasunabhishek/Desktop/Syracuse Course Content - MS in Business Analytics/Syracuse Winter sem - NLP - Winter 2023-24/Project/EmailSpamCorpora/corpus',
    )
    # Maximum number of emails read from each of the spam and ham folders.
    limit = 1000
    multi_nb_wc(corpus_dir, limit)


Multinomial Naive Bayes (word count) with Porter Stemmer

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 'low', 'rate', 'credit', 'credit', 'problems', '?', 'no', 'problem', '!', '-', 'we', 'can', 'erase', 'your', 'bad', 'credit', '-', '1', 'oo', '%', 'guaranted', '-', 'repair', 'your', 'credit', 'history', 'legally', '-', 'get', 'you', 'on', 'your', 'way', 'to', 'purchasing', 'that', 'new', 'home', 'or', 'new', 'car', 'with', 'ease', 'it', 'doesn', "'", 't', 'matter', 'if', 'you', 'have', 'foreclosures', ',', 'bankruptcies', ',', 'repossessions', ',', 'charge', '-', 'offs', ',', 'or', 'even', 'late', 'payments', ';', 'the', 'law', 'is', 'on', 'your', 'side', 'and', 'allows', 'them', 'to', 'be', 'legally', 'removed', '.', 'it', "'", 's', 'time', 'to', 'stop', 'worrying', 'about', 'your', 'less', 'than', 'perfect', 'credit', '.', 'let', 'us', 'help', 'you', ',', 'we', 'don', "'", 't', 'just', 'think', 'we', 'can', '.', '.', '.', 'we', 'know', 'we', 'can', '.', '

In [21]:
if __name__ == '__main__':
    # Corpus location: set the SPAM_CORPUS_DIR environment variable to run
    # on another machine; otherwise the original local path is used.
    corpus_dir = os.environ.get(
        'SPAM_CORPUS_DIR',
        '/Users/prasunabhishek/Desktop/Syracuse Course Content - MS in Business Analytics/Syracuse Winter sem - NLP - Winter 2023-24/Project/EmailSpamCorpora/corpus',
    )
    # Maximum number of emails read from each of the spam and ham folders.
    limit = 1000
    multi_nb_lem(corpus_dir, limit)


Multinomial Naive Bayes with Lemmatizer

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 're', ':', 'saxet', 'canales', 'meter', '980437', 'for', 'august', ',', '2000', 'sorry', 'about', 'that', ',', 'i', 'had', 'some', 'tech', 'difficulty', 'with', 'the', 'first', 'version', 'and', 'the', 'subject', 'line', ',', 'which', 'was', 'pertinent', ',', 'was', 'not', 'included', '.', 'mary', '-', 'good', 'catch', 'carlos'], 'ham')
(['Subject', ':', 'legal', 'operating', 'systems', 'for', 'a', 'third', 'of', 'the', 'price', 'catenate', 'sneermullion', 'conscientious', 'damanonymous', 'native', 'dwightshari', 'ira', 'recruittimeshare', 'precise', 'ferrise', "'", 's', 'blink', 'minimumbimonthly', 'trilogy', 'lacrossedrown', 'kirchner', 'resuscitateferry', 'baffle', 'bogycluck', 'bamberger', 'bangkokamherst', 'amos', 'mazeligand', 'propaganda', 'corpsmenfibrosis', 'lutheran', 'trayerrand', 'cringe', 'discerncabdriver', 'middle', 'tedinburgh', 'barter', 'clydesaloonkeeper',

In [26]:
if __name__ == '__main__':
    # Corpus location: set the SPAM_CORPUS_DIR environment variable to run
    # on another machine; otherwise the original local path is used.
    corpus_dir = os.environ.get(
        'SPAM_CORPUS_DIR',
        '/Users/prasunabhishek/Desktop/Syracuse Course Content - MS in Business Analytics/Syracuse Winter sem - NLP - Winter 2023-24/Project/EmailSpamCorpora/corpus',
    )
    # Maximum number of emails read from each of the spam and ham folders.
    limit = 1000
    svm_ps(corpus_dir, limit)


SVM with Porter Stemmer

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 'learn', 'to', 'save', 'on', 'medications', 'at', 'discount', 'pharmacy', 'hello', ',', 'save', 'your', 'health', 'your', 'money', 'get', 'all', 'the', 'medication', 'you', 'need', 'at', 'incredible', '80', '%', 'discounts', '.', 'http', ':', '/', '/', 'werwer', '4723', '.', 'com', '/', '?', 'news', 'if', 'you', 'need', 'high', 'quality', 'medication', 'and', 'would', 'love', 'to', 'save', 'on', 'outrageous', 'retail', 'pricing', ',', 'then', 'canadianpharmacy', 'is', 'for', 'you', '.', 'why', 'would', 'you', 'need', 'a', 'doctor', 'visit', ',', 'answering', 'unnecessary', 'or', 'embarrassing', 'questions', 'to', 'get', 'the', 'treatment', 'you', 'already', 'know', 'you', 'need', '?', 'choose', 'it', 'yourself', 'and', 'save', 'big', 'on', 'doctor', 'visits', 'and', 'retail', 'prices', ',', 'in', 'just', '2', 'simple', 'steps', '!', '[', 'you', 'should', 'know', 'that', 'our', 'online', 'sh

In [33]:
if __name__ == '__main__':
    # Corpus location: set the SPAM_CORPUS_DIR environment variable to run
    # on another machine; otherwise the original local path is used.
    corpus_dir = os.environ.get(
        'SPAM_CORPUS_DIR',
        '/Users/prasunabhishek/Desktop/Syracuse Course Content - MS in Business Analytics/Syracuse Winter sem - NLP - Winter 2023-24/Project/EmailSpamCorpora/corpus',
    )
    # Maximum number of emails read from each of the spam and ham folders.
    limit = 1000
    rf(corpus_dir, limit)


Random Forest Classifier

Number of spam files: 1000
Number of ham files: 1000
(['Subject', ':', 'meter', '986884', 'for', '1', '/', '2001', 'daren', '-', 'do', 'we', 'have', 'the', 'buy', '/', 'sell', 'deal', 'for', 'el', 'paso', '(', 'formerly', 'teco', ')', 'for', 'january', '01', 'prod', '?', 'the', 'old', 'deal', 'numbers', 'were', '235670', 'for', 'the', 'sale', 'and', '137870', 'for', 'the', 'purchase', '.', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'forwarded', 'by', 'katherine', 'herrera', '/', 'corp', '/', 'enron', 'on', '02', '/', '23', '/', '2001', '03', ':', '59', 'pm', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', 'from', ':', 'megan', 'parker', '02', '/', '23', '/', '2001', '03', ':', '57', 'pm', 'to', ':', 'katherine', 'herrera', '/', 'corp', '/', 'enron', '@', 'enron', 'cc', ':', 'michael', 'olsen', '/', 'na', '/'