In [13]:
# Final Project
# Email SPAM filter
import os

# Read all the data from the spam and ham directories

# start lists for spam and ham email texts
hamtexts = []
spamtexts = []
    
for file in os.listdir("./EmailSpamCorpora/corpus/spam"):
    if (file.endswith(".txt")):
        f = open("EmailSpamCorpora/corpus/spam/" + file, 'r', encoding="latin-1")
        spamtexts.append(f.read())
        f.close()
        
for file in os.listdir("./EmailSpamCorpora/corpus/ham"):
    if (file.endswith(".txt")):
        f = open("./EmailSpamCorpora/corpus/ham/" + file, 'r', encoding="latin-1")
        hamtexts.append(f.read())
        f.close()

print("Number of spam files:",len(spamtexts))
print("Number of ham files:", len(hamtexts))

Number of spam files: 1500
Number of ham files: 3672


In [44]:
import nltk
# Combine all the spam and ham into a single data-structure
emaildocs = []

# Before we create word tokens, let's filter the tokenized words for:
# - stop-words (using nltk.stopwords)
# - remove common email characters like Subject..
# - remove characters like :.,$#?* etc.

stopwords = nltk.corpus.stopwords.words('english')
my_words = ['Subject', 'com', 'http', 'www']
my_chars = [',', '.', '?', '%', '#', '*', '$', 
            ':', '/', '\\', ';', '&', '@', '-',
            '\'', '_', '[', ']', '(', ')',
            '!', '\'\'', '``', '{', '}']
stopwords.extend(my_words)
stopwords.extend(my_chars)

# Use the stopwords + filters and measure reduction benefit
tokens_all_count = 0
tokens_filter_count = 0
for spam in spamtexts:
    # word tokenize it
    tokens_all = nltk.word_tokenize(spam)
    # first, we will put all unique words in the email
    tokens_unique = set(tokens_all)
    tokens_filter = list(tokens_unique - set(stopwords))
    tokens_all_count += len(tokens_all)
    tokens_filter_count += len(tokens_filter)
    # print(len(tokens_filter))
    # break
    #emaildocs.append((tokens, 'spam'))

reduction_pct = (tokens_all_count - tokens_filter_count) / tokens_all_count * 100
print('Tokens(All) = %d vs Tokens(Filtered) = %d' %(tokens_all_count, tokens_filter_count))

print('Overall reduction = %s' %reduction_pct)


Tokens(All) = 355375 vs Tokens(Filtered) = 142146
Overall reduction = 60.001125571579315


In [45]:
# Let's now make this a function so, we can form our data-set:
# [(<tokenize-words), <spam/ham>)...]
# list of tuples

def create_dataset(rawtext, tag):
    tmpdocs = []
    for text in rawtext:
        tokens_all = nltk.word_tokenize(text)
        tokens_unique = set(tokens_all)
        tokens_filter = list(tokens_unique - set(stopwords))
        tmpdocs.append((tokens_filter, tag))
    return tmpdocs

spam = create_dataset(spamtexts, 'spam')
ham = create_dataset(hamtexts, 'ham')

# Check a few of them before we combine the lists
print(spam[:5])
print(ham[:5])

spam_n_ham = spam + ham

[(['time', 'road', 'naturalgolden', 'ress', 'companion', 'find', 'terrific', 'youll', 'www', 'turn', 'bio', 'love', 'cam', 'developed', 'meganbang', 'site', 'movie', 'line', 'try', 'help', 'amazed', 'date', 'encomia', 'form', 'matter', 'catatonia', 'fashioned', 'intervenor', 'catfish', 'every', 'see', 'acc', 'brandywine', 'good', 'preemptive', 'babe', 'plz', 'war', 'shoehorn', 'word', 'sense', 'browser', 'ole', 'skeleton', 'retract', 'satisfaction', 'electrocardiograph', 'counterattack', 'biz', 'quick', 'scaup', 'evening', 'brand', 'lookup', 'created', 'copy', 'step', 'byrne', 'friendship', 'may', 'looking', 'pietism', 'anyone', 'pa', 'monster', 'ste', 'come', 'honeycomb', 'new', 'bld', 'add', 'aitken'], 'spam'), (['random', 'turnkey', 'want', 'money', 'future', 'like', 'mailings', 'tracking', 'userid', 'receive', 'would', 'complete', 'word', 'clck', 'express', 'learn', 'federal', 'videos', 'number', 'system', 'make', 'shipped', 'registering', 'fortune', 'via', 'thank', 'order', 'confi

In [47]:
# Shuffle the data prior modelling it
import random
random.shuffle(spam_n_ham)

# See the format:
#.  list of tuples where the tuple is (<tokenized-words>, category = <spam/ham>)
spam_n_ham[0]

(['remote',
  'www',
  'back',
  'sport',
  'special',
  'avocation',
  'channels',
  'allow',
  'receive',
  'control',
  'despoil',
  '8006',
  'events',
  'axxxmovies',
  'hosting',
  'cable',
  'emile',
  'payperviews',
  'order',
  'cablefilterz'],
 'spam')

In [51]:
# We'll find the 2000 most common words and use them as an 
# important feature of the whole corpus
def unigram_freq(docs):
    all_words = []
    # Write a regex to pull only the word portion & leave 
    # out any punctuation marks etc.
    for (word_tokens, category) in docs:
        for word in word_tokens:
            # Not writing a regex here since we want to know if 
            # random patterns or words would cause an email to be spam
            all_words.append(word)
    top_words = nltk.FreqDist(all_words)
    most_common_words = top_words.most_common(2000)
    word_features = [word for (word,count) in most_common_words]
    return all_words, word_features

# uni_features now has the top-2000 most common words
# across the entire spam and ham data
all_words, uni_features = unigram_freq(spam_n_ham)

print(all_words[:5], uni_features[:5])

['remote', 'www', 'back', 'sport', 'special'] ['please', '2000', 'subject', 'enron', 'thanks']


In [61]:
def document_features(document, word_features):
    document_words = set(document)
    # we open a Pytnon dictionary instead of a list
    features = {}
    for word in word_features:
        #checking if the word from word_features matches a word in the document
        features['contains({})'.format(word)] = (word in document_words)
    return features

# Essentially, when we call document_features(), we should have a feature-set:
# contains(<word>): <True/False>, category = spam/ham
# .
# .
# upto 2k words & this repeats for every tokenized word in spam_n_ham
# which is based on uni-gram tokens.

uni_featuresets = [(document_features(d, uni_features), c) for (d, c) in spam_n_ham]
# print(uni_featuresets[0])

# Also, curious about the category and what words were True for
# the first featureset
words = []
for feature, _bool in uni_featuresets[0][0].items():
    if _bool == True:
        _word = feature.split('(')[1].split(')')[0]
        words.append(_word)
print('Email classified %s has the following words: %s' %(uni_featuresets[0][1], words))

Email classified spam has the following words: ['back', 'www', 'order', 'receive', 'special', 'control', 'allow', 'events']


In [62]:
# Let's run the Naive Bayes classification algorithm
# and measure the accuracy.

import numpy as np
from sklearn.model_selection import KFold

def ml_nb(featuresets):
    kf = KFold(n_splits = 5)
    sum = 0

    for train, test in kf.split(featuresets):
        train_data = np.array(featuresets)[train]
        test_data = np.array(featuresets)[test]
        classifier = nltk.NaiveBayesClassifier.train(train_data)
        sum += nltk.classify.accuracy(classifier, test_data)
        
        
    #storing the score in a variable 
    acc1 = sum/5
    
    return  classifier, acc1

# Let's call the function ml_nb which splits the data based on
# cross-validation and fold/k = 5
uni_classifier, uni_accuracy = ml_nb(uni_featuresets)
print(uni_accuracy)

0.9566889991496836


In [68]:
# Let's try the same with bi-grams to see if we get better
# accuracy.

from nltk.collocations import *

def bigram_freq(all_words):
    #creating bigrams features for the corpus and applying cleaning steps
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(all_words)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    
    #extracting clean bigrams (no frequency information)
    bigram_features = [bigram for (bigram, count) in scored[:2000]]
    
    return bigram_features


bi_features = bigram_freq(all_words)
print(bi_features[:5])

def bi_document_features(document, bigram_features):
    document_words = list(nltk.bigrams(document))
    features = {}
    for word in bigram_features:
        #boolean logic will return 'True' if there is a match, or 'False' if not
        features['contains({})'.format(word)] = (word in document_words)
    return features


bi_featuresets = [(bi_document_features(d, bi_features), c) for (d, c) in spam_n_ham]

words = []
for feature, _bool in bi_featuresets[0][0].items():
    if _bool == True:
        words.append(feature)

print('Email classified %s has the following words: %s' %(bi_featuresets[0][1], words))

[('01', 'cc'), ('xls', '2000'), ('pm', '2000'), ('daren', 'pm'), ('see', 'hpl')]
Email classified spam has the following words: ["contains(('receive', 'control'))", "contains(('allow', 'receive'))"]


In [69]:
# Let's call the function ml_nb which splits the data based on
# cross-validation and fold/k = 5
bi_classifier, bi_accuracy = ml_nb(bi_featuresets)
print(bi_accuracy)

0.8383556190956746


In [78]:
# Given, the accuracy is really good especially for uni-gram classifier,
# let us also fetch the f-measure 

# Also, we will use the classifier we obtained from the model for both 
# the uni-gram & bi-gram classifiers. Further, since we don't have separate 
# test-data, we will sample 20% of the bottom end of the data as test.

test_len = int(0.2 * len(spam_n_ham))
test_data = spam_n_ham[:test_len]

actual = []
predicted = []

for words, cat in test_data:
    predict = uni_classifier.classify(document_features(words, uni_features))
    predicted.append(predict)
    actual.append(cat)

print(actual[:10], predicted[:10])    

['spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham'] ['spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham']


In [79]:
# Utilizing a function from labs let's now obtain the eval_measures
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

# Eval measures for uni-gram classifier
eval_measures(actual, predicted)

	Precision	Recall		F1
spam 	      0.993      0.882      0.935
ham 	      0.945      0.997      0.971


In [80]:
# And repeat the same for bi-gram classifier too
actual = []
predicted = []

for words, cat in test_data:
    predict = bi_classifier.classify(bi_document_features(words, bi_features))
    predicted.append(predict)
    actual.append(cat)

print(actual[:10], predicted[:10])    

['spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham'] ['spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham']


In [81]:
# Eval measures for bi-gram classifier
eval_measures(actual, predicted)

	Precision	Recall		F1
spam 	      0.993      0.682      0.809
ham 	      0.809      0.997      0.893
