## Lab Class ML:IV
## Ankit Satpute, 120825 ; Hsueh Wei, 120820; Sagar Nagaraj Simha, 120797 - (M.Sc. CS4DM) - Group 13

In [35]:
import tarfile
import random
import email
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import pandas as pd

In [36]:
#Starter Code

def iterate_emails(tar_path):
    """Extract individual email messages from the tar file located at 
       tar_path. Returns a generator object. """
    tar = tarfile.open(tar_path)
    emails = (f for f in tar if f.isfile())
    for info in emails:
        f = tar.extractfile(info)
        ## parse contents of compressed file into an Email-object:
        msg = email.message_from_binary_file(f)
        yield msg
        f.close()
        
        
def mail_text(msg):
    """Decode and extract the headers and body from the Email message 
       object given as msg, and return them as a single string. """
    headers = []
    for k, v in msg.items():
        headers.append(k)
        if type(v) is str:
            headers.append(v)
    text_parts = (p for p in msg.walk() 
                  if p.get_content_type().startswith('text'))
    contents = []
    for txt in text_parts:
        charset = txt.get_content_charset()
        try:
            ## decode MIME encoding
            payload = txt.get_payload(decode=True)
            try:
                payload = payload.decode(charset)
            except:
                ## if the charset from the header doesn't work, force UTF-8
                payload = payload.decode('utf-8', 'replace')
            contents.append(payload)
        except:
            contents.append(txt.get_payload())
    return " ".join(headers + contents)

## 4(b)

In [37]:
def get_email_text(path):
    email_list = list(iterate_emails(path))
    text_content = []
    for msgs in email_list:
        a= mail_text(msgs)
        text_content.append(a)
    return text_content

#Files are downloaded first and then used for extracting emails from them
ham = get_email_text('20030228_easy_ham_2.tar.bz2')
spam = get_email_text('20050311_spam_2.tar.bz2')
[len(spam),len(ham)]

[1397, 1401]

## 4(c)

In [38]:
# function tokenize splits emails into tokens and in this case we are getting 
# rid of everything else except alphanumeric characters in order to filter our data in begining
def tokenize(text):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(text)

#using function tokenize to create list of words from each email

def tokenize_ham_spam(jam):
    temp1 = []
    for msgs in jam:
        temp1.append(tokenize(msgs))
    return temp1

#Now onwards this will be our original collection of Ham and spam messages

ham_re = tokenize_ham_spam(ham)
spam_re = tokenize_ham_spam(spam)
[len(spam),len(ham)]

[1397, 1401]

In [39]:
tokenize("this is a test string")

['this', 'is', 'a', 'test', 'string']

## 4(d)

In [40]:
# this function returns words as 'keys' and their occurance as 'values' in dictionaries

def word_count(words):
    c = Counter()
    for l in words:
        c.update(l)
    return dict(c)

In [41]:
#This class implements naive bayes spam filter 

class SpamClassifier(object):
    # constructor takes 2 parameters i.e. spam and ham which are already split into their constituent tokens
    # then we use word_count to calculate no. of occurances of word 
    # we create total list of words from spam and ham in order to calculate P(wi/spam) for all words
    
    def __init__(self, spam, ham):
        spam_words = word_count(spam)
        no_s = len(spam)
        ham_words = word_count(ham)
        no_h = len(ham)
        total_words = dict()
        total_words.update(ham_words)
        total_words.update(spam_words)
        self.prob_word_provi_spam = dict()
        
        # we implement formula given in the article (4(a)) for assigning probabilities based on occurances
        # then we save all the assigned probabilities into the dict so that we can use it effectively while calculating P(spam/ word)
        
        for word in total_words.keys():
            g = b = 0
            if word in spam_words:
                b = 2 * spam_words[word]
            if word in ham_words:
                g = ham_words[word]
            if (g + b > 5):
                prob = max(0.01, min(0.99, float(min(1, b/no_s) / (min(1, g/no_h) + min(1, b/no_s)))))
                self.prob_word_provi_spam[word] = prob
        
        #this represent probability of being spam which simply re[resent no of spam messages in total messages
        self.prob_spam = no_s / (no_s + no_h)

    # this is method predict for calculating prob such that new word is spam or not
    def predict(self, test_words):
        
        # we create dict of all the test email words in order to calculate interesting words
        # where interesting is measured by how far their spam probability is from a neutral .5
        # if word is not found in P(word/spam) then we assign 0.1 because abs(0.5-0.4)=0.1 always
        
        test = dict()
        for word in test_words:
            if word in self.prob_word_provi_spam:
                test[word] = abs(0.5 - self.prob_word_provi_spam[word])
            else:
                test[word] = abs(0.1)
        
        # As mentioned in the article 4(a)
        # we need to sort all the words because we are only interested in first 15 tokens 
        
        key_words = {k: v for k, v in sorted(test.items(), key=lambda i: i[1], reverse=True)}
    
        # now we calculate final prob whether given word is spam or not using baye's formula
        # note that the word from test mail which is not in our train set prob, we assign prob = 0.4 
        # also, prob(word/ham) = 1 - prob(word/spam) as its ither ham or spam
        
        word_giv_spam = 1
        word_giv_ham = 1
        for word in list(key_words.keys())[:15]:
            if word in self.prob_word_provi_spam:
                word_giv_spam *= self.prob_word_provi_spam[word]
                word_giv_ham *= (1 - self.prob_word_provi_spam[word])
            else:
                self.prob_word_provi_spam[word] = 0.4
                word_giv_spam *= self.prob_word_provi_spam[word]
                word_giv_ham *= (1 - self.prob_word_provi_spam[word])   
        final_prob = (self.prob_spam *word_giv_spam) / (self.prob_spam*word_giv_spam + self.prob_spam*word_giv_ham)
        return final_prob
        
    # this function we implemented in order to return whether the message is spam (True) or not (False) depending on the 
    # probability we calculate in 'predict' method
    # threashold is taken as 0.9 (according to aricle 4(a))
    
    def check_if_spam(self, text, thr=0.9):
        x = self.predict(text)
        tem = False
        if x > thr:
            tem = True
        return tem

In [42]:
cls = SpamClassifier(spam = [["this", "is", "spam"], ["more", "spam"]],
ham = [["this", "is", "ham"], ["more", "ham"]])
cls.predict(['is', "this", "spam", "or", "not"])

0.11636363636363642

## 4(e)

In [43]:
# randomly shuffle the messages 

random.shuffle(ham_re)
random.shuffle(spam_re)

# We select 100 random examples from each of the spam and ham classes 
# to use as a validation set, and exclude them from the training data

spam_train = spam_re[100:]
spam_valida = spam_re[:100]
ham_train = ham_re[100:]
ham_valida = ham_re[:100]

cls_1 = SpamClassifier(spam_train, ham_train)
a = b= c= d = 0
for data in spam_valida:
    if cls_1.check_if_spam(data) is True:
        a +=1
    else:
        b +=1

for data in ham_valida:
    if cls_1.check_if_spam(data) is True:
        c +=1
    else:
        d +=1
data = [[a, b], [c, d]] 
df = pd.DataFrame(data, columns = ['predicted class “spam”', 'predicted class “ham”'], index= ['true class “spam”', 'true class “ham”'])
print(df)

                   predicted class “spam”  predicted class “ham”
true class “spam”                     100                      0
true class “ham”                        4                     96


## 4(f)

In [44]:
# We train our classifier on the entire dataset (including the validation set) and examine the conditional
# spam probabilities for individual words that our classifier computes during training and print results asked in 4(f)
# We create two dicts based on prob of words being spam i.e. first 3 words in the list with "greater prob on begining" 
# will be tokens as strongest evidance as spam
# and those with lowest prob will be strongest evidance as ham

cls_2 = SpamClassifier(spam_re, ham_re)
prob = cls_2.prob_word_provi_spam
prob_sorted = {k: v for k, v in sorted(prob.items(), key=lambda item: item[1])}
prob_sorted_rever = {k: v for k, v in sorted(prob.items(), key=lambda item: item[1], reverse = True)}

first_3_ham = {k: prob_sorted[k] for k in list(prob_sorted)[:3]}
first_3_spam = {k: prob_sorted_rever[k] for k in list(prob_sorted_rever)[:3]}
print('three words or tokens that are the "strongest" evidence that an email is "spam" are: ', list(first_3_spam.keys()))
print('three words or tokens that are the "strongest" evidence that an email is "ham" are: ', list(first_3_ham.keys()))

first_6_weak_spam = {k: prob_sorted[k] for k in list(prob_sorted)[:6]}
first_6_wek_ham = {k: prob_sorted_rever[k] for k in list(prob_sorted_rever)[:6]}
print('three words or tokens that are the "Wekaest" evidence that an email is "spam" are: ', list(first_6_weak_spam.keys()))
print('three words or tokens that are the "weakest" evidence that an email is "ham" are: ', list(first_6_wek_ham.keys()))

three words or tokens that are the "strongest" evidence that an email is "spam" are:  ['YOU', 'span', 'weight']
three words or tokens that are the "strongest" evidence that an email is "ham" are:  ['rpm', 'zzzlist', 'freshrpms']
three words or tokens that are the "Wekaest" evidence that an email is "spam" are:  ['rpm', 'zzzlist', 'freshrpms', 'egwn', 'auth02', 'EGWN']
three words or tokens that are the "weakest" evidence that an email is "ham" are:  ['YOU', 'span', 'weight', 'crackmice', 'insurance', 'ns']


In [45]:
# Testing the trained classifier using the additional datasets and printing misclassification results
# Datasets are already considered as downloaded

ham_new = get_email_text('20030228_hard_ham.tar.bz2')
spam_new = get_email_text('20030228_spam.tar.bz2')
ham_re_1 = tokenize_ham_spam(ham_new)
spam_re_1 = tokenize_ham_spam(spam_new)
a = b= c= d = 0
for data in spam_re_1:
    if cls_1.check_if_spam(data) is True:
        a +=1
    else:
        b +=1

for data in ham_re_1:
    if cls_1.check_if_spam(data) is True:
        c +=1
    else:
        d +=1
data = [[a, b], [c, d]] 
df = pd.DataFrame(data, columns = ['predicted class “spam”', 'predicted class “ham”'], index= ['true class “spam”', 'true class “ham”'])
print(df)

                   predicted class “spam”  predicted class “ham”
true class “spam”                     500                      1
true class “ham”                      229                     22


In [46]:

# Refernces
# https://stackoverflow.com/questions/7971618/python-return-first-n-keyvalue-pairs-from-dict
# https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
# https://www.uni-weimar.de/en/media/chairs/computer-science-department/webis/teaching/previous-semesters/ws-201819/machine-learning/
# https://www.guru99.com/python-lambda-function.html#3
# https://blog.softhints.com/python-get-first-elements-dictionary/