Load the data

In [1]:
import glob
import os

In [99]:
emails, labels = [],[]

path_spam = './enron1/spam'

for filename in glob.glob(os.path.join(path_spam,'*.txt')):
    with open(filename,'r',encoding='ISO-8859-1') as f:
        text = f.read()
        emails.append(text)
        labels.append(1)

path_ham = './enron1/ham'

for filename in glob.glob(os.path.join(path_ham,'*.txt')):
    with open(filename,'r',encoding='ISO-8859-1') as f:
        text = f.read()
        emails.append(text)
        labels.append(0)

In [22]:
print(len(emails))

5172


Preprocess the data: 
* Remove names
* Remove stopwords
* Remove numbers and punctuation
* Lemmatization

In [23]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
def letters_only(s):
    return s.isalpha()
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

In [24]:
def clean_docs(docs):
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append(' '.join([lemmatizer.lemmatize(word.lower()) for word in doc.split() if letters_only(word) and word not in all_names]))
    return cleaned_docs

In [25]:
clean_mail = clean_docs(emails)

In [26]:
clean_mail[0]

'dobmeos with hgh my energy level ha gone up stukm introducing doctor formulated hgh human growth hormone also called hgh is referred to in medical science a the master hormone it is very plentiful when we are young but near the age of twenty one our body begin to produce le of it by the time we are forty nearly everyone is deficient in hgh and at eighty our production ha normally diminished at least advantage of hgh increased muscle strength loss in body fat increased bone density lower blood pressure quickens wound healing reduces cellulite improved vision wrinkle disappearance increased skin thickness texture increased energy level improved sleep and emotional stability improved memory and mental alertness increased sexual potency resistance to common illness strengthened heart muscle controlled cholesterol controlled mood swing new hair growth and color restore read more at this website unsubscribe'

Creating term frequencies for the cleaned data

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=1000)

In [28]:
term_docs = cv.fit_transform(clean_mail)

In [36]:
clean_mail[0]

'dobmeos with hgh my energy level ha gone up stukm introducing doctor formulated hgh human growth hormone also called hgh is referred to in medical science a the master hormone it is very plentiful when we are young but near the age of twenty one our body begin to produce le of it by the time we are forty nearly everyone is deficient in hgh and at eighty our production ha normally diminished at least advantage of hgh increased muscle strength loss in body fat increased bone density lower blood pressure quickens wound healing reduces cellulite improved vision wrinkle disappearance increased skin thickness texture increased energy level improved sleep and emotional stability improved memory and mental alertness increased sexual potency resistance to common illness strengthened heart muscle controlled cholesterol controlled mood swing new hair growth and color restore read more at this website unsubscribe'

In [50]:
print(term_docs[0])

  (0, 265)	2
  (0, 482)	2
  (0, 363)	2
  (0, 229)	1
  (0, 361)	2
  (0, 103)	1
  (0, 997)	1
  (0, 86)	2
  (0, 72)	1
  (0, 476)	1
  (0, 898)	1
  (0, 690)	1
  (0, 506)	1
  (0, 865)	1
  (0, 585)	1
  (0, 151)	1
  (0, 714)	1
  (0, 969)	1
  (0, 933)	1


In [54]:
feature_names = cv.get_feature_names_out()
feature_names[:5]

array(['ability', 'able', 'access', 'according', 'account'], dtype=object)

P(B|A) = P(A|B)*P(B)/P(A)

* P(B) is called the Prior
* P(B|A) is called the posterior
* P(A|B) is called the likelihood
* P(A) is called the evidence

In [49]:
#Calculating the Prior

def get_prior(labels):
    '''prior basically implies the probability of a mail in 
    training set belonging to ham or spam. This is basic probability'''
    len_ham,len_spam = 0,0
    for l in labels:
        if l==0:
            len_ham += 1
        else:
            len_spam += 1
    prior = {0: float(len_ham/len(labels)), 1: float(len_spam/len(labels))}
    return prior

prior = get_prior(labels)
print(prior)

{0: 0.7099767981438515, 1: 0.2900232018561485}


In [105]:
#Calculating the Likelihood

import numpy as np

def label_index(labels):
    label_index = {0:[],1:[]}
    for index,l in enumerate(labels):
        if l==0:
            label_index[0].append(index)
        else:
            label_index[1].append(index)
    return label_index

def get_likelihood(label_index,term_docs, laplace_smoothing=0):
    '''likelihood basically states if mail is spam/ham then what is the probabilty that
    this category will contain a particular word
    '''
    likelihood = {}
    for label,index in label_index.items():
        likelihood[label] = term_docs[index,:].sum(axis=0) + laplace_smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_sum = likelihood[label].sum()
        likelihood[label] = likelihood[label]/float(total_sum)
    return likelihood

label_index = label_index(labels)
laplace_smoothing = 1
likelihood = get_likelihood(label_index,term_docs,laplace_smoothing)

In [136]:
#now the final step is to calculate the posterior

def get_posterior(term_docs,prior,likelihood):
    '''we will calculate summation of natural logs of the conditional probabilities
    instead of multiplying them which may cause overflow and then convert the final value'''
    num_docs = term_docs.shape[0]
    posteriors = []
    for i in range(num_docs):
        posterior = {key: np.log(prior_label) for key,prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_docs.getrow(i)
            counts = term_document_vector.data
            indices = term_document_vector.indices
            for count,index in zip(counts,indices):
                posterior[label] += np.log(likelihood_label[index])*count
            min_log_posterior = min(posterior.values())
            for label in posterior:
                try:
                    posterior[label] = np.exp(posterior[label]-min_log_posterior)
                except:
                    posterior[label] = float('inf')
            sum_posterior = sum(posterior.values())
            for label in posterior:
                if posterior[label]==float('inf'):
                    posterior[label] = 1.0
                else:
                    posterior[label] /= sum_posterior
        posteriors.append(posterior.copy())
    return posteriors

In [139]:
test = ["""Hi Sukhman,

This is going to be a bit of a different email than normal since I want to talk all about New Years and the idea of New Year's resolutions. To be honest almost everyone does New Year's resolutions wrong (including you) which is why most people fail at sticking to their resolutions and meaningfully changing their lives. In this article I want to talk about why most people will fail their New Year's resolutions, and what you can do to actually succeed """]

In [140]:
cleaned_test = clean_docs(test)
terms_test = cv.transform(cleaned_test)
posterior = get_posterior(terms_test,prior,likelihood)
print(posterior)

[{0: 1.0, 1: 4.212294095348821e-46}]
