Load the data

In [1]:
import glob
import os

In [99]:
emails, labels = [],[]

path_spam = './enron1/spam'

for filename in glob.glob(os.path.join(path_spam,'*.txt')):
    with open(filename,'r',encoding='ISO-8859-1') as f:
        text = f.read()
        emails.append(text)
        labels.append(1)

path_ham = './enron1/ham'

for filename in glob.glob(os.path.join(path_ham,'*.txt')):
    with open(filename,'r',encoding='ISO-8859-1') as f:
        text = f.read()
        emails.append(text)
        labels.append(0)

In [22]:
print(len(emails))

5172


Preprocess the data: 
* Remove names
* Remove stopwords
* Remove numbers and punctuation
* Lemmatization

In [23]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
def letters_only(s):
    return s.isalpha()
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

In [24]:
def clean_docs(docs):
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append(' '.join([lemmatizer.lemmatize(word.lower()) for word in doc.split() if letters_only(word) and word not in all_names]))
    return cleaned_docs

In [25]:
clean_mail = clean_docs(emails)

In [26]:
clean_mail[0]

'dobmeos with hgh my energy level ha gone up stukm introducing doctor formulated hgh human growth hormone also called hgh is referred to in medical science a the master hormone it is very plentiful when we are young but near the age of twenty one our body begin to produce le of it by the time we are forty nearly everyone is deficient in hgh and at eighty our production ha normally diminished at least advantage of hgh increased muscle strength loss in body fat increased bone density lower blood pressure quickens wound healing reduces cellulite improved vision wrinkle disappearance increased skin thickness texture increased energy level improved sleep and emotional stability improved memory and mental alertness increased sexual potency resistance to common illness strengthened heart muscle controlled cholesterol controlled mood swing new hair growth and color restore read more at this website unsubscribe'

Creating term frequencies for the cleaned data

In [250]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=2000)

In [251]:
term_docs = cv.fit_transform(clean_mail)

In [252]:
clean_mail[0]

'dobmeos with hgh my energy level ha gone up stukm introducing doctor formulated hgh human growth hormone also called hgh is referred to in medical science a the master hormone it is very plentiful when we are young but near the age of twenty one our body begin to produce le of it by the time we are forty nearly everyone is deficient in hgh and at eighty our production ha normally diminished at least advantage of hgh increased muscle strength loss in body fat increased bone density lower blood pressure quickens wound healing reduces cellulite improved vision wrinkle disappearance increased skin thickness texture increased energy level improved sleep and emotional stability improved memory and mental alertness increased sexual potency resistance to common illness strengthened heart muscle controlled cholesterol controlled mood swing new hair growth and color restore read more at this website unsubscribe'

In [253]:
print(term_docs[0])

  (0, 560)	2
  (0, 1019)	2
  (0, 767)	2
  (0, 739)	1
  (0, 491)	1
  (0, 759)	2
  (0, 226)	1
  (0, 1122)	1
  (0, 1104)	1
  (0, 1993)	1
  (0, 1195)	1
  (0, 45)	1
  (0, 184)	2
  (0, 158)	1
  (0, 1003)	1
  (0, 1816)	1
  (0, 1416)	1
  (0, 1219)	1
  (0, 34)	1
  (0, 877)	5
  (0, 1186)	2
  (0, 1061)	1
  (0, 638)	1
  (0, 1069)	1
  (0, 1393)	1
  (0, 1637)	1
  (0, 331)	1
  (0, 788)	1
  (0, 1765)	1
  (0, 1203)	1
  (0, 319)	1
  (0, 1459)	1
  (0, 1944)	1
  (0, 1881)	1


In [254]:
feature_names = cv.get_feature_names_out()
feature_names[:5]

array(['ability', 'able', 'accept', 'acceptance', 'access'], dtype=object)

P(B|A) = P(A|B)*P(B)/P(A)

* P(B) is called the Prior
* P(B|A) is called the posterior
* P(A|B) is called the likelihood
* P(A) is called the evidence

In [255]:
#Calculating the Prior

def get_prior(labels):
    '''prior basically implies the probability of a mail in 
    training set belonging to ham or spam. This is basic probability'''
    len_ham,len_spam = 0,0
    for l in labels:
        if l==0:
            len_ham += 1
        else:
            len_spam += 1
    prior = {0: float(len_ham/len(labels)), 1: float(len_spam/len(labels))}
    return prior

prior = get_prior(labels)
print(prior)

{0: 0.7099767981438515, 1: 0.2900232018561485}


In [256]:
#Calculating the Likelihood

import numpy as np

def get_label_index(labels):
    label_index = {0:[],1:[]}
    for index,l in enumerate(labels):
        if l==0:
            label_index[0].append(index)
        else:
            label_index[1].append(index)
    return label_index

def get_likelihood(label_index,term_docs, laplace_smoothing=0):
    '''likelihood basically states if mail is spam/ham then what is the probabilty that
    this category will contain a particular word
    '''
    likelihood = {}
    for label,index in label_index.items():
        likelihood[label] = term_docs[index,:].sum(axis=0) + laplace_smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_sum = likelihood[label].sum()
        likelihood[label] = likelihood[label]/float(total_sum)
    return likelihood

label_index = get_label_index(labels)
laplace_smoothing = 1
likelihood = get_likelihood(label_index,term_docs,laplace_smoothing)

In [257]:
#now the final step is to calculate the posterior

def get_posterior(term_docs,prior,likelihood):
    '''we will calculate summation of natural logs of the conditional probabilities
    instead of multiplying them which may cause overflow and then convert the final value'''
    num_docs = term_docs.shape[0]
    posteriors = []
    for i in range(num_docs):
        posterior = {key: np.log(prior_label) for key,prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_docs.getrow(i)
            counts = term_document_vector.data
            indices = term_document_vector.indices
            for count,index in zip(counts,indices):
                posterior[label] += np.log(likelihood_label[index])*count
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = np.exp(posterior[label]-min_log_posterior)
            except:
                posterior[label] = float('inf')
        sum_posterior = sum(posterior.values())
        for label in posterior:
            if posterior[label]==float('inf'):
                posterior[label] = 1.0
            else:
                posterior[label] /= sum_posterior
        posteriors.append(posterior.copy())
    return posteriors

Testing our new model

In [258]:
test = ["""Having problems in bed? We can help! cialis allows men to enjoy a fully normal
        sex lige without having to plant the sexual act.if we let things terrify us, life will not be worth living
        brevity is the soul of lingerie .
        suspicion always haunts the guilty mind ."""]

In [259]:
cleaned_test = clean_docs(test)
terms_test = cv.transform(cleaned_test)
posterior = get_posterior(terms_test,prior,likelihood)
print(posterior)

[{0: 1.336945845936172e-10, 1: 0.9999999998663054}]


Accuracy metrics for the model
* Split the data into test-train sets and evaluate the model 

In [260]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(clean_mail,labels,test_size=0.33,random_state=42)

In [261]:
term_docs_train = cv.fit_transform(x_train)
label_index_train = get_label_index(y_train)
prior = get_prior(label_index)
likelihood = get_likelihood(label_index_train,term_docs_train, laplace_smoothing)

In [262]:
term_docs_test = cv.transform(x_test)
posterior = get_posterior(term_docs_test, prior, likelihood)

  posterior[label] = np.exp(posterior[label]-min_log_posterior)


In [263]:
correct = 0.0
for pred, actual in zip(posterior, y_test):
    if actual == 1:
        if pred[1] >= 0.5:
            correct += 1
    elif pred[0] > 0.5:
        correct += 1
print('The accuracy on {0} testing samples is:{1:.1f}%'.format(len(y_test), correct/len(y_test)*100))

The accuracy on 1707 testing samples is:95.3%
