In [53]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC

from IPython.display import Image
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline  

# Making dictionary from emails

In [46]:
def make_dictionary(train_dir, vocab_size=3000):
    """Build a vocabulary of the most frequent words in a directory of emails.

    Only the third line of every file is read (that is where the message body
    lives in this corpus) and it is tokenised with a plain whitespace split.
    Tokens that are not purely alphabetic, or that are a single character
    long, are discarded before the ranking is made.

    Parameters
    ----------
    train_dir : str
        Directory containing one text file per email.
    vocab_size : int, optional
        Number of most frequent words to keep. Defaults to 3000.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs ordered from most to least frequent, at most
        ``vocab_size`` entries long.

    Side effects
    ------------
    Prints the number of distinct tokens that were discarded.
    """
    # os.listdir returns names in arbitrary order; sorting makes the word
    # insertion order (and therefore tie-breaking at the vocab_size cutoff)
    # reproducible.  Sub-directories such as .ipynb_checkpoints are skipped
    # because they cannot be opened as text files.
    emails = [
        path
        for path in (os.path.join(train_dir, f) for f in sorted(os.listdir(train_dir)))
        if os.path.isfile(path)
    ]

    dictionary = Counter()
    for mail in emails:
        with open(mail) as m:
            for line_no, line in enumerate(m):
                if line_no == 2:  # only the 3rd line is the message in our text files
                    dictionary.update(line.split())
                    break  # nothing after the message line is used

    # Remove every token that is not alphabetic (numbers, punctuation, mixed)
    # and every single-letter token.
    removable_keys = [
        word for word in dictionary if not word.isalpha() or len(word) == 1
    ]

    # Diagnostic: how many distinct tokens are being dropped.
    print(len(removable_keys))

    for word in removable_keys:
        del dictionary[word]

    # Keep only the most common words.
    return dictionary.most_common(vocab_size)

# Feature Extraction

In [47]:
def extract_features(mail_dir, dictionary, n_features=3000):
    """Turn a directory of emails into a word-count feature matrix.

    Only the third line of every file is read (the message body in this
    corpus).  For each vocabulary word found on that line, the cell at
    (email row, word column) is set to the number of times the word occurs
    on the line.

    Parameters
    ----------
    mail_dir : str
        Directory containing one text file per email.
    dictionary : sequence of (str, int)
        Vocabulary as returned by ``make_dictionary``; the position of an
        entry is its column index and only the word (``entry[0]``) is used.
    n_features : int, optional
        Minimum number of columns of the result.  Defaults to 3000.  The
        matrix is widened automatically when ``dictionary`` is longer, so a
        larger vocabulary no longer raises ``IndexError``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_emails, max(n_features, len(dictionary)))``.
        Rows follow the alphabetically sorted file names of ``mail_dir``.
    """
    # Position-based labels only make sense with a deterministic row order;
    # os.listdir gives names in arbitrary order, so sort them.  Sub-directories
    # (e.g. .ipynb_checkpoints) are skipped because they cannot be opened.
    emails = [
        path
        for path in (os.path.join(mail_dir, f) for f in sorted(os.listdir(mail_dir)))
        if os.path.isfile(path)
    ]

    # Map each vocabulary word to its column(s) once, instead of scanning the
    # whole vocabulary for every token.  A list is kept per word so that a
    # vocabulary with repeated words still fills every matching column.
    columns_of = {}
    for col, entry in enumerate(dictionary):
        columns_of.setdefault(entry[0], []).append(col)

    width = max(n_features, len(dictionary))
    features_matrix = np.zeros((len(emails), width))

    for email_id, mail in enumerate(emails):
        with open(mail) as m:
            for line_no, line in enumerate(m):
                if line_no == 2:  # only the 3rd line is the message
                    # Count every token once instead of calling list.count
                    # for each occurrence.
                    for word, count in Counter(line.split()).items():
                        cols = columns_of.get(word)
                        if cols is not None:
                            features_matrix[email_id, cols] = count
                    break  # nothing after the message line is used

    return features_matrix

In [49]:
# Build the vocabulary from the training mails and show its size.
train_dir = 'train-mails'
dictionary = make_dictionary(train_dir)
print(len(dictionary))
# Preview the ten most frequent (word, count) pairs.  Slicing, unlike indexing
# with range(10), does not raise IndexError when the vocabulary has fewer
# than ten entries.
for entry in dictionary[:10]:
    print(entry)

3605
3000
('order', 1414)
('address', 1293)
('report', 1216)
('mail', 1127)
('send', 1079)
('language', 1072)
('email', 1051)
('program', 1001)
('our', 987)
('list', 935)


# Feature extraction and email labeling

In [50]:
# One label per training mail, assigned by position: the first 351 mails are
# class 0 and the remaining 351 are class 1.
# NOTE(review): presumably class 1 is spam and the feature-matrix rows are
# ordered class 0 first — confirm against the dataset's file naming.
train_labels = np.zeros(702)
# Open-ended slice so the final mail (index 701) is labelled too; an explicit
# upper bound of 701 would leave it at 0.
train_labels[351:] = 1
# Word-count matrix for the training mails: one row per file in train_dir,
# one column per vocabulary position.
train_matrix = extract_features(train_dir, dictionary)

In [51]:
# Sanity check: the row count should equal the number of training labels.
print(train_matrix.shape)

(702, 3000)


# Train SVM and Naive Bayes classifiers

In [54]:
# Fit two classifiers on the same word-count features so their test results
# can be compared: model1 is multinomial Naive Bayes, model2 a linear SVM,
# both with default hyper-parameters.
# NOTE(review): LinearSVC is created without random_state (the repr below
# shows random_state=None); its solver reportedly uses a random number
# generator, so results may vary slightly between runs — consider fixing a seed.
model1 = MultinomialNB()
model2 = LinearSVC()
model1.fit(train_matrix,train_labels)
# Last expression of the cell: the fitted estimator it returns is what the
# notebook displays as this cell's output.
model2.fit(train_matrix,train_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

# Feature extraction of test mails

In [55]:
# Featurise the held-out mails with the vocabulary built from the training
# set, so the train and test matrices share the same columns.
test_dir = 'test-mails'
test_matrix = extract_features(test_dir, dictionary)
# Positional labels for the 260 test mails: first half class 0, second half
# class 1 (the slice runs to the end of the array).
test_labels = np.zeros(260)
test_labels[130:] = 1

In [56]:
# Sanity check: the row count should equal the number of test labels, and the
# column count should match the training matrix.
print(test_matrix.shape)

(260, 3000)


# Results

In [57]:
# Predicted class per test mail: result1 from the Naive Bayes model (model1),
# result2 from the linear SVM (model2).
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)

In [62]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [63]:
# Confusion matrix for the Naive Bayes predictions.  In scikit-learn's layout
# rows are the true classes and columns the predicted classes, so the
# off-diagonal cells are the misclassified mails.
print(confusion_matrix(test_labels, result1))

[[129   1]
 [  9 121]]


In [64]:
# Confusion matrix for the linear SVM predictions (rows = true class,
# columns = predicted class in scikit-learn's layout).
print(confusion_matrix(test_labels, result2))

[[126   4]
 [  6 124]]


In [66]:
# Fraction of test mails the Naive Bayes model classified correctly.
print(accuracy_score(test_labels, result1))

0.9615384615384616


In [67]:
# Fraction of test mails the linear SVM classified correctly.
print(accuracy_score(test_labels, result2))

0.9615384615384616
