In [25]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score

In [11]:
def make_Dictionary(root_dir, max_words=3000):
    """Build a word-frequency vocabulary from every file in ``root_dir``.

    Each file is read line by line and split on whitespace. A token is kept
    only if it is purely alphabetic (``str.isalpha``) and longer than one
    character; everything else is discarded.

    Parameters
    ----------
    root_dir : str
        Directory whose entries are all opened and read as text files.
    max_words : int, optional
        Maximum number of vocabulary entries to return (default 3000).

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs for the ``max_words`` most frequent kept
        tokens, most frequent first (the result of ``Counter.most_common``).
    """
    word_counts = Counter()
    for name in os.listdir(root_dir):
        with open(os.path.join(root_dir, name)) as mail:
            for line in mail:
                # Filter while counting instead of materialising every token
                # of the corpus in one list and deleting unwanted keys later.
                word_counts.update(
                    word for word in line.split()
                    if word.isalpha() and len(word) > 1
                )
    return word_counts.most_common(max_words)

In [12]:
def extract_features(mail_dir, vocabulary=None, num_features=3000):
    """Build a word-count feature matrix and spam labels for the files in ``mail_dir``.

    Only the third line of each file (line index 2) is tokenised; all other
    lines are ignored. For every vocabulary word found on that line, the
    matching column holds the number of times the word occurs on the line.
    A file is labelled 1 when its file name starts with ``"spmsg"`` and 0
    otherwise.

    Parameters
    ----------
    mail_dir : str
        Directory whose entries are all opened and read as text files.
    vocabulary : sequence of (word, count) pairs, optional
        Vocabulary to use; the position of a pair is its column index. When
        omitted, the notebook-level ``dictionary`` is used, as before. Words
        are assumed to be unique (as produced by ``Counter.most_common``).
    num_features : int, optional
        Number of columns in the feature matrix (default 3000). Vocabulary
        entries at positions ``>= num_features`` are ignored.

    Returns
    -------
    features_matrix : numpy.ndarray, shape (n_files, num_features)
    train_labels : numpy.ndarray, shape (n_files,)
        Rows follow the order returned by ``os.listdir(mail_dir)``.
    """
    # Fall back to the module-level ``dictionary`` so existing calls keep working.
    vocab = dictionary if vocabulary is None else vocabulary

    # word -> column index, built once instead of scanning the whole
    # vocabulary for every token of every mail.
    word_to_column = {}
    for column, entry in enumerate(vocab):
        if column >= num_features:
            break
        word_to_column.setdefault(entry[0], column)

    files = [os.path.join(mail_dir, name) for name in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    break  # nothing after the third line is used
        # basename handles whatever separator os.path.join used; splitting on
        # '/' mislabels every mail where the separator is different.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1
    return features_matrix, train_labels

In [13]:
# Directories holding the training and test e-mails; relative paths, so they
# are resolved against the notebook's working directory.
TRAIN_DIR = "train-mails"
TEST_DIR = "test-mails"

In [14]:
# Build the vocabulary from the training mails only. extract_features reads
# this module-level name, so this cell must run before any feature extraction.
dictionary = make_Dictionary(TRAIN_DIR)

In [16]:
print("reading and processing emails from file.")
# Both matrices are built against the same training-derived `dictionary`, so
# their columns refer to the same words.
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)

reading and processing emails from file.


# Naive Bayes Classifier

In [20]:
# Gaussian naive Bayes with the library's default settings.
model = GaussianNB()

In [21]:
print("Training model.")
# Fit on the training features. As the cell's last expression, the fitted
# estimator returned by fit() is also displayed as the cell output.
model.fit(features_matrix, labels)

Training model.


GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
# Classify the held-out test mails with the fitted model.
predicted_labels = model.predict(test_feature_matrix)

In [24]:
print("FINISHED classifying. accuracy score : ")
# Fraction of test mails whose predicted label matches the true label.
print(accuracy_score(test_labels, predicted_labels))

FINISHED classifying. accuracy score : 
0.9653846153846154


# Support Vector Classifier

In [38]:
# Support vector classifier with library defaults; the recorded repr further
# down shows kernel='rbf', C=1.0, gamma='auto_deprecated' for this run.
# Rebinds `model`, replacing the naive Bayes estimator.
model = svm.SVC()

In [39]:
print("Training model.")
# fit() returns the estimator itself, so training and prediction are chained.
predicted_labels = model.fit(features_matrix, labels).predict(test_feature_matrix)
print("FINISHED classifying. accuracy score : ")
# Share of test mails classified correctly.
print(accuracy_score(test_labels, predicted_labels))

Training model.


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

FINISHED classifying. accuracy score : 
0.8153846153846154


In [40]:
# Same RBF kernel, but with a much larger penalty parameter C than the
# default 1.0 used in the previous run.
model = svm.SVC(kernel="rbf", C=10000)

In [41]:
print("Training model.")
# fit() returns the estimator itself, so training and prediction are chained.
predicted_labels = model.fit(features_matrix, labels).predict(test_feature_matrix)
print("FINISHED classifying. accuracy score : ")
# Share of test mails classified correctly.
print(accuracy_score(test_labels, predicted_labels))

Training model.


SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

FINISHED classifying. accuracy score : 
0.9538461538461539


In [None]:
# RBF kernel with C=100 and an explicit gamma.
# NOTE(review): this cell has no execution count, and the recorded output of
# the following cell still shows C=10000 with gamma='auto_deprecated', so that
# output was produced by the previous model. Re-run both cells to refresh it.
model = svm.SVC(kernel="rbf", C=100, gamma=0.001)

In [42]:
print("Training model.")
# fit() returns the estimator itself, so training and prediction are chained.
predicted_labels = model.fit(features_matrix, labels).predict(test_feature_matrix)
print("FINISHED classifying. accuracy score : ")
# Share of test mails classified correctly.
print(accuracy_score(test_labels, predicted_labels))

Training model.


SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

FINISHED classifying. accuracy score : 
0.9538461538461539
