In [1]:
import os
import numpy as np

# Load file names from EmailsData folder
def get_dataset() :
    spam_train_files = sorted(os.listdir("EmailsData/spam-train/"))
    nonspam_train_files = sorted(os.listdir("EmailsData/nonspam-train/"))
    spam_test_files = sorted(os.listdir("EmailsData/spam-test/"))
    nonspam_test_files = sorted(os.listdir("EmailsData/nonspam-test/"))
    
    for i in xrange(len(spam_train_files)):
        spam_train_files[i] = os.path.join("EmailsData/spam-train/",spam_train_files[i])

    for i in xrange(len(spam_test_files)):
        spam_test_files[i] = os.path.join("EmailsData/spam-test/",spam_test_files[i])

    for i in xrange(len(nonspam_train_files)):
        nonspam_train_files[i] = os.path.join("EmailsData/nonspam-train/",nonspam_train_files[i])    

    for i in xrange(len(nonspam_test_files)):
        nonspam_test_files[i] = os.path.join("EmailsData/nonspam-test/",nonspam_test_files[i])
        
    return spam_train_files, nonspam_train_files, spam_test_files, nonspam_test_files

In [2]:
# Calculate accuracy
def get_accuracy(predicted_test_y, test_y) :
    count = 0
    for i in xrange(len(test_y)) :
        if predicted_test_y[i] == test_y[i] :
            count += 1
    return count, float(count)/float(len(test_y))

# Calculate F1 score
def get_F1_score(cf_matrix):
    return float(2*cf_matrix[1][1])/(2*cf_matrix[1][1] + cf_matrix[0][1] + cf_matrix[1][0])

# Calculate confusion matrix
def get_confusion_matrix(predicted, act):
    cf_matrix = [[0, 0], [0, 0]]
    for i in xrange(len(predicted)):
        cf_matrix[int(act[i])][int(predicted[i])] += 1
    return cf_matrix

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Initialize TldifVectorizer and SelectKBest
vectorizer = TfidfVectorizer(input='filename')
features_selector = SelectKBest(k=50, score_func=mutual_info_classif)

# Calcuate probability density thet this x with be generated by Gaussian with given mean  and variance
# Log of pdf is calculated and constants are ignored as it will be common to all
def normpdf(x, mean, var):
    return -(float(x)-float(mean))**2/(2*var) - 0.5*np.log(var)

# Calculate mean and variance for all features of spam and non spam
def calc_mean_variance(feat_mat) :
    feat_mat_arr = np.vsplit(feat_mat, 2)
    feat_mat_spam = feat_mat_arr[0]
    feat_mat_nonspam = feat_mat_arr[1]
    
    # Add 10e-12 to avoid variance to be zero
    mean_spam = np.mean(feat_mat_spam, axis=0)
    var_spam = np.var(feat_mat_spam, axis=0)
    var_spam += 10e-12
    
    mean_nonspam = np.mean(feat_mat_nonspam, axis=0)
    var_nonspam = np.var(feat_mat_nonspam, axis=0)
    var_nonspam += 10e-12
    
    # Return means and varinces of spam and nonspam
    return [mean_spam, var_spam, mean_nonspam, var_nonspam]

# Train the dataset by calculate means and variances
def train_nb(train_files, train_y) :
    tfidf_train = vectorizer.fit_transform(train_files)
    features_train = features_selector.fit_transform(tfidf_train.toarray(), train_y)
    return features_train, calc_mean_variance(features_train)

# Test on test files with calculated means varinaces
def test_nb(test_files, mean_var) :
    mean_spam = mean_var[0]
    var_spam = mean_var[1]
    mean_nonspam = mean_var[2]
    var_nonspam = mean_var[3]
    tfidf_test = vectorizer.transform(test_files)
    features_test = features_selector.transform(tfidf_test.toarray())
    predicted_y = []
    
    # For each row calcualte probabailty of spam and nonspam and decide according to whichever probability is higher
    # Log of probabilities is calculated to avoid underflow
    for row in features_test :
        probS = 0.0
        # Calculate that this is spam given all features
        for i in xrange(len(row)):
            probS += normpdf(row[i], mean_spam[i], var_spam[i])
        probNS = 0.0
        # Calculate that this is not  given all features
        for i in xrange(len(row)):
            probNS += normpdf(row[i], mean_nonspam[i], var_nonspam[i])
            
        # Predict based on probabilities
        if probS > probNS:
            predicted_y.append(1)
        elif probS < probNS:
            predicted_y.append(0)
        else:
            predicted_y.append(np.random.randint(0, 2))
    # Return predicted output
    return features_test, predicted_y

In [6]:
# Load dataset
spam_train_files, nonspam_train_files, spam_test_files, nonspam_test_files = get_dataset()

# Sepaerate X and y from train set
train_files_x = spam_train_files + nonspam_train_files
train_y = np.concatenate((np.ones(len(spam_train_files), dtype=np.int), np.zeros(len(nonspam_train_files), dtype=np.int)), 0)

# Train on train set
features_train, mean_var = train_nb(train_files_x, train_y) 

In [7]:
# Seperate X and y from test set
test_files_x = spam_test_files + nonspam_test_files
test_y = np.concatenate((np.ones(len(spam_test_files), dtype=np.int), np.zeros(len(nonspam_test_files), dtype=np.int)), 0)

# Predict on test set
features_test, predicted_y = test_nb(test_files_x, mean_var)

# Calculate accuracy
_, accuarcy = get_accuracy(predicted_y, test_y)

print "Accuracy : ", accuarcy
print "Confusion matrix : "
cf_matrix = get_confusion_matrix(predicted_y, test_y)
print cf_matrix[0][0], " ", cf_matrix[0][1]
print cf_matrix[1][0], " ", cf_matrix[1][1]
f1_score = get_F1_score(cf_matrix)
print "F1 score : ", f1_score

Accuracy :  0.976923076923
Confusion matrix : 
124   6
0   130
F1 score :  0.977443609023


In [8]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(features_train, train_y)
sklearn_test_y = gnb.predict(features_test)

_, accuracy = get_accuracy(sklearn_test_y, test_y)

print "Accuracy : ", accuarcy
print "Confusion matrix : "
cf_matrix = get_confusion_matrix(predicted_y, test_y)
print cf_matrix[0][0], " ", cf_matrix[0][1]
print cf_matrix[1][0], " ", cf_matrix[1][1]
f1_score = get_F1_score(cf_matrix)
print "F1 score : ", f1_score

Accuracy :  0.976923076923
Confusion matrix : 
124   6
0   130
F1 score :  0.977443609023


## Report

* For both my implementation and sklearn implementation results are same and they are :
    * Accuracy :  0.976923076923
    * Confusion matrix : 
        
| | Predicted Not Spam | Predicted Spam |
| --- | --- | --- |
| Actual Not Spam | 124 | 6 |
| Actual Spam| 0 | 130 |
       
    * F1 score :  0.977443609023