In [1]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from pycm import *


In [2]:
def readMatrix(file):
    """Load a sparse document/token count file into a dense matrix.

    File layout, as consumed by the code below:
      * line 1: header line, read and discarded;
      * line 2: two whitespace-separated integers ``rows cols``;
      * line 3: whitespace-separated tokens;
      * every following non-blank line: integers
        ``label d1 v1 d2 v2 ...`` where each ``d`` is a column offset
        relative to the previous column (the running sum of the offsets is
        the absolute column index) and each ``v`` is the value stored there.
        A trailing unpaired integer after the last ``d v`` pair is ignored.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    matrix : numpy.ndarray, shape (rows, cols)
        Dense float matrix, zero wherever the file gives no value.
    tokens : list of str
        Tokens from line 3, in file order.
    Y : numpy.ndarray
        The leading integer (label) of every data line, in file order.

    Raises
    ------
    ValueError
        If a field that must be an integer cannot be parsed.
    IndexError
        If the file holds more data lines than ``rows`` or a column index
        falls outside ``cols``.
    """
    # `with` guarantees the handle is closed even if parsing fails part-way.
    with open(file, 'r') as fd:
        fd.readline()  # header line is not used
        rows, cols = [int(s) for s in fd.readline().strip().split()]
        tokens = fd.readline().strip().split()
        matrix = np.zeros((rows, cols))
        Y = []
        for line in fd:
            nums = [int(x) for x in line.strip().split()]
            if not nums:
                # Blank line (e.g. trailing newline at end of file): no data.
                continue
            i = len(Y)  # row index = number of data lines seen so far
            Y.append(nums[0])
            # Force an integer dtype: a label-only line yields an empty list,
            # which would otherwise become a float64 array and be rejected
            # as an index.
            kv = np.array(nums[1:], dtype=int)
            k = np.cumsum(kv[:-1:2])  # offsets -> absolute column indices
            v = kv[1::2]
            matrix[i, k] = v
    return matrix, tokens, np.array(Y)



In [3]:
def nb_train(matrix, category):
    """Fit a multinomial Naive Bayes model with add-one smoothing.

    Parameters
    ----------
    matrix : numpy.ndarray, shape (n_docs, n_tokens)
        Per-document token counts.
    category : array-like, length n_docs
        Document labels; rows labelled 1 are treated as spam and rows
        labelled 0 as ham. Rows with any other label are ignored.

    Returns
    -------
    dict
        ``'phi_spam'`` / ``'phi_ham'``: length-``n_tokens`` arrays with the
        smoothed per-token probabilities for each class;
        ``'phi'``: fraction of the spam+ham documents that are spam.

    Raises
    ------
    ZeroDivisionError
        If no row is labelled 0 or 1.
    """
    # Accept plain lists/tuples as labels: comparing a list with `== 1`
    # yields a single bool instead of a per-row mask.
    category = np.asarray(category)
    N = matrix.shape[1]

    spam = matrix[category == 1, :]
    ham = matrix[category == 0, :]

    # Total number of token occurrences in each class.
    spam_lengths = spam.sum(axis=1)
    ham_lengths = ham.sum(axis=1)

    state = {}
    # Add-one smoothing: +1 per token in the numerator, +N in the denominator,
    # so tokens unseen in a class never get probability zero.
    state['phi_spam'] = (spam.sum(axis=0) + 1) / (np.sum(spam_lengths) + N)
    state['phi_ham'] = (ham.sum(axis=0) + 1) / (np.sum(ham_lengths) + N)
    # float() keeps this a true division even without
    # `from __future__ import division` on Python 2.
    state['phi'] = float(spam.shape[0]) / (spam.shape[0] + ham.shape[0])

    return state




In [4]:
def nb_test(matrix, state):
    """Classify documents with a model produced by ``nb_train``.

    Parameters
    ----------
    matrix : numpy.ndarray, shape (n_docs, n_tokens)
        Per-document token counts.
    state : dict
        Must provide ``'phi_spam'`` and ``'phi_ham'`` (per-token class
        probabilities, length ``n_tokens``) and ``'phi'`` (spam prior).

    Returns
    -------
    numpy.ndarray, shape (n_docs,)
        Float array holding 1 where the spam posterior exceeds 0.5 and
        0 otherwise.
    """
    # Per-document log-likelihood under each class.
    log_phi_spam = np.sum(np.log(state['phi_spam']) * matrix, axis=1)
    log_phi_ham = np.sum(np.log(state['phi_ham']) * matrix, axis=1)
    phi = state['phi']

    # log of  P(x | ham) P(ham) / (P(x | spam) P(spam)).
    log_ratio = log_phi_ham + np.log(1 - phi) - log_phi_spam - np.log(phi)

    # P(spam | x) = 1 / (1 + ratio) > 0.5  <=>  ratio < 1  <=>  log_ratio < 0.
    # Deciding in log-space avoids exponentiating log_ratio, which overflows
    # or underflows for long documents.
    output = np.zeros(matrix.shape[0])
    output[log_ratio < 0] = 1

    return output



In [6]:
# Load the training and test sets from the current working directory.
# `tokenlist` is assigned by both calls, so after this cell it holds the
# token list read from 'MATRIX.TEST'.
trainMatrix, tokenlist, trainCategory = readMatrix('MATRIX.TRAIN.800')
testMatrix, tokenlist, testCategory = readMatrix('MATRIX.TEST')
# Fit the Naive Bayes parameters on the training set, then predict a 0/1
# label for every row of the test set.
# NOTE(review): the saved output under this cell (confusion matrix, accuracy,
# precision, recall, error) is not printed by any statement here; presumably
# the reporting code was removed after the cell was run - confirm.
state = nb_train(trainMatrix, trainCategory)
output = nb_test(testMatrix, state)


Confusion Matrix:
Predict          0    1    
Actual
0                397  3    
1                11   389  

None
Accuracy:  0.9825
Precision:  0.9923469387755102
Recall:  0.9725
Error: 0.0175




In [7]:
from sklearn.metrics import confusion_matrix
# Arguments are (true labels, predicted labels). Per the scikit-learn
# documentation, rows of the result index the true class and columns the
# predicted class.
confusion_matrix(testCategory, output)

array([[397,   3],
       [ 11, 389]])

In [8]:
from sklearn.metrics import recall_score
# Recall of the predicted labels against the labels read from the test file.
# The printed label names the metric actually computed (it previously said
# "Average precision-recall score").
recall_=recall_score(testCategory, output)
print('Recall score: {0:0.2f}'.format(
      recall_))

Average precision-recall score: 0.97


In [11]:
from sklearn.metrics import average_precision_score
# NOTE(review): `output` holds the thresholded 0/1 predictions returned by
# nb_test, not continuous scores. average_precision_score is documented to
# take scores/probabilities as its second argument, so this value summarises
# a single operating point rather than a full precision-recall curve -
# confirm this is intended.
average_precision = average_precision_score(testCategory,output)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

Average precision-recall score: 0.98
