# In [338]:
import numpy as np
import pandas
import math

def loadData(filename):
    """Load a CSV file and return a binarised, random 80/20 train/test split.

    The file is read with ``pandas.read_csv`` using its default header
    handling, so the first line of the file supplies the column labels.
    NOTE(review): if the file has no header row, its first record is consumed
    as column labels -- confirm this is intended.

    Every cell is mapped to ``1.0`` when it is strictly greater than zero and
    to ``0.0`` otherwise (missing values therefore become ``0.0``).

    Args:
        filename: Path (or any other source ``pandas.read_csv`` accepts).

    Returns:
        A two-element list ``[train, test]`` of float DataFrames. ``train`` is
        an unseeded random sample of 80% of the rows; ``test`` holds the
        remaining rows in shuffled order. Both keep the original row index.
    """
    # One vectorised comparison replaces the per-cell ``applymap`` lambda
    # (deprecated in recent pandas). Binarising before the split is safe
    # because the split only depends on the row index, not on cell values.
    data = (pandas.read_csv(filename) > 0).astype(float)

    train = data.sample(frac=0.8)
    test = data.drop(train.index)
    # Shuffle the held-out rows so they are not left in file order.
    test = test.reindex(np.random.permutation(test.index))
    return [train, test]

def populateWordCountVectors(train):
    """Compute per-class column rates and class priors from training rows.

    Rows are split on the column labelled ``'1'``: value ``1`` is treated as
    spam and value ``0`` as ham. For each class the column sums are divided
    by the number of rows in that class.

    Args:
        train: DataFrame containing a column labelled ``'1'``.

    Returns:
        A four-element list ``[spam_rates, ham_rates, spam_prior, ham_prior]``
        where the first two are Series indexed by column label (column sum
        divided by class row count) and the last two are the fractions of
        spam and ham rows among all spam+ham rows.
    """
    labels = train['1']
    spam_rows = train[labels == 1]
    ham_rows = train[labels == 0]

    n_spam = len(spam_rows)
    n_ham = len(ham_rows)
    n_labelled = n_spam + n_ham

    spam_rates = spam_rows.sum() / n_spam
    ham_rates = ham_rows.sum() / n_ham
    # float() keeps the priors fractional under Python 2 integer division.
    spam_prior = float(n_spam) / n_labelled
    ham_prior = float(n_ham) / n_labelled
    return [spam_rates, ham_rates, spam_prior, ham_prior]

def populateSpamliness(W_S, W_H, spam_prob, ham_prob):
    """Return, per feature, the Bayes posterior of spam given the feature.

    For every position ``i`` this evaluates

        W_S[i] * spam_prob / (W_S[i] * spam_prob + W_H[i] * ham_prob)

    The inputs are read positionally, so labelled pandas Series, NumPy arrays
    and plain sequences all work, and the number of features is taken from
    the inputs instead of being fixed.

    Args:
        W_S: Per-feature rates for the spam class.
        W_H: Per-feature rates for the ham class (same length as ``W_S``).
        spam_prob: Prior probability of spam.
        ham_prob: Prior probability of ham.

    Returns:
        A float ``numpy.ndarray`` with one entry per feature. A feature whose
        numerator and denominator are both zero yields ``nan``.

    Raises:
        ValueError: If ``W_S`` and ``W_H`` do not have the same shape.
    """
    # np.asarray drops any pandas labels, so indexing is purely positional
    # and does not depend on integer-key fallback behaviour of Series.
    spam_rates = np.asarray(W_S, dtype=float)
    ham_rates = np.asarray(W_H, dtype=float)
    if spam_rates.shape != ham_rates.shape:
        raise ValueError(
            "W_S and W_H must have the same shape, got %s and %s"
            % (spam_rates.shape, ham_rates.shape)
        )

    # Bayes' theorem, evaluated for all features at once.
    spam_term = spam_rates * spam_prob
    ham_term = ham_rates * ham_prob
    return spam_term / (spam_term + ham_term)

def populateModel(train):
    """Build the per-feature model from training rows.

    Feeds the four statistics produced by ``populateWordCountVectors(train)``
    straight into ``populateSpamliness`` and returns its result.

    Args:
        train: Training DataFrame, passed unchanged to
            ``populateWordCountVectors``.

    Returns:
        Whatever ``populateSpamliness`` returns for those statistics.
    """
    return populateSpamliness(*populateWordCountVectors(train))

def calculatePrecision(test, S_W, threshold=None):
    """Return the fraction of rows in ``test`` that ``predict`` labels correctly.

    Despite the name, the value computed is overall accuracy (matching
    predictions divided by the number of rows), not precision in the
    true-positive sense.

    Args:
        test: DataFrame whose column ``'1'`` holds the true label; a value of
            ``1.0`` counts as the positive class. ``test`` must contain at
            least one row because the result is divided by the row count.
        S_W: Per-feature model values; forwarded to ``predict`` so the score
            reflects the model that was actually passed in.
        threshold: Optional decision threshold forwarded to ``predict``. When
            ``None``, ``predict`` falls back to the module-level ``THRES``.

    Returns:
        The share of correctly labelled rows as a float.
    """
    predictions = test.apply(lambda row: predict(row, S_W, threshold), axis=1)
    actual_values = test['1'] == 1.0
    correct = int((predictions == actual_values).sum())
    return float(correct) / test.shape[0]


def predict(vector, model=None, threshold=None):
    """Classify one row; ``True`` means the row is predicted positive (spam).

    The per-feature values ``p`` of every feature that is present
    (``vector.iloc[i] == 1``) are combined as

        prob = 1 / (1 + exp(sum(log(1 - p) - log(p))))

    and the row is labelled positive when ``prob`` exceeds the threshold.

    Args:
        vector: pandas Series for a single row; only its first 48 positions
            are inspected.
        model: Per-feature values indexed by position. Defaults to the
            module-level ``S_W`` so existing ``predict(row)`` calls still work.
        threshold: Decision threshold. Defaults to the module-level ``THRES``.

    Returns:
        ``True`` if the combined probability is strictly greater than the
        threshold, otherwise ``False``.
    """
    if model is None:
        model = S_W
    if threshold is None:
        threshold = THRES

    # Values of exactly 0 or 1 would make math.log raise ValueError. Clipping
    # leaves every value inside [eps, 1 - eps] untouched and keeps nan as nan.
    eps = 1e-12
    probs = np.clip(np.asarray(model, dtype=float), eps, 1.0 - eps)

    # Only the first 48 positions are used -- presumably the word-frequency
    # columns of the data set; TODO confirm against the data description.
    log_odds_against = 0.0
    for i in range(48):
        if vector.iloc[i] == 1:
            log_odds_against += math.log(1 - probs[i]) - math.log(probs[i])

    try:
        prob = 1.0 / (1 + np.e ** log_odds_against)
    except OverflowError:
        # The exponent was too large for a float: evidence against the
        # positive class is overwhelming, so the probability is zero.
        prob = 0.0
    return prob > threshold

# Decision threshold read as a global by predict(): a row is labelled
# positive only when its combined probability exceeds this value.
THRES = 0.85
train, test = loadData('spambase/spambase.data')
# S_W is kept at module level because predict() reads it as a global.
S_W = populateModel(train)
precision = calculatePrecision(test, S_W)
# The parenthesised form is valid on both Python 2 and Python 3.
print(precision)
