In [1]:
import numpy as np
import pandas as pd
import random
from string import punctuation
from collections import OrderedDict

In [2]:
def load(path):
    df = None
    df=pd.read_csv("TRAIN_balanced_ham_spam.csv")
    return df

In [3]:
def prior(df):
    ham_prior = 0
    spam_prior =  0
    ham_prior=df["label"].value_counts()[0]/df.shape[0]
    spam_prior=df["label"].value_counts()[1]/df.shape[0]
    
    return ham_prior, spam_prior

In [4]:
def likelihood(df):
    ham_like_dict = {}
    spam_like_dict = {}
    for i in range(df["label"].value_counts()[0]):
        content=df.iloc[i,3].split()
        #make sure each word only be counted one time per email
        email=list(OrderedDict.fromkeys(content))  
        for word in email:
            if word.lower() not in ham_like_dict and word not in punctuation:
                ham_like_dict[word] = 1
            elif word.lower() in ham_like_dict:
                ham_like_dict[word] = ham_like_dict.get(word) + 1
                
    # get item in dictionary as likelihood probability for ham dictiionary
    for word in ham_like_dict:  
        ham_like_dict[word]=ham_like_dict.get(word)/df["label"].value_counts()[0]
    
    for j in range(df["label"].value_counts()[0],df.shape[0]):
        content=df.iloc[j,3].split()
        email=list(OrderedDict.fromkeys(content))
        for word in email:
            if word.lower() not in spam_like_dict and word not in punctuation:
                spam_like_dict[word] = 1
            elif word.lower() in spam_like_dict:
                spam_like_dict[word] = spam_like_dict.get(word) + 1    
    
    for word in spam_like_dict:
        spam_like_dict[word]=spam_like_dict.get(word)/df["label"].value_counts()[0]
    
          
        
                
    return ham_like_dict, spam_like_dict
            

In [5]:
def predict(ham_prior, spam_prior, ham_like_dict, spam_like_dict, text):
    '''
    prediction function that uses prior and likelihood structure to compute proportional posterior for a single line of text
    '''
    text=text.split() # split word string in to a list of seperate words
    ham_likelihood=0
    f_list=["for","to","the","on","a","and","you","is","this","of","i","with","this"] #word list that words are too frequently appear
    #caculate the log likelihood of given class=ham
    for word in text:
        if word in ham_like_dict and word not in f_list:
            ham_likelihood =ham_likelihood+ np.log(ham_like_dict.get(word))
        else:
            ham_likelihood =ham_likelihood+ np.log(0.0001)     #mupltiply by a very samll probabiliyt for ward not appearing 
    #caculate the log likelihood of given class=spam
    spam_likelihood=0
 
    for word in text:
        if word in spam_like_dict and word not in f_list:
            spam_likelihood =spam_likelihood+ np.log(spam_like_dict.get(word))
        else:
            spam_likelihood =spam_likelihood+ np.log(0.0001)
    #ham_spam_decision = 1 if classified as spam, 0 if classified as normal/ham
    ham_spam_decision = None
    
    
    #ham_posterior = posterior probability that the email is normal/ham

    ham_posterior = None
    #spam_posterior = posterior probability that the email is spam

    spam_posterior = None
    ham_posterior = ham_likelihood+np.log(ham_prior)
    spam_posterior = spam_likelihood+np.log(spam_prior)
    if ham_posterior>= spam_posterior:
        ham_spam_decision=0
    else:
        ham_spam_decision=1
    
    return ham_spam_decision

In [6]:
def metrics(ham_prior, spam_prior, ham_dict, spam_dict, df):
    '''
    Calls "predict"
    '''
    hh = 0 #true negatives, truth = ham, predicted = ham
    hs = 0 #false positives, truth = ham, pred = spam
    sh = 0 #false negatives, truth = spam, pred = ham
    ss = 0 #true positives, truth = spam, pred = spam
    num_rows = df.shape[0]
    for i in range(num_rows):
        roi = df.iloc[i,:]
        roi_text = roi.text
        roi_label = roi.label_num
        guess = predict(ham_prior, spam_prior, ham_dict, spam_dict, roi_text)
        if roi_label == 0 and guess == 0:
            hh += 1
        elif roi_label == 0 and guess == 1:
            hs += 1
        elif roi_label == 1 and guess == 0:
            sh += 1
        elif roi_label == 1 and guess == 1:
            ss += 1
    
    acc = (ss + hh)/(ss+hh+sh+hs)
    precision = (ss)/(ss + hs)
    recall = (ss)/(ss + sh)
    return acc, precision, recall

In [7]:
if __name__ == "__main__":
    df=load(1)
    ham_prior, spam_prior=prior(df)
    ham_dict, spam_dict=likelihood(df)
    test_df=pd.read_csv("TEST_balanced_ham_spam.csv")
    print(metrics(ham_prior, spam_prior, ham_dict, spam_dict,test_df))
	#this cell is for your own testing of the functions above


(0.9783333333333334, 0.9644012944983819, 0.9933333333333333)


In [8]:
#sorted(a.items(), key=lambda x: x[1], reverse=True) a is a dictionary and I arrange the dictionary in descending order
#sorted(b.items(), key=lambda x: x[1], reverse=True)