In [1]:
import sys
import os.path
import numpy as np
from collections import Counter
import util

USAGE = "%s <test data folder> <spam folder> <ham folder>"

def get_counts(file_list):
    """
    Computes counts for each word that occurs in the files in file_list.

    Inputs
    ------
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the number of files the
    key occurred in.
    """
    ### TODO: Comment out the following line and write your code here
    counter=Counter()
    total_text=[]
    a=0
    for text in util.get_files_in_folder(file_list): 
        try:  
            counter.update(set(util.get_words_in_file(text)))
        except:
                continue
    print("dict finish: ",file_list)
  
            
    return counter
print("The number of text in spam data: ",get_counts("spam").most_common()[0][1])
print("The number of text in ham data: ",get_counts("ham").most_common()[0][1])

dict finish:  spam
The number of text in spam data:  3369
dict finish:  ham
The number of text in ham data:  1500


In [2]:
#spam  >3369

In [3]:
#ham   >1500

In [4]:
import numpy as np
def get_probabilities(file_list):
    """
    Computes log-frequencies for each word that occurs in the files in 
    file_list.

    Input
    -----
    file_list : a list of filenames, suitable for use with open() or 
                util.get_words_in_file()

    Output
    ------
    A dict whose keys are words, and whose values are the log of the smoothed
    estimate of the fraction of files the key occurred in.

    Hint
    ----
    The data structure util.DefaultDict will be useful to you here, as will the
    get_counts() helper above.
    """
    ### TODO: Comment out the following line and write your code here
    new_dict=get_counts(file_list)
    total_count= new_dict.most_common()[0][1]
    for voc in new_dict:
        new_dict[voc]=((new_dict[voc]+1)/(total_count+2))
    return new_dict 

In [5]:
def learn_distributions(file_list):
    """
    Input
    -----
    A two-element list. The first element is a list of spam files, 
    and the second element is a list of ham (non-spam) files.

    Output
    ------
    (log_probabilities_by_category, log_prior)

    log_probabilities_by_category : A list whose first element is a smoothed
                                    estimate for log P(y=w_j|c=spam) (as a dict,
                                    just as in get_log_probabilities above), and
                                    whose second element is the same for c=ham.

    log_prior_by_category : A list of estimates for the log-probabilities for
                            each class:
                            [est. for log P(c=spam), est. for log P(c=ham)]
    """
    ### TODO: Comment out the following line and write your code here
    probabilities_by_category=[]
    probabilities_by_category.append(get_probabilities(file_list[0]))
    probabilities_by_category.append(get_probabilities(file_list[1]))
    prior_by_category=[]
    spam=3369
    ham=1500
    prior_by_category.append(spam/(spam+ham))
    prior_by_category.append(ham/(spam+ham))
    return (probabilities_by_category,prior_by_category)

In [6]:
def classify_email(email_filename,
                   probabilities_by_category,
                   prior_by_category):
    """
    Uses Naive Bayes classification to classify the email in the given file.

    Inputs
    ------
    email_filename : name of the file containing the email to be classified

    log_probabilities_by_category : See output of learn_distributions

    log_prior_by_category : See output of learn_distributions

    Output
    ------
    One of the labels in names.
    
    """
    spam_prior=prior_by_category[0]
    ham_prior= prior_by_category[1]
    file =util.get_words_in_file(email_filename)
    spam=1
    ham=1
    for voc in file:
        if voc in probabilities_by_category[0].keys():
            spam*= probabilities_by_category[0][voc]
        else:
            spam*= (1/(3369+2))
    for voc in file:
        if voc in probabilities_by_category[1].keys():
            ham*= probabilities_by_category[1][voc]
        else:
            ham*= (1/(1500+2))   
    prob_ham= (ham_prior*ham)
    prob_spam= (ham_prior*spam)
    ### TODO: Comment out the following line and write your code here
    if prob_spam>prob_ham:
        return 'spam'
    else:
        return 'ham'

In [7]:
import re
from sklearn.metrics import confusion_matrix
def main():

    ### Learn the distributions
    file_lists = ["spam","ham"]
    (probabilities_by_category, priors_by_category)=learn_distributions(file_lists)
    ### Classify and measure performance
    a=0
    c=0
    y_true=[]
    y_pred=[]
    for filename in util.get_files_in_folder("testing"):
        ## Classify
        a+=1
        label = classify_email(filename,
                               probabilities_by_category,
                               priors_by_category)
        y_true.append(label)
        y_pred.append(re.findall(r'spam|ham',filename))
        
        if label in filename:
            c+=1
        # Uncomment this line to see which files your classifier
        # gets right/wrong:
        #print("%s : %s" %(label, filename))
    print("There are %d emails in test set."%(a))
    print("correct rate = ",c/a)
    print(confusion_matrix(y_true, y_pred, labels=["spam","ham"]))

In [8]:
if __name__ == '__main__':
    main()

dict finish:  spam
dict finish:  ham
There are 100 emails in test set.
correct rate =  0.82
[[33  2]
 [16 49]]
