In [50]:
import os, random, csv, sys

limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(limit)
        break
    except OverflowError:
        limit = int(limit/2)

In [90]:
class NaiveBayesClassifier():

    def __init__(self, alpha=1):
        ## Used for smoothing when we encounter a word we have not seen before
        self._alpha = alpha
        
        ## Establishing probability variables
        self._probSpam = -1
        self._probNotSpam = -1
        
        ## Establishing dictionaries to keep count of spam and ham word occurences
        self._spamWords = dict()
        self._hamWords = dict()
        
        ## Establishing variables to keep track of spam, ham, and total words
        self._totalSpamWords = 0
        self._totalHamWords = 0
        self._totalWords = 0
        

    ### Helper functions ###

    # Get the dictionary value for word
    def _getWordOccurences(self, word, d):
        if word in d.keys():
            return d[word] + self._alpha
        else:
            return self._alpha
    
    # Generator for getting all words in an email
    def _getEmailWords(self, email):
        for word in email[1].split():
            if word.isalpha():
                yield word.lower()
#             elif 'www.' in word:
#                 yield '_URL_'            
        for word in email[2].split():
            if word.isalpha():
                yield word.lower()
#             elif word.isdigit():
#                 yield '_digit_'
#             elif 'www.' in word:
#                 yield '_URL_'
        return

    # Retrieves the text of an email
    def _getEmail(self, email):
        return email[2]

    # Determines whether or not a file is spam based on the file name
    def _emailIsSpam(self, email):
        return email[3] == '1'

    # Adds a word to the dictionary
    # If the word is not in the dictionary then initialize it with a value of one
    def _addWord(self, word, d):
        if word in d.keys():
            d[word] += 1
        else:
            d[word] = 1
        
        
    ## Trains classifier with the given dataSet 
    def train(self, dataSet):
        totalEmails = len(dataSet)
        spamEmails = 0
        for email in dataSet:
            isSpam = self._emailIsSpam(email)
            if isSpam:
                spamEmails += 1
            for word in self._getEmailWords(email):
                self._totalWords += 1
                if isSpam:
                    self._addWord(word, self._spamWords)
                    self._totalSpamWords += 1
                else:
                    self._addWord(word, self._hamWords)
                    self._totalHamWords += 1
        self._probSpam = spamEmails/totalEmails
        self._probNotSpam = (totalEmails - spamEmails)/totalEmails

                
    ## Gives the probability that an email body is spam given whether or not the email is assumed as spam
    ## This function can give us P(B | A) and P(B | Not A)
    def conditionalEmailProb(self, email, isSpam):
        probability = 1
        for word in self._getEmailWords(email):
            probability *= self.conditionalWordProb(word, isSpam)
        return probability


    ## Gives the probability that a word is spam given whether or not the email is assumed as spam
    ## P(B_word | A_email)
    def conditionalWordProb(self, word, isSpam):
        if isSpam:
            return self._getWordOccurences(word, self._spamWords)/self._totalSpamWords
        else:
            return self._getWordOccurences(word, self._hamWords)/self._totalHamWords
        
        
    ## Main function that classifies emails as spam or not spam
    def classify(self, email):
        isSpam = self.conditionalEmailProb(email, True)
        notSpam = self.conditionalEmailProb(email, False)
        return isSpam > notSpam
    
    def evaluate(self, testingData):
        correctSpam = 0
        correctHam = 0
        spam = 0
        ham = 0
        for email in testingData:
            Ypred = self.classify(email)
            Y = email[3] == '1'
            if Y:
                spam += 1
                if Ypred:
                    correctSpam += 1
            else:
                ham += 1
                if not Ypred:
                    correctHam += 1
        total = spam + ham
        print("Finished Testing on {} emails,".format(total), end=' ')
        print("{} spam and {} ham".format(spam, ham))
        print("\nSpam emails correctly identified: {}/{}".format(correctSpam,spam))
        print("Ham emails correctly identified: {}/{}".format(correctHam,ham))
        print("----------------------------------------")
        print("Correctly identified emails with {}% accuracy!".format((correctHam + correctSpam)/total*100))
        return (correctHam + correctSpam)/total
    
    
    def accuracy(self, testingData):
        correctSpam = 0
        correctHam = 0
        spam = 0
        ham = 0
        for email in testingData:
            Ypred = self.classify(email)
            Y = email[3] == '1'
            if Y:
                spam += 1
                if Ypred:
                    correctSpam += 1
            else:
                ham += 1
                if not Ypred:
                    correctHam += 1
        return (correctHam + correctSpam)/(spam + ham)

In [91]:
from sklearn.model_selection import KFold

data_set = []
with open("C:/Users/sshim/JupyterNotebook/SpamEmailClassifier/train.csv", 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        data_set.append(row)

In [92]:
kf = KFold(5, shuffle=True, random_state=0)

for alpha in [.005, .01, .03, .07, .1]:
    acc = []
    for train_index, test_index in kf.split(data_set):
        trainingData = [data_set[i] for i in train_index]
        testingData = [data_set[i] for i in test_index]
        classifier = NaiveBayesClassifier(alpha) # Establishes our classifier
        classifier.train(trainingData)
        acc.append(classifier.accuracy(testingData))
    print(alpha, np.mean(acc))

0.005 0.8519575349002414
0.01 0.8520772953792835
0.03 0.8519576066560708
0.07 0.8515984687306035
0.1 0.8506402413866097


    alphabet case sensitive: 0.8412979911955597
    alphabet case sensitive, url: 0.8389025663472337
    alphabet not case sensitive: 0.8509998816028818
    alphabet not case sensitive, url: 0.8468077625456099
    alphabet not case sensitive, digit: 0.8389024228355751
    alphabet not case sensitive (also include title): 0.8520772953792835