# Naive Bayes From Scratch

In [1]:
import numpy as np
import pandas as pd

In [2]:
emails = pd.read_csv('../datasets/emails.csv')
emails.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
def splitter(text):
    return list(set(text.split()))

In [4]:
emails['words'] = emails['text'].apply(splitter)
emails.head()

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[drafts, website, provide, information, conven..."
1,Subject: the stock trading gunslinger fanny i...,1,"[einsteinian, superior, penultimate, kansas, w..."
2,Subject: unbelievable new homes made easy im ...,1,"[homes, 3, extended, website, advantage, oppor..."
3,Subject: 4 color printing special request add...,1,"[pdf, additional, azusa, color, canyon, specia..."
4,"Subject: do not have money , get software cds ...",1,"[death, software, old, me, along, to, ., Subje..."


In [5]:
sum(emails['spam'])/len(emails)

0.2388268156424581

In [6]:
model = {}

for index , email in emails.iterrows():
    for word in email['words']:
        if word not in model:
            model[word] = {
                'spam' : 1 ,
                'ham' : 1
            }
        else:
            if email['spam']:
                model[word]['spam'] += 1
            else:
                model[word]['ham'] += 1

In [7]:
model['sale']

{'spam': 38, 'ham': 42}

In [14]:
def naive_bayes_predictor(email):
    words_in_email = list(set(email.split()))
    num_of_emails = len(emails)
    num_of_spam = sum(emails['spam'])
    num_of_ham = num_of_emails - num_of_spam
    spam = [1.0] 
    ham = [1.0]
    for word in words_in_email:
        if word in model:
            spam.append(model[word]['spam']/num_of_spam)
            ham.append(model[word]['ham']/num_of_ham)
    prod_spam = np.prod(spam) * (num_of_spam/num_of_emails)
    prod_ham = np.prod(ham) * (num_of_ham/num_of_emails)
    return prod_spam / (prod_spam + prod_ham)

In [11]:
naive_bayes_predictor('enjoy the lottery')

np.float64(0.8950828512404887)

In [12]:
naive_bayes_predictor('buy cheap lottery easy money now')

np.float64(0.9999688057220799)

In [15]:
naive_bayes_predictor('adshfhb')

np.float64(0.2388268156424581)

In [19]:
naive_bayes_predictor("You have to come to school tomorrow ")

np.float64(0.0032747762487985435)

In [20]:
naive_bayes_predictor('meet me at the lobby of the hotel at nine am')

np.float64(5.4197835174979816e-05)