# Naive Bayes Classifier

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Loading the data 

def load_data():
    df = pd.read_csv('spam.csv', encoding='latin-1')
    df_for_tests = df.head()
    
    idx = np.arange(df.shape[0])
    np.random.shuffle(idx)

    train_set_size = int(df.shape[0] * 0.8)

    train_set = df.loc[idx[:train_set_size]]
    test_set = df.loc[idx[train_set_size:]]
    
    return train_set, test_set, df_for_tests

In [3]:
train_set, test_set, df_for_tests = load_data()
print(df_for_tests)

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Clean the data

def clean_data(message):
    
    message = message.casefold()
    
    myRange_1 = [str(x) for x in range(0, 10)]
    myRange_2 = [chr(x) for x in range (97, 123)]  
    myRange = myRange_1 + myRange_2 + [' '] + ['.']
   
    for i in message:
        if i not in myRange:
            message = message.replace(i, '  ')
    
    message = re.sub(' +', ' ', message)
    message = message.replace('.','')
    message = message.strip()
    result = message
    return result

In [5]:
sentence = 'Doesn\'t get, how{to}% \\operate+66.7 :after[it]"" & lt;# & gt; won\'t `or(what)'
print('cleaned: ',clean_data(sentence))

cleaned:  doesn t get how to operate 667 after it lt gt won t or what


In [6]:
# Preparation data for model

def prep_for_model(train_set, test_set):
    
    train_set_x = np.array(train_set.iloc[:,1])
    buf = []
    for i in train_set_x:
        buf.append((clean_data(str(i))).split(' '))
    train_set_x = buf
    train_set_y = np.array (train_set.iloc[:,0])
    
    test_set_x = np.array(test_set.iloc[:,1])
    buf = []
    for i in test_set_x:
        buf.append((clean_data(str(i))).split(' '))
    test_set_x = buf
    test_set_y = np.array (test_set.iloc[:,0])
    
    return train_set_x, train_set_y, test_set_x, test_set_y

train_set_x, train_set_y, test_set_x, test_set_y = prep_for_model(train_set, test_set)

In [7]:
a1, a2, b1, b2 = prep_for_model(df_for_tests.head(3), df_for_tests.tail(2))
print(a2[0], a1[0])
print(b2[0], b1[0])


ham ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
ham ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say']


In [8]:
# Check words in categories

def categories_words(x_train, y_train):
    
    all_words_list = []
    ham_words_list = []
    spam_words_list = []
    
    all_words_list = np.array([item for sublist in x_train for item in sublist])

    for i in range(len(y_train)):
        if y_train[i] == 'ham':
            ham_words_list.append(x_train[i])
        else:
            spam_words_list.append(x_train[i])
    
    ham_words_list = np.array([item for sublist in ham_words_list for item in sublist])
    spam_words_list = np.array([item for sublist in spam_words_list for item in sublist])
    
    return all_words_list, ham_words_list, spam_words_list

all_words_list_a1, ham_words_list_a1, spam_words_list_a1 = categories_words(a1, a2)

In [9]:
print('first five "ham" words of a1: ', ham_words_list_a1[:5])

first five "ham" words of a1:  ['go' 'until' 'jurong' 'point' 'crazy']


In [10]:
class Naive_Bayes(object):
    """
    Parameters:
    -----------
    alpha: int
        The smoothing coeficient.
    """
    def __init__(self, alpha):
        self.alpha = alpha
        
        self.train_set_x = None
        self.train_set_y = None
        
        self.all_words_list = []
        self.ham_words_list = []
        self.spam_words_list = []
    
    def fit(self, train_set_x, train_set_y):

        self.all_words_list, self.ham_words_list, self.spam_words_list = categories_words (train_set_x, train_set_y)
        
        self.probsHam = {}
        self.probsSpam = {}
        
        uniqueWords = np.unique(self.all_words_list)
        
        unique, counts = np.unique(self.ham_words_list, return_counts = True)
        hamStat = dict(zip(unique, counts))
        hamNum = len(self.ham_words_list)
        hamAlpha = len(np.unique(self.ham_words_list))
        
        unique, counts = np.unique(self.spam_words_list, return_counts = True)
        spamStat = dict(zip(unique, counts))
        spamNum = len(self.spam_words_list)
        spamAlpha = len(np.unique(self.spam_words_list))
        
        for i in uniqueWords:
            
            if i in hamStat.keys():
                self.probsHam[i] = (hamStat[i]+self.alpha)/(hamNum + self.alpha*hamAlpha)
            else:
                self.probsHam[i] = (self.alpha)/(hamNum + self.alpha*hamAlpha)
                
            if i in spamStat.keys():
                self.probsSpam[i] = (spamStat[i]+self.alpha)/(spamNum + self.alpha*spamAlpha) 
            else:
                self.probsSpam[i] = (self.alpha)/(spamNum + self.alpha*spamAlpha) 
        
    def predict(self, test_set_x):
        
        
        prediction = []
        bufHam = 1
        bufSpam = 1
        
        pH = len(np.unique(self.ham_words_list))/len(np.unique(self.all_words_list))
        pS = len(np.unique(self.spam_words_list))/len(np.unique(self.all_words_list))
        print(pH, pS)
        
        
        for mail in test_set_x:
            for word in mail:
                if word in self.probsHam.keys() and word in self.probsSpam.keys():
                    bufHam = bufHam*self.probsHam[word]
                    bufSpam = bufSpam*self.probsSpam[word] 
            bufHam = bufHam*pH
            bufSpam = bufSpam*pS
            if bufHam > bufSpam:
                prediction.append("ham")
            else:
                prediction.append("spam")
            
            bufHam = 1
            bufSpam = 1
        
        return prediction

In [11]:
a = 1

In [12]:
model = Naive_Bayes(alpha=a)

In [13]:
model.fit(train_set_x, train_set_y)

In [14]:
y_predictions = model.predict(test_set_x)

0.7894411473788329 0.3317260138476756


In [15]:
actual = list(test_set_y)
accuracy = (y_predictions == test_set_y).mean()
print(accuracy)

0.9811659192825112
