# Lab 2 - Exercise 3 #
## Modified by: Santiago Carvajal

# Unigrams, Bigrams, and Trigrams in Naive Bayes Classifiers

Math of Intelligence Week 6 Challenge - https://www.youtube.com/watch?v=PrkiRVcrxOs&t=7s

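In this notebook I explore how word n-grams (unigrams, bigrams, and trigrams) perform in a Naive Bayes classifier. I will look at how they perform across two data sets:
    1) A Spam SMS dataset
    2) Rap lines from Biggie Smalls and 2Pac

Note: the cells below actually load three small example tables (Table 13.1, Table 13.10, and the IAML examples) rather than the two data sets listed above.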

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Table 13.1: keep only the first two CSV columns and name them label/body.
df = pd.read_csv('./Tabla131.csv', encoding='latin-1', usecols=[0, 1])
df.columns = ['label', 'body']
# Encode the textual labels numerically: "yes" -> 1, "no" -> 0.
label_codes = {"no": 0, "yes": 1}
df['label'] = df['label'].replace(label_codes)
# Rows of [label, body], the layout ngrams_bayes indexes as item[0] / item[1].
data_131 = df.to_numpy()
print(data_131)

[[1 'Chinese Beijing Chinese']
 [1 'Chinese Chinese Shanghai']
 [1 'Chinese Macao']
 [0 'Tokyo Japan Chinese']
 [1 'Chinese Chinese Chinese Tokyo Japan']]


In [3]:
# Table 13.10: keep only the first two CSV columns and name them label/body.
df = pd.read_csv('./Tabla1310.csv', encoding='latin-1', usecols=[0, 1])
df.columns = ['label', 'body']
# Encode the textual labels numerically: "yes" -> 1, "no" -> 0.
label_codes = {"no": 0, "yes": 1}
df['label'] = df['label'].replace(label_codes)
# Rows of [label, body], the layout ngrams_bayes indexes as item[0] / item[1].
data_1310 = df.to_numpy()
print(data_1310)

[[1 'Taipei Taiwan']
 [1 'Macao Taiwan Shanghai']
 [0 'Japan Sapporo']
 [0 'Sapporo Osaka Taiwan']
 [0 'Taiwan Taiwan Sapporo']]


In [4]:
# IAML (Edinburgh) examples: keep only the first two CSV columns (label, body).
df = pd.read_csv('./IAML.csv', encoding='latin-1', usecols=[0, 1])
df.columns = ['label', 'body']
# Encode the textual labels numerically: "spam" -> 1, "ham" -> 0.
label_codes = {"ham": 0, "spam": 1}
df['label'] = df['label'].replace(label_codes)
# Rows of [label, body], the layout ngrams_bayes indexes as item[0] / item[1].
data_iaml = df.to_numpy()
print(data_iaml)

[[1 'send us your password']
 [0 'send us your review']
 [0 'review your password']
 [1 'send us']
 [1 'send us your password']
 [1 'send us your account']
 [0 'review us now']]


In [5]:
class ngrams_bayes():
    """Binary Naive Bayes text classifier over word n-grams.

    Label ``1`` is the positive class; any other label is counted as the
    negative class.  Typical use::

        model = ngrams_bayes(data, n)
        model.train()
        model.evaluate_test_data()
    """

    # Default additive-smoothing constant.  Defined at class level so that
    # conditionalText / conditionalNgram work even before classify() has been
    # called; classify() overrides it per instance with its ``alpha`` argument.
    alpha = 1.0

    def __init__(self, data, n=2, split=0.8):
        """Split ``data`` into train/test parts and tokenise both into n-grams.

        Args:
            data: sequence of rows where ``row[0]`` is the label and
                ``row[1]`` is the document text.
            n: number of words per gram (1 = unigram, 2 = bigram, ...).
            split: value passed to ``train_test_split`` as ``train_size``.
        """
        # shuffle=False keeps the row order, so the leading rows become the
        # training set and the trailing rows the test set.
        self.train_data, self.test_data = train_test_split(
            data, train_size=split, shuffle=False)  # random_state=69)

        # Replace each document's text by its list of n-grams.
        self.train_data = [[item[0], self.ngrams(n, item[1])] for item in self.train_data]
        self.test_data = [[item[0], self.ngrams(n, item[1])] for item in self.test_data]

        # Number of distinct n-grams seen in the training data; used as the
        # vocabulary size in the smoothing denominator.
        self.unique = len({gram for message in self.train_data for gram in message[1]})

        # Per-class n-gram frequency tables, filled by train().
        self.trainPositive = {}
        self.trainNegative = {}
        # Total n-gram occurrences per class and number of positive documents.
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0
        # Class priors, computed by train().
        self.pA = 0
        self.pNotA = 0

    def printTrain(self):
        """Print the tokenised training set."""
        print(self.train_data)

    def printTest(self):
        """Print the tokenised test set."""
        print(self.test_data)

    def ngrams(self, n, text):
        """Return the list of consecutive ``n``-word grams found in ``text``.

        Words are separated on any run of whitespace, so repeated, leading or
        trailing spaces do not produce empty-string tokens.  Each gram is the
        ``n`` words joined by a single space.  A text with fewer than ``n``
        words yields an empty list.
        """
        words = text.split()
        return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    def train(self):
        """Count n-grams per class on the training set and compute the priors.

        All counters are reset first, so calling ``train()`` again (for
        example when a notebook cell is re-run) recomputes the statistics
        instead of adding to the previous ones.

        Raises:
            ZeroDivisionError: if the training set is empty.
        """
        self.trainPositive = {}
        self.trainNegative = {}
        self.posGramCount = 0
        self.negGramCount = 0
        self.spamCount = 0

        for label, grams in self.train_data:
            if label == 1:
                self.spamCount += 1
                counts = self.trainPositive
                self.posGramCount += len(grams)
            else:
                counts = self.trainNegative
                self.negGramCount += len(grams)
            for gram in grams:
                counts[gram] = counts.get(gram, 0) + 1

        # Prior of the positive class = fraction of positive training documents.
        self.pA = self.spamCount / float(len(self.train_data))
        self.pNotA = 1.0 - self.pA

    def classify(self, text, alpha=1.0):
        """Classify an already tokenised document.

        Args:
            text: list of n-grams (as produced by ``ngrams``).
            alpha: additive-smoothing constant; stored on the instance and
                used by ``conditionalNgram``.

        Returns:
            ``1`` if the positive-class score is strictly greater than the
            negative-class score, otherwise ``0`` (ties go to ``0``).

        Both unnormalised scores are printed.  The message wording refers to
        "China" regardless of which data set the model was built from.
        """
        self.alpha = alpha
        isSpam = self.pA * self.conditionalText(text, 1)
        notSpam = self.pNotA * self.conditionalText(text, 0)
        print("Document belong to China probability: " + str(isSpam))
        print("Document not in China probability: " + str(notSpam))
        return 1 if isSpam > notSpam else 0

    def conditionalText(self, grams, label):
        """Return the product of the smoothed likelihoods of ``grams`` given ``label``.

        An empty ``grams`` list yields ``1.0``.
        """
        result = 1.0
        for ngram in grams:
            result *= self.conditionalNgram(ngram, label)
        return result

    def conditionalNgram(self, ngram, label):
        """Return the additively smoothed likelihood of ``ngram`` given ``label``.

        Computed as ``(count + alpha) / (class_total + alpha * unique)`` where
        ``count`` is the n-gram's frequency in the class, ``class_total`` the
        number of n-gram occurrences in the class and ``unique`` the number of
        distinct training n-grams.
        """
        alpha = self.alpha
        if label == 1:
            counts, total = self.trainPositive, self.posGramCount
        else:
            counts, total = self.trainNegative, self.negGramCount
        return (counts.get(ngram, 0) + alpha) / float(total + alpha * self.unique)

    def evaluate_test_data(self):
        """Classify every test document, print the accuracy and return it.

        Returns:
            Fraction of test documents whose predicted label equals the true
            label, as a float in ``[0, 1]``.

        Raises:
            ZeroDivisionError: if the test set is empty.
        """
        results = []
        for label, grams in self.test_data:
            # classify() uses its default alpha and prints both class scores.
            results.append(1 if self.classify(grams) == label else 0)

        print("Evaluated {} test cases. {:.2f}% Accuracy".format(len(results), 100.0*sum(results)/float(len(results))))
        return sum(results)/float(len(results))

In [6]:
# Build one unigram (n=1) model per data set.  The IAML model uses a 0.9
# training fraction; the other two use the constructor's default split.
unigram_bayes = ngrams_bayes(data_131, 1)
unigram_bayes2 = ngrams_bayes(data_1310, 1)
unigram_bayes3 = ngrams_bayes(data_iaml, 1, split=0.9)

# Show the train/test partition of each model under its own banner.
for header, model in (
    ("--------Table 13.1 sets------------------------------------------------", unigram_bayes),
    ("--------Table 13.10 sets------------------------------------------------", unigram_bayes2),
    ("--------IMAL data sets------------------------------------------------", unigram_bayes3),
):
    print(header)
    print("Train set :")
    model.printTrain()
    print("Test set :")
    model.printTest()

--------Table 13.1 sets------------------------------------------------
Train set :
[[1, ['Chinese', 'Beijing', 'Chinese']], [1, ['Chinese', 'Chinese', 'Shanghai']], [1, ['Chinese', 'Macao']], [0, ['Tokyo', 'Japan', 'Chinese']]]
Test set :
[[1, ['Chinese', 'Chinese', 'Chinese', 'Tokyo', 'Japan']]]
--------Table 13.10 sets------------------------------------------------
Train set :
[[1, ['Taipei', 'Taiwan']], [1, ['Macao', 'Taiwan', 'Shanghai']], [0, ['Japan', 'Sapporo']], [0, ['Sapporo', 'Osaka', 'Taiwan']]]
Test set :
[[0, ['Taiwan', 'Taiwan', 'Sapporo']]]
--------IMAL data sets------------------------------------------------
Train set :
[[1, ['send', 'us', 'your', 'password']], [0, ['send', 'us', 'your', 'review']], [0, ['review', 'your', 'password']], [1, ['send', 'us']], [1, ['send', 'us', 'your', 'password']], [1, ['send', 'us', 'your', 'account']]]
Test set :
[[0, ['review', 'us', 'now']]]




In [7]:
# Fit the three unigram models (counts n-grams per class and sets the priors).
for model in (unigram_bayes, unigram_bayes2, unigram_bayes3):
    model.train()

In [8]:
# Evaluate each trained unigram model on its held-out rows.
for header, model in (
    ("--------Table 13.1 results------------------------------------------------", unigram_bayes),
    ("--------Table 13.10 results------------------------------------------------", unigram_bayes2),
    ("--------IAML data results ------------------------------------------------", unigram_bayes3),
):
    print(header)
    accuracy = model.evaluate_test_data()
# Leave the last accuracy as the cell's final expression so it is still echoed.
accuracy

--------Table 13.1 results------------------------------------------------
Document belong to China probability: 0.00030121377997263036
Document not in China probability: 0.00013548070246744226
Evaluated 1 test cases. 100.00% Accuracy
--------Table 13.10 results------------------------------------------------
Document belong to China probability: 0.0026041666666666665
Document not in China probability: 0.003472222222222222
Evaluated 1 test cases. 100.00% Accuracy
--------IAML data results ------------------------------------------------
Document belong to China probability: 0.00041666666666666675
Document not in China probability: 0.000910332271279017
Evaluated 1 test cases. 100.00% Accuracy


1.0

In [9]:
# Build and fit the three bigram (n=2) models up front; neither step prints.
# All three use the constructor's default split (0.8).
bigram_sms = ngrams_bayes(data_131, 2)
bigram_sms2 = ngrams_bayes(data_1310, 2)
bigram_sms3 = ngrams_bayes(data_iaml, 2)
for fitted in (bigram_sms, bigram_sms2, bigram_sms3):
    fitted.train()

# Report each model under its banner.
print("--------Table 13.1 results------------------------------------------------")
bigram_sms.evaluate_test_data()
print("--------Table 13.10 results------------------------------------------------")
bigram_sms2.evaluate_test_data()
print("--------IAML data results------------------------------------------------")
# Kept as the final bare expression so the notebook echoes this accuracy.
bigram_sms3.evaluate_test_data()

--------Table 13.1 results------------------------------------------------
Document belong to China probability: 0.00014467592592592592
Document not in China probability: 7.620789513793628e-05
Evaluated 1 test cases. 100.00% Accuracy
--------Table 13.10 results------------------------------------------------
Document belong to China probability: 0.006172839506172839
Document not in China probability: 0.006172839506172839
Evaluated 1 test cases. 100.00% Accuracy
--------IAML data results------------------------------------------------
Document belong to China probability: 0.004166666666666667
Document not in China probability: 0.0016000000000000005
Document belong to China probability: 0.004166666666666667
Document not in China probability: 0.004000000000000001
Evaluated 2 test cases. 50.00% Accuracy




0.5

In [10]:
# Trigram (n=3) run.  The single name trigram_sms is rebound for each data
# set in turn, so after the loop it refers to the IAML model.
for header, dataset in (
    ("--------Table 13.1 results------------------------------------------------", data_131),
    ("--------Table 13.10 results------------------------------------------------", data_1310),
    ("--------IAML results------------------------------------------------", data_iaml),
):
    print(header)
    trigram_sms = ngrams_bayes(dataset, 3)
    trigram_sms.train()
    accuracy = trigram_sms.evaluate_test_data()
# Leave the last accuracy as the cell's final expression so it is still echoed.
accuracy

--------Table 13.1 results------------------------------------------------
Document belong to China probability: 0.006000000000000002
Document not in China probability: 0.00390625
Evaluated 1 test cases. 100.00% Accuracy
--------Table 13.10 results------------------------------------------------
Document belong to China probability: 0.16666666666666666
Document not in China probability: 0.16666666666666666
Evaluated 1 test cases. 100.00% Accuracy
--------IAML results------------------------------------------------
Document belong to China probability: 0.028124999999999997
Document not in China probability: 0.016326530612244896
Document belong to China probability: 0.075
Document not in China probability: 0.05714285714285714
Evaluated 2 test cases. 50.00% Accuracy




0.5