In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from math import log, sqrt

In [2]:
data = pd.read_csv('data/spam.csv', encoding = 'latin-1')

In [3]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [4]:
data = data.rename(columns={'v1':'label', 'v2':'message'})

In [5]:
data['label'] = data['label'].map({'spam':1, 'ham':0})

In [6]:
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
def process_message(message, lower_case = True, stem = True, stop_words = True, gram = 2):
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    if gram > 1:
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]   
    return words

In [12]:
messages = data['message']
labels = data['label']

In [13]:
class bow:
    
    def __init__(self):
        print("Classifier is bag of Words")
    
    def train(self, message, label):
        
        self.freq_spam = dict([])
        self.freq_ham = dict([])
        self.total_spam_words = 0
        self.total_ham_words = 0
        
        for sentence in message[label == 1]:
            words = process_message(sentence)
            for word in words:
                if word in self.freq_spam.keys():
                    self.freq_spam[word] += 1
                else:
                    self.freq_spam[word] = 1
                self.total_spam_words += 1
                
        for sentence in message[label == 0]:
            words = process_message(sentence)
            for word in words:
                if word in self.freq_ham.keys():
                    self.freq_ham[word] += 1
                else:
                    self.freq_ham[word] = 1
                self.total_ham_words += 1
                
        self.total_words = self.total_spam_words + self.total_ham_words
        print("Training Done!")
                
    
    def predict(self, message):
        words = process_message(message)
        
        p_spam = 1
        
        for word in words:
            p2 = self.total_spam_words / (self.total_spam_words + self.total_ham_words)
            if not (word in self.freq_spam.keys()):
                self.freq_spam[word] = 0
            if not (word in self.freq_ham.keys()):
                self.freq_ham[word] = 0
            p1 = self.freq_spam[word] / self.total_spam_words
            p3 = (self.freq_spam[word] + self.freq_ham[word]) / self.total_words
            if p3 == 0:
                continue
            p_spam *= (p1/p3)    
            p_spam *= p2
        
        p_ham = 1
        
        for word in words:
            p2 = self.total_spam_words / (self.total_spam_words + self.total_ham_words)
            p1 = self.freq_ham[word] / self.total_ham_words
            p3 = (self.freq_spam[word] + self.freq_ham[word]) / self.total_words
            if p3 == 0:
                continue
            p_ham *= (p1/p3)    
            p_ham *= p2
              
        return 1 if p_spam > p_ham else 0    

In [14]:
b = bow()

Classifier is bag of Words


In [15]:
b.train(message=messages, label=labels)

Training Done!


In [16]:
pre = np.zeros((len(labels)))
for i  in range(0, len(messages)):
    pre[i] = b.predict(messages[i])

In [17]:
def metrics(labels, predictions):
    true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
    for i in range(len(labels)):
        true_pos += int(labels[i] == 1 and predictions[i] == 1)
        true_neg += int(labels[i] == 0 and predictions[i] == 0)
        false_pos += int(labels[i] == 0 and predictions[i] == 1)
        false_neg += int(labels[i] == 1 and predictions[i] == 0)
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    Fscore = 2 * precision * recall / (precision + recall)
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)

    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", Fscore)
    print("Accuracy: ", accuracy)

In [18]:
metrics(labels, pre)

Precision:  0.9946737683089214
Recall:  1.0
F-score:  0.9973297730307076
Accuracy:  0.9992821249102656
