In [1]:
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
%matplotlib inline

messages = pd.read_csv('projectdata.csv', encoding = 'latin-1')
train = pd.read_csv('traindata.csv', encoding = 'latin-1')
test = pd.read_csv('testdata.csv', encoding = 'latin-1')

from textblob import Word

def dataprocess(message, lower_case = True, stem = True, stop_words = True, gram = 1, lem = True):
    if lower_case:
        message = message.lower()
    words = word_tokenize(message)
    words = [w for w in words if len(w) > 2]
    if gram > 1:
        w = []
        for i in range(len(words) - gram + 1):
            w += [' '.join(words[i:i + gram])]
        return w
    if stop_words:
        sw = stopwords.words('english')
        words = [word for word in words if word not in sw]
    if lem:
        words = [Word(word).lemmatize() for word in words]
    if stem:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]   
    return words
#message = '? 1 wow!! going to school having fun arguing'
#preprocess(message)

class detect_spam(object):
    def __init__(self, train):
        self.message, self.score, self.len_train = train['message'], train['score'], len(train)
    
    def tfidf(self):
        
        self.totalspam, self.not_spam = self.score.value_counts()[1], self.score.value_counts()[0]
        self.words1 = 0
        self.words2 = 0
        self.spamcount = {}
        self.not_spamcount = {}
        self.idf_spam = {}
        self.idf_not_spam = {}
        self.spamprob = {}
        self.notspam_prob = {}
        self.sum_prob_spam = 0
        self.sum_prob_notspam = 0
        for i in range(self.len_train):
            processed = dataprocess(self.message[i])
            count = [] #to find idf whether the word is in message or not
            for word in processed:
                if self.score[i]:
                    self.spamcount[word] = self.spamcount.get(word, 0) + 1
                    self.words1 += 1
                else:
                    self.not_spamcount[word] = self.not_spamcount.get(word, 0) + 1
                    self.words2 += 1
                if word not in count:
                    count += [word]
            for word in count:
                if self.score[i]:
                    self.idf_spam[word] = self.idf_spam.get(word, 0) + 1
                else:
                    self.idf_not_spam[word] = self.idf_not_spam.get(word, 0) + 1
        
        
        for word in self.spamcount:
            self.spamprob[word] = (self.spamcount[word]) * log((self.len_train) / (self.idf_spam[word] + self.idf_not_spam.get(word, 0)))
            self.sum_prob_spam += self.spamprob[word]
        for word in self.spamcount:
            self.spamprob[word] = (self.spamprob[word] + 1) / (self.sum_prob_spam + len(list(self.spamprob.keys())))
            
        for word in self.not_spamcount:
            self.notspam_prob[word] = (self.not_spamcount[word]) * log((self.len_train) \
                                                          / (self.idf_spam.get(word, 0) + self.idf_not_spam[word]))
            self.sum_prob_notspam += self.notspam_prob[word]
        for word in self.not_spamcount:
            self.notspam_prob[word] = (self.notspam_prob[word] + 1) / (self.sum_prob_notspam + len(list(self.notspam_prob.keys())))
            
    
        self.prob_spam_mail, self.prob_ham_mail = self.totalspam / self.len_train, self.not_spam / self.len_train 
    
    def compare_prob(self, processed_message):
        finalprob_spam, finalprob_notspam = 0, 0
        for word in processed_message:                
            if word in self.spamprob:
                finalprob_spam += log(self.spamprob[word])
            else:
                finalprob_spam -= log(self.sum_prob_spam + len(list(self.spamprob.keys())))
                
            if word in self.notspam_prob:
                finalprob_notspam += log(self.notspam_prob[word])
            else:
                finalprob_notspam -= log(self.sum_prob_notspam + len(list(self.notspam_prob.keys()))) 
                
            finalprob_spam += log(self.prob_spam_mail)
            finalprob_notspam += log(self.prob_ham_mail)
            
        return finalprob_spam >= finalprob_notspam
    
    def predict(self, testmessage):
        predictions = {}
        for (i, message) in enumerate(testmessage):
            processed_message = dataprocess(message)
            predictions[i] = int(self.compare_prob(processed_message))
        return predictions
    
    def confusion_mat(self, testdata, prediction):
        true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
        for i in range(len(testdata)):
            true_pos += int(testdata[i] == 1 and prediction[i] == 1)
            true_neg += int(testdata[i] == 0 and prediction[i] == 0)
            false_pos += int(testdata[i] == 0 and prediction[i] == 1)
            false_neg += int(testdata[i] == 1 and prediction[i] == 0)
        precision = true_pos / (true_pos + false_pos)
        recall = true_pos / (true_pos + false_neg)
        Fscore = 2 * precision * recall / (precision + recall)
        accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
        TPR = true_pos / (true_pos + false_neg)
        FPR = false_pos / (false_pos + true_neg )

        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F-score: ", Fscore)
        print("Accuracy: ", accuracy)
        print("TPR:", TPR)
        print("FPR:", FPR)

In [2]:
call = detect_spam(train)
call.tfidf()
pred = call.predict(test['message'])
call.confusion_mat(test['score'], pred)

Precision:  0.9369369369369369
Recall:  0.7074829931972789
F-score:  0.8062015503875969
Accuracy:  0.9551569506726457
TPR: 0.7074829931972789
FPR: 0.007231404958677686


In [17]:
pm = dataprocess('URGENT! Your Mobile No. was awarded å£2000 Bonus')
print(call.compare_prob(pm))

pm = dataprocess('Going for dinner')
print(call.compare_prob(pm))

pm = dataprocess('You are a winner. You have been specially selected to receive money')
print(call.compare_prob(pm))

pm = dataprocess('I am going to see you today')
print(call.compare_prob(pm))

pm = dataprocess('Reply with your name and address and receive award')
print(call.compare_prob(pm))

pm = dataprocess('Hi,This exclusive download is not available to the general public.This is your Private Access Only. Click Here To Get Instant Access Now. Only One Download License Per User Member Solutions. Talk soon,')
print(call.compare_prob(pm))

True
False
True
False
True
False
