In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import math

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.stem.lancaster import LancasterStemmer

from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data=pd.read_csv("spam.csv",encoding="ISO-8859-1")
data=data.iloc[:,0:2]
data['NUM_LABEL']=data.v1.map({'ham':0,'spam':1})
data.head()

Unnamed: 0,v1,v2,NUM_LABEL
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
def clean_data(text):
    txt=str(text)
    txt=re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt=re.sub(r'\n',r' ',txt)
    txt=" ".join([i.lower() for i in txt.split()])
    txt=" ".join([w for w in word_tokenize(txt)])
    
    stops=set(stopwords.words("english"))
    txt=" ".join([w for w in txt.split() if w not in stops])
    
    stemmer=LancasterStemmer()
    txt=" ".join([stemmer.stem(w) for w in txt.split()])
    return txt

data['v2']=data['v2'].map(lambda x:clean_data(x))

##collecting information from data
x_train,x_test,y_train,y_test=train_test_split(data.v2,data.NUM_LABEL,random_state=50,test_size=0.3)
label,freq=np.unique(y_train,return_counts=True)
print(label,freq)
prob_ham=freq[0]/(freq[0]+freq[1])
prob_spam=freq[1]/(freq[0]+freq[1])

#creating dictionary for ham and spam containing frequencies for words
dicham={}
dicspam={}
for i in range(0,len(x_train)):
    if(np.array(y_train)[i]==0):
        for w in np.array(x_train)[i].split():
            if w not in dicham:
                dicham[w]=1
            else:
                dicham[w]=dicham[w]+1
    else:
        for w in np.array(x_train)[i].split():
            if w not in dicspam:
                dicspam[w]=1
            else:
                dicspam[w]=dicspam[w]+1
                
##totaluniquewords is total number of unique words in the training data set
totaluniquewords=len(dicspam)+len(dicham)
totalspamwords=0
for i in dicspam.values():
    totalspamwords=totalspamwords+i
totalhamwords=0
for i in dicham.values():
    totalhamwords=totalhamwords+i


[0 1] [3399  501]


In [8]:
count=0
for i in range(0,len(x_test)):
    sum1=np.log(prob_ham)
    sum2=np.log(prob_spam)
    for w in np.array(x_test)[i].split():
        if w in dicham:
            temp=math.log((dicham[w]+1)/(freq[0]+totaluniquewords))
        else:
            temp=math.log(1/(freq[0]+totaluniquewords))
        sum1=sum1+temp
        if w in dicspam:
            temp=math.log((dicspam[w]+1)/(freq[1]+totaluniquewords))
        else:
            temp=math.log(1/(freq[1]+totaluniquewords))
        sum2=sum2+temp
    if(sum1>sum2):
        result=0
    else:
        result=1
    if(np.array(y_test)[i]==result):
        count=count+1
print("correct predictions out of",len(y_test),"is count",count)
print("accuracy is",(count*100)/len(y_test))

correct predictions out of 1672 is count 1625
accuracy is 97.18899521531101


In [9]:
##enter test mail
a="U dun say so early hor... U c already then say"
sum1=np.log(prob_ham)
sum2=np.log(prob_spam)
for w in a.split():
    if w in dicham:
        temp=math.log((dicham[w]+1)/(freq[0]+totaluniquewords))
    else:
        temp=math.log(1/(freq[0]+totaluniquewords))
    sum1=sum1+temp
    if w in dicspam:
        temp=math.log((dicspam[w]+1)/(freq[1]+totaluniquewords))
    else:
        temp=math.log(1/(freq[1]+totaluniquewords))
    sum2=sum2+temp
if(sum1>sum2):
    print("mail is not spam")
else:
    print("mail is spam")




mail is not spam
