In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import collections
from collections import Counter 

nltk.download('wordnet')

def extract_data():
    # Loading data
    dataset = pd.read_csv('emails.csv')
    print(dataset .columns) #Index(['text', 'spam'], dtype='object')
    print(dataset.shape)  #(5728, 2)

    #Pre-processing

    #Checking for duplicates and removing them
    dataset.drop_duplicates(inplace = True)
    print(dataset.shape)  #(5695, 2)

    #Checking for any null entries in the dataset
    print (pd.DataFrame(dataset.isnull().sum()))

    #Removing subject:
    dataset['text']=dataset['text'].map(lambda text: text[9:])
    #Removing numerical values
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'\d+(\.\d+)?', '',x))
    #Removing punctuator
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'[^\w\d\s]', ' ',x))
    #Removing whitespace
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'\s+', ' ',x))
    #Removing leading and trailing whitespace
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'^\s+|\s+?$', '',x))
    #Removing stopwords
    stop_words = ['_','i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    dataset['text'] = dataset['text'].apply(lambda x: ' '.join(
        term for term in x.split() if term not in set(stop_words))
    )
    #Removing one and two letter words 
    dataset['text'] = dataset['text'].apply(lambda x: re.sub(r'\b\w{1,2}\b','',x))
    new_list=list(dataset['text'])
    label=list(dataset['spam'])
    return new_list,label
    
new_list,label = extract_data()
count_spam=len([i for i in label if i==1])
count_ham=len([i for i in label if i==0])
print("number of spamand ham resp.",count_spam,count_ham)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bandriya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Index(['text', 'spam'], dtype='object')
(5728, 2)
(5695, 2)
      0
text  0
spam  0
number of spamand ham resp. 1368 4327


In [9]:
# Making dictionary of importent words
#def Make_dict(new_list):
split_list=[]
for i in range(len(new_list)):
    split_list.append(new_list[i].split())

str2 = [] 
spam_diction={}
ham_diction={}
diction={}
lemmatizer = WordNetLemmatizer()
# loop till string values present in list str 
for j in range(len(new_list)):              
    for i in range(len(split_list[j])):
        # lemmatize data
        split_list[j][i]=lemmatizer.lemmatize(split_list[j][i])
        if split_list[j][i] not in str2: 
            # insert value in str2 
            str2.append(split_list[j][i])
            spam_diction[split_list[j][i]]=0
            ham_diction[split_list[j][i]]=0
            diction[split_list[j][i]]=1
            if(label[j]==1):
                spam_diction[split_list[j][i]]=1
            else:
                ham_diction[split_list[j][i]]=1
        else:
            diction[split_list[j][i]]+=1
            if(label[j]==1):
                spam_diction[split_list[j][i]]+=1
            else:
                ham_diction[split_list[j][i]]+=1


spam_diction_c=spam_diction.copy()
ham_diction_c=ham_diction.copy()

for i in spam_diction_c.copy():
    if spam_diction_c[i] < 25: 
        spam_diction_c.pop(i)
for i in ham_diction_c.copy():
    if ham_diction_c[i] < 50: 
        ham_diction_c.pop(i)
diction_c=Counter(spam_diction_c)+Counter(ham_diction_c)

print("length of spam", len(spam_diction_c))
print("length of ham", len(ham_diction_c))
print("total number of used words ",len(diction_c))
t_s=sum(spam_diction_c.values())
t_h=sum(ham_diction_c.values())
t_total=t_s+t_h


length of spam 1211
length of ham 1890
total number of used words  2299


In [11]:
# funtion to preprocess data
def pre_process_data(dataset):
    dataset =re.sub(r'\d+(\.\d+)?', '',dataset)
    dataset = re.sub(r'[^\w\d\s]', ' ',dataset)
    dataset= re.sub(r'\s+', ' ',dataset)
    dataset = re.sub(r'^\s+|\s+?$', '',dataset)
    dataset = re.sub(r'\b\w{1,2}\b','',dataset)
    dataset = dataset.lower()
    data_list=list(dataset)
    lemmatizer = WordNetLemmatizer()
    split_list=[]
    split_list=dataset.split()
    for i in range(len(split_list)):
        split_list[i]=lemmatizer.lemmatize(split_list[i])
    
    return split_list

In [12]:
#fetching data from first test email
file1 = open("./test/email1.txt","r")
data= file1.read()
test1=pre_process_data(data)
prior_spam=1368/(1368+4327)
prior_ham=4327/(1368+4327)
posterior_spam=prior_spam
posterior_ham=prior_ham
for i in test1:  
    #print(i)
    Clist=Counter(spam_diction_c)+ Counter(ham_diction_c)
    if i in Clist.copy():
        if i in spam_diction_c.copy():
            #print('#',np.log((spam_diction_c[i]+1)/(count_spam+2)))
            posterior_spam+=np.log((spam_diction_c[i]+1)/(count_spam+2))
        else: 
            #print('##',np.log(1/(count_spam+2)))
            posterior_spam+=np.log(1/(count_spam+2))

        if i in ham_diction_c.copy():
            #print('*',np.log((ham_diction_c[i]+1)/(count_ham+2)))
            posterior_ham+=np.log((ham_diction_c[i]+1)/(count_ham+2))
        else: 
            #print('**',np.log(1/(count_ham+2)))
            posterior_ham+=np.log(1/(count_ham+2))

        #word_prob[i]=(diction[i]+t_total)/(t_total*(1+len(diction)))
posterior_spam=posterior_spam*prior_spam
posterior_ham=posterior_ham*prior_ham
print(posterior_spam,posterior_ham)
if(posterior_spam>posterior_ham):
    print('spam')
else:
    print('ham')


-27.388832682804065 -61.793988698259646
spam


In [13]:
#fetching data from second test email
file2 = open("./test/email2.txt","r")
data= file2.read()
test2=pre_process_data(data)
prior_spam=1368/(1368+4327)
prior_ham=4327/(1368+4327)
posterior_spam=0
posterior_ham=0
for i in test2:   
    #print(i)
    Clist=Counter(spam_diction_c)+ Counter(ham_diction_c)
    if i in (Clist):
        if i in spam_diction_c.copy():
            #print('#',np.log((spam_diction_c[i]+1)/(count_spam+2)))
            posterior_spam+=np.log((spam_diction_c[i]+1)/(count_spam+2))
        else: 
            #print('##',np.log(1/(count_spam+2)))
            posterior_spam+=np.log(1/(count_spam+2))

        if i in ham_diction_c.copy():
            #print('*',np.log((ham_diction_c[i]+1)/(count_ham+2)))
            posterior_ham+=np.log((ham_diction_c[i]+1)/(count_ham+2))
        else: 
            #print('**',np.log(1/(count_ham+2)))
            posterior_ham+=np.log(1/(count_ham+2))

posterior_spam=posterior_spam*prior_spam
posterior_ham=posterior_ham*prior_ham
print(posterior_spam,posterior_ham)
if(posterior_spam>posterior_ham):
    print('spam')
else:
    print('ham')

-56.01590411009303 -187.65287600027497
spam


In [19]:
def check_spam(data):
    test=pre_process_data(data)
    prior_spam=1368/(1368+4327)
    prior_ham=4327/(1368+4327)
    posterior_spam=0
    posterior_ham=0
    for i in test:   
        #print(i)
        Clist=Counter(spam_diction_c)+ Counter(ham_diction_c)
        if i in (Clist):
            if i in spam_diction_c.copy():
                #print('#',np.log((spam_diction_c[i]+1)/(count_spam+2)))
                posterior_spam+=np.log((spam_diction_c[i]+1)/(count_spam+2))
            else: 
                #print('##',np.log(1/(count_spam+2)))
                posterior_spam+=np.log(1/(count_spam+2))

            if i in ham_diction_c.copy():
                #print('*',np.log((ham_diction_c[i]+1)/(count_ham+2)))
                posterior_ham+=np.log((ham_diction_c[i]+1)/(count_ham+2))
            else: 
                #print('**',np.log(1/(count_ham+2)))
                posterior_ham+=np.log(1/(count_ham+2))

    posterior_spam=posterior_spam+ np.log(prior_spam)
    posterior_ham=posterior_ham+np.log(prior_ham)
    #print(posterior_spam,posterior_ham)
    if(posterior_spam>posterior_ham):
        return 1
    else:
        return 0

In [20]:
# accuracy test
dataset = pd.read_csv('spam_ham.csv',encoding= 'latin-1')
print(dataset .columns) #Index(['text', 'spam'], dtype='object')
print(dataset.shape)  #(5728, 2)
l=list(dataset['v1'])
for i in range(len(dataset)):
    if l[i]=='spam':
        label[i]=1
    else:
        label[i]=0
d=list(dataset['v2'])
print(d[0])

count=0
new_lable=[]
for i in range(len(dataset)):
    gg=check_spam(d[i])
    print(i,'  ',label[i],'   ',gg)
    # print(labe[i],"   ",new_label[i])
    if(label[i]==gg):
        count+=1

print(count)      

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
(5572, 5)
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
0    0     0
1    0     0
2    1     1
3    0     0
4    0     0
5    1     0
6    0     0
7    0     0
8    1     1
9    1     1
10    0     0
11    1     1
12    1     1
13    0     1
14    0     0
15    1     1
16    0     0
17    0     1
18    0     0
19    1     0
20    0     0
21    0     0
22    0     1
23    0     0
24    0     0
25    0     0
26    0     0
27    0     0
28    0     0
29    0     0
30    0     1
31    0     0
32    0     0
33    0     0
34    1     1
35    0     0
36    0     0
37    0     0
38    0     0
39    0     0
40    0     0
41    0     1
42    1     1
43    0     0
44    0     1
45    0     0
46    0     1
47    0     0
48    0     0
49    0     0
50    0     0
51    0     0
52    0     0
53    0     0
54    1     0
55    0     0
56    1     0
57    0     

541    0     1
542    0     0
543    0     0
544    0     1
545    0     0
546    0     0
547    0     0
548    0     0
549    0     0
550    0     1
551    0     0
552    0     0
553    0     0
554    0     0
555    0     0
556    0     0
557    0     0
558    0     0
559    0     1
560    0     0
561    0     0
562    0     1
563    1     1
564    0     0
565    0     0
566    0     1
567    0     0
568    0     1
569    0     0
570    0     0
571    0     0
572    0     0
573    0     0
574    0     0
575    1     1
576    0     0
577    0     0
578    1     1
579    0     1
580    0     0
581    0     0
582    1     1
583    0     1
584    0     1
585    0     0
586    0     0
587    0     0
588    0     0
589    0     0
590    1     1
591    1     1
592    1     1
593    0     0
594    0     0
595    0     0
596    0     0
597    1     0
598    0     0
599    0     0
600    0     1
601    0     1
602    0     0
603    0     0
604    0     0
605    0     0
606    1     0
607    0  

1084    0     1
1085    0     0
1086    0     0
1087    0     1
1088    1     1
1089    0     0
1090    1     1
1091    0     0
1092    0     0
1093    0     0
1094    0     0
1095    0     0
1096    1     1
1097    0     0
1098    0     0
1099    0     0
1100    0     0
1101    0     0
1102    0     0
1103    0     1
1104    1     0
1105    0     0
1106    0     0
1107    0     0
1108    0     0
1109    0     0
1110    0     0
1111    0     0
1112    0     0
1113    0     0
1114    0     0
1115    0     0
1116    0     0
1117    1     1
1118    0     0
1119    1     0
1120    0     1
1121    1     0
1122    0     0
1123    0     0
1124    0     0
1125    1     0
1126    0     0
1127    0     0
1128    1     1
1129    0     0
1130    0     0
1131    0     0
1132    0     0
1133    0     0
1134    0     0
1135    0     0
1136    1     0
1137    0     0
1138    0     0
1139    0     0
1140    0     0
1141    1     0
1142    0     0
1143    0     0
1144    0     0
1145    1     1
1146    

1603    0     0
1604    0     1
1605    0     0
1606    0     0
1607    0     0
1608    0     1
1609    0     0
1610    0     0
1611    0     0
1612    1     1
1613    0     0
1614    0     0
1615    0     0
1616    0     0
1617    0     1
1618    0     0
1619    0     0
1620    0     1
1621    0     1
1622    1     1
1623    0     0
1624    1     1
1625    0     0
1626    0     0
1627    1     0
1628    0     0
1629    0     0
1630    0     0
1631    0     1
1632    0     1
1633    0     0
1634    1     0
1635    0     1
1636    0     0
1637    1     0
1638    0     0
1639    1     0
1640    0     1
1641    0     0
1642    0     0
1643    0     0
1644    0     0
1645    0     0
1646    0     0
1647    0     0
1648    0     0
1649    0     0
1650    0     0
1651    0     0
1652    1     1
1653    0     1
1654    0     0
1655    0     0
1656    0     0
1657    0     1
1658    1     1
1659    0     0
1660    0     0
1661    0     0
1662    1     1
1663    0     0
1664    0     0
1665    

2123    1     1
2124    0     0
2125    0     0
2126    0     0
2127    0     1
2128    0     0
2129    0     1
2130    0     0
2131    0     0
2132    1     1
2133    0     0
2134    0     0
2135    0     0
2136    0     0
2137    0     0
2138    0     0
2139    0     0
2140    0     1
2141    0     0
2142    0     0
2143    0     0
2144    1     1
2145    1     1
2146    0     0
2147    0     0
2148    0     0
2149    0     0
2150    0     0
2151    0     0
2152    0     0
2153    0     0
2154    0     1
2155    0     0
2156    0     0
2157    0     0
2158    0     0
2159    1     0
2160    0     0
2161    0     0
2162    0     0
2163    0     0
2164    0     1
2165    0     0
2166    0     0
2167    0     0
2168    0     0
2169    1     1
2170    0     0
2171    0     0
2172    0     1
2173    0     0
2174    0     0
2175    0     0
2176    0     1
2177    0     0
2178    0     0
2179    0     0
2180    0     0
2181    0     0
2182    0     0
2183    0     0
2184    0     0
2185    

2655    0     0
2656    0     0
2657    0     0
2658    0     0
2659    0     0
2660    0     0
2661    0     0
2662    1     0
2663    1     1
2664    0     0
2665    0     0
2666    0     0
2667    0     0
2668    1     0
2669    1     1
2670    0     0
2671    0     0
2672    0     0
2673    0     0
2674    0     0
2675    0     1
2676    0     0
2677    0     1
2678    0     0
2679    1     0
2680    0     1
2681    0     0
2682    0     0
2683    0     1
2684    0     0
2685    1     1
2686    0     0
2687    0     0
2688    0     0
2689    0     0
2690    1     1
2691    0     0
2692    1     0
2693    0     0
2694    0     0
2695    0     0
2696    0     0
2697    0     0
2698    1     0
2699    0     0
2700    0     1
2701    0     0
2702    0     0
2703    0     0
2704    1     1
2705    0     1
2706    0     0
2707    1     0
2708    0     0
2709    0     1
2710    1     0
2711    0     1
2712    0     0
2713    0     0
2714    0     0
2715    0     0
2716    0     0
2717    

3196    0     1
3197    0     0
3198    0     0
3199    0     0
3200    0     0
3201    0     0
3202    0     0
3203    0     0
3204    0     0
3205    0     0
3206    0     0
3207    0     0
3208    0     0
3209    0     1
3210    0     0
3211    0     0
3212    0     0
3213    0     1
3214    0     0
3215    1     1
3216    0     0
3217    0     0
3218    0     1
3219    0     0
3220    1     0
3221    0     0
3222    0     0
3223    0     0
3224    0     1
3225    0     1
3226    0     1
3227    1     1
3228    1     1
3229    0     0
3230    0     0
3231    0     1
3232    0     1
3233    0     0
3234    0     0
3235    0     0
3236    0     0
3237    0     0
3238    0     0
3239    0     1
3240    0     0
3241    0     0
3242    0     0
3243    0     1
3244    0     0
3245    0     1
3246    0     0
3247    0     0
3248    0     0
3249    0     1
3250    0     0
3251    0     0
3252    0     0
3253    0     0
3254    0     0
3255    0     0
3256    0     0
3257    0     0
3258    

3748    1     1
3749    0     0
3750    0     0
3751    0     0
3752    0     0
3753    1     0
3754    0     0
3755    0     0
3756    1     0
3757    0     1
3758    0     1
3759    0     0
3760    0     0
3761    1     1
3762    0     0
3763    0     0
3764    1     0
3765    0     0
3766    0     1
3767    0     0
3768    0     0
3769    0     1
3770    0     1
3771    0     0
3772    0     0
3773    0     0
3774    0     0
3775    0     0
3776    1     1
3777    0     0
3778    1     1
3779    0     0
3780    0     0
3781    0     0
3782    0     1
3783    0     0
3784    0     0
3785    0     0
3786    0     0
3787    1     0
3788    0     0
3789    0     1
3790    1     0
3791    0     1
3792    0     0
3793    0     0
3794    0     1
3795    0     0
3796    0     0
3797    0     1
3798    0     0
3799    1     1
3800    0     0
3801    0     0
3802    0     1
3803    0     1
3804    0     0
3805    1     1
3806    0     0
3807    0     0
3808    0     0
3809    0     1
3810    

4289    0     1
4290    0     0
4291    0     0
4292    0     1
4293    1     1
4294    1     1
4295    1     0
4296    1     0
4297    1     0
4298    0     0
4299    0     0
4300    0     0
4301    0     0
4302    0     1
4303    0     1
4304    0     0
4305    0     0
4306    0     0
4307    0     0
4308    0     0
4309    1     0
4310    0     1
4311    0     1
4312    0     0
4313    0     0
4314    0     0
4315    0     0
4316    0     1
4317    0     0
4318    0     0
4319    0     0
4320    0     0
4321    0     0
4322    0     0
4323    0     1
4324    0     1
4325    0     0
4326    0     0
4327    1     1
4328    0     1
4329    0     0
4330    0     1
4331    0     0
4332    0     0
4333    0     0
4334    0     0
4335    0     0
4336    0     0
4337    0     0
4338    0     0
4339    0     0
4340    0     0
4341    0     0
4342    0     1
4343    0     0
4344    0     1
4345    0     0
4346    1     1
4347    0     0
4348    0     0
4349    0     1
4350    0     0
4351    

4832    1     1
4833    0     0
4834    0     0
4835    0     0
4836    0     1
4837    0     0
4838    0     0
4839    1     1
4840    0     0
4841    0     0
4842    0     0
4843    1     1
4844    0     0
4845    0     0
4846    0     1
4847    0     1
4848    0     0
4849    0     0
4850    0     1
4851    0     0
4852    0     0
4853    0     0
4854    0     0
4855    0     0
4856    0     0
4857    0     0
4858    0     0
4859    0     1
4860    0     0
4861    1     0
4862    1     0
4863    0     0
4864    0     0
4865    0     0
4866    0     0
4867    0     1
4868    0     1
4869    0     0
4870    0     0
4871    0     0
4872    0     0
4873    0     0
4874    0     0
4875    1     1
4876    0     0
4877    1     1
4878    0     1
4879    0     0
4880    0     0
4881    0     0
4882    0     0
4883    0     1
4884    0     1
4885    0     0
4886    1     0
4887    0     0
4888    0     1
4889    0     0
4890    0     0
4891    0     0
4892    0     0
4893    0     0
4894    

5348    0     0
5349    0     0
5350    0     0
5351    0     0
5352    0     0
5353    0     1
5354    0     1
5355    0     1
5356    0     0
5357    0     0
5358    0     0
5359    0     1
5360    0     0
5361    0     0
5362    0     0
5363    0     1
5364    1     1
5365    1     1
5366    1     1
5367    0     0
5368    1     1
5369    0     0
5370    1     0
5371    0     0
5372    0     0
5373    0     0
5374    0     1
5375    0     0
5376    0     0
5377    1     0
5378    1     1
5379    0     0
5380    0     0
5381    1     0
5382    0     0
5383    0     0
5384    0     0
5385    0     0
5386    0     0
5387    0     0
5388    0     0
5389    0     0
5390    0     0
5391    0     0
5392    0     0
5393    0     1
5394    0     0
5395    0     0
5396    0     1
5397    0     0
5398    0     0
5399    0     0
5400    0     0
5401    0     1
5402    0     1
5403    0     1
5404    0     0
5405    0     0
5406    0     0
5407    0     0
5408    0     0
5409    0     0
5410    

In [21]:
4473/5572

0.8027638190954773

In [460]:
len([i for i in label if i==0])

4948