In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
df = df[['v1','v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
len(df["v2"])

5572

In [5]:
len(df["v2"].unique())

5169

## Removing duplicates

In [6]:
df.drop_duplicates(subset=None, inplace=True)

In [7]:
len(df["v2"])

5169

In [8]:
arr, indexes = np.unique(df[:]["v2"],return_index=True)

In [9]:
df_clean = df.iloc[indexes,:]

In [10]:
df_clean.head()

Unnamed: 0,v1,v2
4024,ham,&lt;#&gt; in mca. But not conform.
3014,ham,&lt;#&gt; mins but i had to stop somewhere f...
4264,ham,&lt;DECIMAL&gt; m but its not a common car he...
2765,ham,and picking them up from various points
3671,ham,"came to look at the flat, seems ok, in his 50..."


In [11]:
df_clean["v2"].head()

4024                  &lt;#&gt;  in mca. But not conform.
3014     &lt;#&gt;  mins but i had to stop somewhere f...
4264     &lt;DECIMAL&gt; m but its not a common car he...
2765             and  picking them up from various points
3671     came to look at the flat, seems ok, in his 50...
Name: v2, dtype: object

In [12]:
X=df_clean["v2"].values
y=df_clean["v1"].values

## Remove punctuation and Remove not relevant special characters

In [13]:
import string
def remove_punct(message):
    for punctuation in string.punctuation:
        message = message.replace(punctuation, '')
    return message
df['v2'] = df['v2'].apply(remove_punct)
df['v2'] = df['v2'].str.replace('\W', ' ')

In [14]:
df.head()

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...


In [15]:
df.shape[0]

5169

In [16]:
y[1]

'ham'

In [17]:
df[df["v1"]=="ham"].count()

v1    4516
v2    4516
dtype: int64

In [18]:
df[df["v1"]=="spam"].count()

v1    653
v2    653
dtype: int64

## Here out of 5196 data 4516 are ham so this is unbalanced distribution so we have to perform stratified split

In [19]:
sfs = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
sfs.get_n_splits(X, y)

1

In [20]:
for train_index, test_index in sfs.split(X,y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test= X[test_index], y[test_index]

## converting strings to integer counts

In [21]:
all_words = {}
total_all_words = 0
for email in X_train:
    for word in email.split():
        if word not in all_words:
            all_words[word]=1
        else:
            all_words[word]+=1
for i,j in all_words.items():
    total_all_words+=j
print(total_all_words)
len(all_words)

56020


12383

In [22]:
## Probability of SPAM


In [23]:
all_words

{'Big': 1,
 'brother\x89Û÷s': 1,
 'really': 49,
 'scraped': 1,
 'the': 812,
 'barrel': 1,
 'with': 228,
 'this': 165,
 'shower': 5,
 'of': 382,
 'social': 3,
 'misfits': 1,
 'Nope': 5,
 "i'm": 54,
 'not': 225,
 'drivin...': 1,
 'I': 977,
 'neva': 8,
 'develop': 2,
 'da': 43,
 'photos': 3,
 'lei...': 10,
 '\\Response\\"': 1,
 'is': 496,
 'one': 90,
 'd': 37,
 'powerful': 1,
 'weapon': 1,
 '2': 322,
 'occupy': 2,
 'a': 827,
 'place': 28,
 'in': 517,
 'others': 6,
 "'HEART'...": 1,
 'So': 66,
 'Boo.': 1,
 "How's": 8,
 'things?': 2,
 "I'm": 177,
 'back': 77,
 'at': 217,
 'home': 70,
 'and': 589,
 'little': 20,
 'bored': 9,
 'already': 31,
 ':-(': 11,
 'Sorry,': 6,
 "I'll": 62,
 'call': 194,
 'later': 35,
 'ok': 42,
 'bye': 2,
 '<Forwarded': 4,
 'from': 167,
 '88877>FREE': 1,
 'entry': 16,
 'into': 25,
 'our': 87,
 'å£250': 12,
 'weekly': 13,
 'comp': 7,
 'just': 160,
 'send': 98,
 'word': 15,
 'ENTER': 1,
 'to': 1394,
 '88877': 1,
 'NOW.': 5,
 '18': 12,
 'T&C': 9,
 'www.textcomp.com': 1,
 

## obtain frequency of the words

In [24]:
uniq = []
freq_ham = {}
freq_spam = {}
total_ham = 0
total_spam = 0
for i,email in enumerate(X_train):
    if y_train[i]=="ham":
        for word in email.split():
            if word not in freq_ham:
                freq_ham[word]=1
            else:
                freq_ham[word]+=1
    else:
        for word in email.split():
            if word not in freq_spam:
                freq_spam[word]=1
            else:
                freq_spam[word]+=1

In [25]:
for i,j in freq_spam.items():
    total_spam+=j
print(total_spam)
for i,j in freq_ham.items():
    total_ham+=j
print(total_ham)

10861
45159


In [26]:
len(freq_spam)

3421

In [27]:
len(freq_ham)

9905

In [28]:
freq_spam

{'<Forwarded': 4,
 'from': 68,
 '88877>FREE': 1,
 'entry': 16,
 'into': 5,
 'our': 52,
 'å£250': 12,
 'weekly': 13,
 'comp': 6,
 'just': 28,
 'send': 27,
 'the': 113,
 'word': 7,
 'ENTER': 1,
 'to': 376,
 '88877': 1,
 'NOW.': 4,
 '18': 12,
 'T&C': 9,
 'www.textcomp.com': 1,
 'Dont': 2,
 'forget': 2,
 'you': 98,
 'can': 16,
 'place': 3,
 'as': 9,
 'many': 3,
 'FREE': 53,
 'Requests': 1,
 'with': 65,
 '1stchoice.co.uk': 1,
 'wish.': 2,
 'For': 16,
 'more': 10,
 'Information': 1,
 'call': 103,
 '08707808226.': 1,
 'Good': 6,
 'Luck!': 2,
 'Draw': 1,
 'takes': 2,
 '28th': 1,
 'Feb': 1,
 '06.': 1,
 'removal': 2,
 'STOP': 27,
 '87239': 4,
 'customer': 17,
 'services': 9,
 '08708034412': 2,
 '5p': 1,
 '4': 52,
 'alfie': 2,
 "Moon's": 3,
 'Children': 3,
 'in': 40,
 'need': 5,
 'song': 3,
 'on': 77,
 'ur': 68,
 'mob.': 3,
 'Tell': 3,
 'm8s.': 3,
 'Txt': 39,
 'Tone': 6,
 'charity': 5,
 '8007': 15,
 'for': 115,
 'Nokias': 2,
 'or': 107,
 'Poly': 3,
 'polys:': 2,
 'zed': 5,
 '08701417012': 3,
 'pr

In [29]:
freq_ham

{'Big': 1,
 'brother\x89Û÷s': 1,
 'really': 48,
 'scraped': 1,
 'the': 699,
 'barrel': 1,
 'with': 163,
 'this': 133,
 'shower': 5,
 'of': 334,
 'social': 3,
 'misfits': 1,
 'Nope': 5,
 "i'm": 54,
 'not': 214,
 'drivin...': 1,
 'I': 963,
 'neva': 8,
 'develop': 2,
 'da': 43,
 'photos': 3,
 'lei...': 10,
 '\\Response\\"': 1,
 'is': 407,
 'one': 86,
 'd': 37,
 'powerful': 1,
 'weapon': 1,
 '2': 204,
 'occupy': 2,
 'a': 624,
 'place': 25,
 'in': 477,
 'others': 6,
 "'HEART'...": 1,
 'So': 65,
 'Boo.': 1,
 "How's": 8,
 'things?': 2,
 "I'm": 173,
 'back': 65,
 'at': 198,
 'home': 69,
 'and': 510,
 'little': 19,
 'bored': 9,
 'already': 30,
 ':-(': 11,
 'Sorry,': 6,
 "I'll": 62,
 'call': 91,
 'later': 35,
 'ok': 40,
 'bye': 2,
 'Okay,': 1,
 'good,': 6,
 'no': 68,
 'problem,': 1,
 'thanx!': 1,
 'will': 187,
 'reach': 15,
 'before': 32,
 'ten': 7,
 'morning': 22,
 'Yo': 9,
 'you': 1025,
 'around?': 4,
 'A': 48,
 'friend': 20,
 "mine's": 1,
 'lookin': 2,
 'to': 1018,
 'pick': 34,
 'up': 171,
 '

## creating the model using Naïve Bayes formula 

In [30]:
def find_words(mail):
    words = []
    for word in mail.split():
        if word not in words:
            words.append(word)
    return words

In [31]:
## Probability of spam & ham
hamlen = len(y_train[y_train=="ham"])
spamlen = len(y_train[y_train=="spam"])
p_ham = hamlen/(hamlen+spamlen)
p_spam = spamlen/(hamlen+spamlen)
print(p_ham,p_spam)

0.8736871199557766 0.12631288004422334


In [32]:
## Probability of all words
p_all_words = {}
for k1,v1 in freq_spam.items():
        p_all_words[k1] = v1/total_all_words
        

In [33]:
## Probability of word in spam list 
p_word_in_spam = {}
for k1,v1 in freq_spam.items():
        p_word_in_spam[k1] = v1/total_spam
    

In [34]:
## Probability of word in spam list 
p_word_in_ham = {}
for k1,v1 in freq_ham.items():
        p_word_in_ham[k1] = v1/total_ham


In [35]:
def naive_bayes_prob_spam(mail):
    words = find_words(mail)
    prob = 1
    p_word = 1
    for word in words:
        if word in p_word_in_spam:
            prob*=p_word_in_spam[word]
        else: 
            prob*=0.000000000000001
            
        if word in p_all_words:
            p_word*= p_all_words[word]
        else:
            p_word*=1
    return (prob*p_spam)/p_word

In [36]:
def naive_bayes_prob_ham(mail):
    words = find_words(mail)
    prob = 1
    p_word = 1
    for word in words:
        if word in p_word_in_ham:
            prob*=p_word_in_ham[word]
        else: 
            prob*=0.000000000000001
            
        if word in p_all_words:
            p_word*= p_all_words[word]
        else:
            p_word*=1
    return (prob*p_ham)/p_word

In [37]:
def predicted_prob(mail):
    prob_being_ham = naive_bayes_prob_ham(mail)
    prob_being_spam = naive_bayes_prob_spam(mail)
    return prob_being_spam,prob_being_ham

## Testing the modal

In [38]:
for i,test_mail in enumerate(X_test):
    pred_prob_spam,pred_prob_ham = predicted_prob(test_mail)
    if pred_prob_spam>=pred_prob_ham:
        print(test_mail)
        print("Predicted spam with prob:",pred_prob_spam)
        print("Actual y:",y_test[i])

Free tones Hope you enjoyed your new content. text stop to 61610 to unsubscribe. help:08712400602450p Provided by tones2you.co.uk
Predicted spam with prob: 163490898655.38776
Actual y: spam
18 days to Euro2004 kickoff! U will be kept informed of all the latest news and results daily. Unsubscribe send GET EURO STOP to 83222.
Predicted spam with prob: 4.349508286641213e-63
Actual y: spam
Todays Voda numbers ending 7548 are selected to receive a $350 award. If you have a match please call 08712300220 quoting claim code 4041 standard rates app
Predicted spam with prob: 3.078458658924416e-30
Actual y: spam
PRIVATE! Your 2003 Account Statement for shows 800 un-redeemed S. I. M. points. Call 08718738002 Identifier Code: 48922 Expires 21/11/04
Predicted spam with prob: 1.6349089865538807e-34
Actual y: spam
Hi. Customer Loyalty Offer:The NEW Nokia6650 Mobile from ONLY å£10 at TXTAUCTION! Txt word: START to No: 81151 & get yours Now! 4T&Ctxt TC 150p/MTmsg
Predicted spam with prob: 15.87839555040

## Finding accuracy

In [39]:
c = 0
for i,test_mail in enumerate(X_test):
    pred_prob_spam,pred_prob_ham = predicted_prob(test_mail)
    if pred_prob_spam>=pred_prob_ham and y_test[i]=="spam":
        c=c+1
    elif pred_prob_spam<=pred_prob_ham and y_test[i]=="ham":
        c=c+1
print(c/len(y_test))

0.9761444229529336


## Create a confusion matrix on our prediction for test dataset

In [40]:
#True positives
#Assuming spam = 1 and ham = 0
tp,tn, fp,fn=0,0,0,0
for i,test_mail in enumerate(X_test):
    pred_prob_spam,pred_prob_ham = predicted_prob(test_mail)
    if pred_prob_spam>=pred_prob_ham and y_test[i]=="spam":
        tp=tp+1
    elif pred_prob_spam<=pred_prob_ham and y_test[i]=="ham":
        tn=tn+1
    elif pred_prob_spam>=pred_prob_ham and y_test[i]=="ham":
        fp=fp+1
    elif pred_prob_spam<=pred_prob_ham and y_test[i]=="spam":
        fn=fn+1
pd.DataFrame(data = [[tp,fn],[fp,tn]],index=["Actual Yes", "Actual No"], columns=["Predicted Yes", "Predicted No"])

Unnamed: 0,Predicted Yes,Predicted No
Actual Yes,162,34
Actual No,3,1352


In [41]:
recall = tp/(tp+fn)
precision = tp/(tp+fp)
f1_score = 2*precision*recall/(precision+recall)
print("Recall:", recall)
print("Precision:", precision)
print("F1 score:", f1_score)

Recall: 0.826530612244898
Precision: 0.9818181818181818
F1 score: 0.8975069252077562
