In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

In [3]:
df = pd.read_csv("spam.csv",encoding="ISO-8859-1")

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.rename(columns={"v1":"spam","v2":"text"},inplace=True)

In [16]:
df.shape

(5572, 5)

In [7]:
df.head()

Unnamed: 0,spam,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [11]:
df["spam"].value_counts()

ham     4825
spam     747
Name: spam, dtype: int64

In [12]:
df["spam"] = df.spam.apply(lambda s:True if s=="spam" else False)

In [14]:
df['spam'].value_counts()

False    4825
True      747
Name: spam, dtype: int64

In [18]:
df["text"].unique()

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [19]:
df["text"] = df.text.apply(lambda t: t.lower().translate(str.maketrans("","",string.punctuation)))

In [21]:
df["text"].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: text, dtype: object

In [22]:
df.head()

Unnamed: 0,spam,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,False,go until jurong point crazy available only in ...,,,
1,False,ok lar joking wif u oni,,,
2,True,free entry in 2 a wkly comp to win fa cup fina...,,,
3,False,u dun say so early hor u c already then say,,,
4,False,nah i dont think he goes to usf he lives aroun...,,,


In [23]:
df = df.sample(frac=1)

In [24]:
df.head()

Unnamed: 0,spam,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
1118,False,im going 4 lunch now wif my family then aft da...,,,
1633,False,yesterday its with me only now am going home,,,
2264,False,ok now i am in bus if i come soon i will co...,,,
2396,False,well you told others youd marry them,,,
787,True,ever thought about living a good life with a p...,,,


In [27]:
df.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1,inplace=True)

In [28]:
df.head()

Unnamed: 0,spam,text
1118,False,im going 4 lunch now wif my family then aft da...
1633,False,yesterday its with me only now am going home
2264,False,ok now i am in bus if i come soon i will co...
2396,False,well you told others youd marry them
787,True,ever thought about living a good life with a p...


In [31]:
train_df = df.iloc[:int(len(df)*0.7)]
test_df = df.iloc[int(len(df)*0.7):]

In [47]:
fraction_spam_text = train_df = train_df.spam.mean()
fraction_spam_text

0.12846153846153846

In [34]:
train_df.shape,test_df.shape

((3900, 2), (1672, 2))

In [36]:
train_spam_words = "".join(train_df[train_df["spam"]==True].text).split(" ")
train_not_spam_words = "".join(train_df[train_df["spam"]==False].text).split(" ")

In [38]:
common_words = set(train_spam_words).intersection(set(train_not_spam_words))

In [41]:
#common_words

In [45]:
train_spam_bow = dict()
for i in common_words:
    train_spam_bow[i] = train_spam_words.count(i) / len(train_spam_words)

train_non_spam_bow = dict()
for i in common_words:
    train_non_spam_bow[i] = train_not_spam_words.count(i) / len(train_not_spam_words)

In [53]:
def predict_text(t,verbose=False):
    valid_words = [w for w in t if w in train_spam_bow]
    
    spam_probs = [train_spam_bow[w] for w in valid_words]
    not_spam_probs = [train_non_spam_bow[w] for w in valid_words]
    
    if verbose:
        data_df = pd.DataFrame()
        data_df["word"] = valid_words
        data_df["spam_prob"] = spam_probs
        data_df["not_spam_prob"] = not_spam_probs
        data_df["ratio"] = [s/n if n>0 else np.inf for s,n in zip(spam_probs,not_spam_probs)]
        print(data_df)
    
    spam_score = sum(np.log(p) for p in spam_probs) + np.log(fraction_spam_text)
    not_spam_score = sum(np.log(p) for p in not_spam_probs) + np.log(1-fraction_spam_text)
    
    if verbose:
        print(f"Spam score {spam_score}")
        print(f"Not Spam score {not_spam_score}")
    
    return (spam_score >= not_spam_score)

In [54]:
predict_text("urgent call this number".split(),verbose=True)

     word  spam_prob  not_spam_prob      ratio
0  urgent   0.000703       0.000022  31.766464
1    call   0.019344       0.003189   6.066512
2    this   0.004572       0.003432   1.332142
3  number   0.002022       0.000908   2.227526
Spam score -24.848308325227876
Not Spam score -29.28255287227646


True

In [55]:
predict_text("hey do you want to hang out tonight".split(),verbose=True)

      word  spam_prob  not_spam_prob     ratio
0      hey   0.000176       0.000310  0.567258
1       do   0.000703       0.005182  0.135754
2      you   0.012310       0.026063  0.472314
3     want   0.000967       0.002236  0.432464
4       to   0.040183       0.024114  1.666354
5      out   0.002638       0.003144  0.838903
6  tonight   0.000088       0.000399  0.220600
Spam score -47.78710037012017
Not Spam score -40.54382740276193


False

In [56]:
predict_text("win 200 rupees".split(),verbose=True)

  word  spam_prob  not_spam_prob      ratio
0  win    0.00255       0.000066  38.384478
Spam score -8.023827305230844
Not Spam score -9.756850012017109


True

In [57]:
predictions = test_df.text.apply(lambda t: predict_text(t.split()))