# Naive Bayes - SMS SPAM Detection

In [1]:
import pandas as pd

## Loading Dataset

In [2]:
# Source: https://archive.ics.uci.edu/dataset/228/sms+spam+collection

df = pd.read_csv("../data/sms_spam_collection.csv",
                 delimiter = "\t",
                 header = None,
                 names=["y", "sms"])
df.head()

Unnamed: 0,y,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5572, 2)

## Probability of Ham / Spam

In [4]:
df_general = df.groupby(['y']).count().reset_index()
df_general

Unnamed: 0,y,sms
0,ham,4825
1,spam,747


In [5]:
p_spam = 747 / 5572
p_ham = 4825 / 5572

print(p_spam, p_ham)

0.13406317300789664 0.8659368269921034


In [6]:
total_spam = df_general.loc[df_general["y"] == 'spam', 'sms'].values[0]
total_ham = df_general.loc[df_general["y"] == 'ham', 'sms'].values[0]

In [7]:
p_spam = total_spam / df.shape[0]
p_ham = total_ham / df.shape[0]

print(p_spam, p_ham)

0.13406317300789664 0.8659368269921034


## Text Preparation

### Converting to Lower Case

In [8]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
df.head()

  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,y,sms
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


### Removing Special Characters

In [9]:
df["sms"] = df["sms"].str.replace("[^a-z 0-9]+", "", regex=True)
df.sample(10)

Unnamed: 0,y,sms
3580,ham,multiply the numbers independently and count d...
2754,ham,derp which is worse a dude who always wants to...
5467,spam,get your garden ready for summer with a free s...
926,ham,k wait chikkuil send aftr ltgt mins
640,ham,i had askd u a question some hours before its ...
2938,ham,lol yep did that yesterday already got my fire...
333,spam,call germany for only 1 pence per minute call ...
4340,ham,just got outta class gonna go gym
3011,ham,imagine life without me see how fast u are sea...
1730,ham,lol yeah at this point i guess not


### Tokenizing, Stemming, Lemmatizing, Removing Stopwords

In [10]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords

In [11]:
stops = stopwords.words('english')
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(lemma_words)

df['sms'] = df['sms'].map(lambda s:preprocess(s)) 

In [13]:
df.sample(10)

Unnamed: 0,y,sms
3792,spam,twink bear scalli skin jock call dont miss wee...
3284,ham,hey tmr mayb meet yck
2851,ham,she fine good hear dear happi new year
5107,ham,realli need kiss miss babi babi 4eva
5172,ham,aight text tonight well see what
326,ham,callsmessagesmiss call
4143,ham,infact happi new year see
5507,ham,want insid everi night
777,ham,dont tell friend your sure want live smoke muc...
4074,ham,actual exam harder nbme


## Retrieving Unique Words

In [32]:
words = df['sms'].str.split().explode().drop_duplicates().values
words

array(['jurong', 'point', 'crazi', ..., 'now1', 'piti', 'soani'],
      dtype=object)

In [33]:
len(words)

7853

### Counting Words

In [34]:
df_count = df.set_index('y')['sms'].str.split().explode().reset_index().groupby(['sms', 'y']).size().to_frame()
df_count = df_count.reset_index()
df_count

Unnamed: 0,sms,y,0
0,008704050406,spam,2
1,0089mi,spam,1
2,0121,spam,1
3,01223585236,spam,1
4,01223585334,spam,2
...,...,...,...
8665,zoe,spam,1
8666,zogtoriu,ham,1
8667,zoom,ham,1
8668,zouk,spam,1


In [35]:
df_count.describe()

Unnamed: 0,0
count,8670.0
mean,5.34579
std,16.90067
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,366.0


In [36]:
df_count[df_count[0] == 366]

Unnamed: 0,sms,y,0
1832,call,spam,366


### Calculating Probabilities

In [37]:
columns=['word', 'prob_ham', 'prob_spam']
df_p = pd.DataFrame(data=None, columns=columns)

for word in words:
    entry = []
    df_word = df_count[df_count["sms"] == word]
    if df_word.shape[0] > 0:
        count_word_spam = df_word.loc[df_word['y'] == 'spam', 0].values
        count_word_ham = df_word.loc[df_word['y'] == 'ham', 0].values
        
        if len(count_word_spam) > 0:
            count_word_spam = count_word_spam[0]
        else:
            count_word_spam = 1
            
        if len(count_word_ham) > 0:
            count_word_ham = count_word_ham[0]
        else:
            count_word_ham = 1
                    
        p_word_ham = count_word_ham / total_ham
        p_word_spam = count_word_spam / total_spam
        
        entry = [word, p_word_ham, p_word_spam]
        
        df_entry = pd.DataFrame(data=[entry], columns=columns)
        if df_p.shape[0] > 0:
            df_p = pd.concat([df_p, df_entry])
        else:
            df_p = df_entry

df_p = df_p.reset_index(drop=True)
df_p.sample(10)

Unnamed: 0,word,prob_ham,prob_spam
1260,jstfrnd,0.000622,0.001339
6968,ger,0.000207,0.001339
4419,box61m60,0.000207,0.001339
7543,sday,0.000207,0.001339
7802,identif,0.000207,0.001339
20,win,0.003938,0.08166
3096,survey,0.000207,0.001339
2752,bleh,0.000415,0.001339
1465,6month,0.000207,0.002677
1393,cashbal,0.000207,0.009371


In [38]:
# these are the main components of the classifier that we need to store
# this is basically the model of naive bayes classifier

classifer_data = {"df": df_p, "p_ham": p_ham, "p_spam": p_spam}

In [39]:
df_p.describe()

Unnamed: 0,prob_ham,prob_spam
count,7852.0,7852.0
mean,0.000972,0.002822
std,0.003375,0.010043
min,0.000207,0.001339
25%,0.000207,0.001339
50%,0.000207,0.001339
75%,0.000415,0.001339
max,0.074611,0.48996


## Testing / Classification

In [40]:
df_sample = df.sample(1).values.tolist()
query = df_sample[0][1]
actual_class = df_sample[0][0]

print(df_sample)

query_words = query.split(' ')

[['ham', 'that good need drug']]


In [46]:
# https://stats.stackexchange.com/questions/66079/naive-bayes-classifier-gives-a-probability-greater-than-1

p_words_ham = 1
p_words_spam = 1

for word in query_words:
    df_p_word = df_p[df_p["word"] == word]

    p_words_ham *= df_p_word['prob_ham'].values[0]
    p_words_spam *= df_p_word['prob_spam'].values[0]

p_words = (p_ham * p_words_ham) + (p_spam * p_words_spam)

p_final_ham = p_words_ham * p_ham / p_words
p_final_spam = p_words_spam * p_spam / p_words

print(f"Ham: {p_final_ham:.5f}, Spam: {p_final_spam:.5f}")

if p_final_ham > p_final_spam:
    classified = 'ham'
else:
    classified = 'spam'

print("")
print(f"Predicted Class: {classified}")
print(f"Actual Class: {actual_class}")

Ham: 0.99941, Spam: 0.00059

Predicted Class: ham
Actual Class: ham
