# Classification of messages as spam or not spam using Naive Bayes algorithm 

In [1]:
import pandas as pd

In [2]:
# Import Dataset
df = pd.read_table('SMS', sep='\t', header=None,names=['label', 'sms_message'])
df

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# map the 'ham' value to 0 and the 'spam' value to 1.
df['categories'] = df.label.map({'ham':0,'spam':1})

In [4]:
# Randomize the dataset
df = df.sample(frac=1, random_state=1)
df

Unnamed: 0,label,sms_message,categories
1078,ham,"Yep, by the pretty sculpture",0
4028,ham,"Yes, princess. Are you going to make me moan?",0
958,ham,Welp apparently he retired,0
4642,ham,Havent.,0
4674,ham,I forgot 2 ask ü all smth.. There's a card on ...,0
...,...,...,...
905,ham,"We're all getting worried over here, derek and...",0
5192,ham,Oh oh... Den muz change plan liao... Go back h...,0
3980,ham,CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...,0
235,spam,Text & meet someone sexy today. U can find a d...,1


In [5]:
# Split into training and test sets
training_test_index = round(len(df) * 0.8)

training = df[:training_test_index].reset_index(drop=True)
test = df[training_test_index:].reset_index(drop=True)

print(training.shape)
print(test.shape)

(4458, 3)
(1114, 3)


In [6]:
#  data cleaning 
training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
training['sms_message'] = training['sms_message'].str.lower() ### making all the words lowercase

test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation
test['sms_message'] = test['sms_message'].str.lower() ### making all the words lowercase
training 

  training['sms_message'] = training['sms_message'].str.replace('\W', ' ') # Removes punctuation
  test['sms_message'] = test['sms_message'].str.replace('\W', ' ') # Removes punctuation


Unnamed: 0,label,sms_message,categories
0,ham,yep by the pretty sculpture,0
1,ham,yes princess are you going to make me moan,0
2,ham,welp apparently he retired,0
3,ham,havent,0
4,ham,i forgot 2 ask ü all smth there s a card on ...,0
...,...,...,...
4453,ham,sorry i ll call later in meeting any thing re...,0
4454,ham,babe i fucking love you too you know fuck...,0
4455,spam,u ve been selected to stay in 1 of 250 top bri...,1
4456,ham,hello my boytoy geeee i miss you already a...,0


In [7]:
### creating vocabulary from training data
training['sms_message'] = training['sms_message'].str.split()
vocabulary = []
for sms in training['sms_message']:
   for word in sms:
      vocabulary.append(word)
vocabulary = list(set(vocabulary))  ### only count the number of unique words
print(len(vocabulary))

7783


In [8]:
word_counts_per_sms = {unique_word: [0] * len(training['sms_message']) for unique_word in vocabulary}

for index, sms in enumerate(training['sms_message']):
   for word in sms:
      word_counts_per_sms[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts

Unnamed: 0,keluviri,mu,update_now,bollox,08712400200,topic,find,again,cheers,unmits,...,hubby,viva,oranges,excuse,contents,2ez,1b6a5ecef91ff9,f,ros,buns
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
training_new = pd.concat([training, word_counts], axis=1)
training_new.head()

Unnamed: 0,label,sms_message,categories,keluviri,mu,update_now,bollox,08712400200,topic,find,...,hubby,viva,oranges,excuse,contents,2ez,1b6a5ecef91ff9,f,ros,buns
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Laplace smoothing
alpha = 1

# You will start from here.

In [11]:
def count_Spam(label): #how many spam messages
    count = 0
    for x in training_new["label"]:
        if x == label:
            count += 1
    return count

In [12]:
def N_words (word, label): # number of occurences of a given word given a label
    count = 0
    for x in range(0, len(training_new)):
        if training_new["label"][x] == label:
            if training_new[word][x] > 0:
                count += 1
    return count

In [13]:
def word_Prob(word): #probability a word is spam
    numSpam = N_words(word, "spam")
    numHam = N_words(word, "ham")
    return numSpam / (numSpam+numHam)

In [14]:
def sentence_Prob(sentence):  #finds the probability a sentence is spam
    probs = 1
    for word in sentence:
        probs *= word_Prob(word)
    return probs

In [15]:
list = [i for i in training_new["sms_message"]]

In [None]:
test['predicted'] = [sentence_Prob(i) for i in list ]

# Calculate accuracy, precision, recall and F1_score. 

In [None]:
# Model Evaluation 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy score: {}'.format(accuracy_score(test['categories'], test['predicted'])))
print('Precision score: {}'.format(precision_score(test['categories'], test['predicted'])))
print('Recall score: {}'.format(recall_score(test['categories'], test['predicted'])))
print('F1 score: {}'.format(f1_score(test['categories'], test['predicted'])))