In [2]:
# author : ujwol dahal


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import re
import time

In [3]:
start_time = time.perf_counter()
data = pd.read_csv('spam-ham-data.csv', index_col = None)
data.head()
data.shape

(5572, 2)

In [4]:
#our dataframe holds  columns one for message and next for the  category. .
#the next step will be data cleaning.
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df
data = swap_columns(data,'Category','Message')
data


Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will ü b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [5]:
data['Category'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Category, dtype: float64

In [6]:
#removing punctuations:
from os import remove
import string
def remove_punctuations(text):
    for punctuation in string. punctuation:
        text = text. replace(punctuation, '')
    return text
data['Message'] = data['Message'].apply(remove_punctuations)

#lowercasing all the Messages and labels.
data= data.applymap(lambda s:s.lower() if type(s) == str else s)
data['Message'] = data['Message'].str.replace(
   '\W', ' ')


  data['Message'] = data['Message'].str.replace(


In [7]:
# creating vocabulary for the data:
data_bag = data['Message'].str.split()

vocabulary = []
for sentence in data_bag:
    for word in sentence:
        vocabulary.append(word)
vocabulary = list(set(vocabulary))
total_words = len(vocabulary) 


In [8]:
#splitting dataframe into test set and train set:

#shuffling the dataset to avoid bias or variance
data = data.sample(frac=1)
training_data = data.iloc[:round((0.75)*len(data))].reset_index(drop=True)
test_data = data.iloc[round((0.75)*len(data)):].reset_index(drop=True)

#seeing the proportion of spam and ham messages in our test set.
test_data['Category'].value_counts(normalize=True)

#Considering train_data: here indicies of messages are random and count is 4179 = 75% * 5572
training_data


Unnamed: 0,Message,Category
0,dont gimme that lip caveboy,ham
1,send me the new number,ham
2,kkcongratulation,ham
3,your account has been refilled successfully by...,ham
4,the xmas story is peace the xmas msg is love t...,ham
...,...,...
4174,hey babe my friend had to cancel still up for ...,ham
4175,easy ahsen got selected means its good,ham
4176,hey is rite u put 10 evey mnth is that all,ham
4177,im done c ü there,ham


In [9]:
word_counts = {unique_word: [0] * len(training_data['Message']) for unique_word in vocabulary}

for index, msg in enumerate(training_data['Message']):
   for word in msg:
      if word in [' ','£','’','鈥','〨']:
         continue
      try:
         word_counts[word][index] += 1
      except KeyError:
         pass
transformed_train_data = pd.DataFrame(word_counts)
transformed_train_data = pd.concat([training_data, transformed_train_data], axis=1)
transformed_train_data


Unnamed: 0,Message,Category,welltake,ettans,playi,600,football,816183,ending,missin,...,foregate,hut,tip,workout,remet,w45wq,gibe,sportsx,carso,upyeh
0,dont gimme that lip caveboy,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,send me the new number,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,kkcongratulation,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,your account has been refilled successfully by...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,the xmas story is peace the xmas msg is love t...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4174,hey babe my friend had to cancel still up for ...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4175,easy ahsen got selected means its good,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4176,hey is rite u put 10 evey mnth is that all,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4177,im done c ü there,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
##
spam_message_df = transformed_train_data.loc[transformed_train_data['Category'] == 'spam']
ham_message_df = transformed_train_data.loc[transformed_train_data['Category'] == 'ham']

#setting up priors:
prior_ham = len(ham_message_df)/len(transformed_train_data)
prior_spam = 1 - prior_ham
spam_message_df

Unnamed: 0,Message,Category,welltake,ettans,playi,600,football,816183,ending,missin,...,foregate,hut,tip,workout,remet,w45wq,gibe,sportsx,carso,upyeh
8,hey boys want hot xxx pics sent direct 2 ur ph...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,themobyo yo yohere comes a new selection of ho...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,oh my god ive found your number again im so gl...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,valentines day special win over 1000 in our q...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,got what it takes 2 take part in the wrc rally...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4105,hello darling how are you today i would love t...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4115,gent we are trying to contact you last weekend...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4118,email alertfrom jeri stewartsize 2kbsubject lo...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4128,urgent your mobile no 07xxxxxxxxx won a 2000 ...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#finding total number of token present in spam and ham messages.
total_spam_words = spam_message_df['Message'].apply(len).sum()
total_ham_words = ham_message_df['Message'].apply(len).sum()


In [12]:
#Now calculating class probabilities
probability_spam = {}
probability_ham = {}
alpha = 1
for term in vocabulary:
    occurrence_in_spam = spam_message_df[term].sum()
    p_spam = (occurrence_in_spam + alpha)/(total_spam_words + total_words)
    probability_spam[term] = p_spam

    occurrence_in_ham = ham_message_df[term].sum()
    p_ham = (occurrence_in_ham+alpha)/(total_ham_words + total_words)
    probability_ham[term] = p_ham



In [13]:
#now all words in the vocabulary have associated probabilities of spam or ham in the probability dictionary above.


In [14]:
test_data[test_data['Category'] == 'spam']

Unnamed: 0,Message,Category
3,urgent we are trying to contact you last weeke...,spam
6,natalja 25f is inviting you to be her friend r...,spam
20,09066362231 urgent your mobile no 07xxxxxxxxx ...,spam
32,urgent this is the 2nd attempt to contact uu h...,spam
34,money i have won wining number 946 wot do i do...,spam
...,...,...
1348,u have a secret admirer who is looking 2 make ...,spam
1372,sms ac jsco energy is high but u may not know ...,spam
1374,congrats 1 year special cinema pass for 2 is y...,spam
1376,someone has contacted our dating service and e...,spam


In [15]:
message_to_classify = test_data['Message'].values.tolist()
desired_labels = test_data['Category'].values.tolist()

In [16]:
def spam_or_ham(message):
    pS = pH = 1
    for word in message:
        if word in vocabulary:
            pS = pS * probability_spam[word]
            pH = pH * probability_ham[word]
    if pS > pH:
        return 'spam'
    else:
        return 'ham'

In [17]:
label = []
for message in message_to_classify:
    label.append(spam_or_ham(message))

In [18]:
def compute_accuracy(Y_true, Y_pred):  
    correctly_predicted = 0  
    # iterating over every label and checking it with the true sample  
    for true_label, predicted in zip(Y_true, Y_pred):  
        if true_label == predicted:  
            correctly_predicted += 1  
    # computing the accuracy score  
    accuracy_score = correctly_predicted / len(Y_true)  
    return accuracy_score  



In [19]:
#the accuracy of our model then is:
accuracy= compute_accuracy(desired_labels,label)

In [20]:
print(f"The accuracy of the multinomial text classification is \t {accuracy}\n\n")
end_time = time.perf_counter()

print(f"The total time for script execution along with testing is \t {end_time-start_time} ***")

The accuracy of the multinomial text classification is 	 0.9669777458722182


The total time for script execution along with testing is 	 18.66448579999269 ***


In [22]:
confusion_matrix(desired_labels,label)

array([[1193,   21],
       [  25,  154]], dtype=int64)