In [None]:
# Uncomment the line below and set it to the path of the dataset file
# data_path = ''

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw corpus: a tab-separated file read without a header row, where the
# first field is named 'Label' and the second 'SMS'.
data = pd.read_csv(
    data_path,
    names=['Label', 'SMS'],
    sep='\t',
)
data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Data cleaning (same preprocessing as the Multinomial NB classifier)

# Remove punctuation: the character class [^\w\s] matches any character that is
# neither a word character (\w) nor whitespace (\s). The pattern is written as a
# raw string so the backslashes reach the regex engine unchanged instead of being
# parsed as (invalid) string escape sequences.
sms_without_punctuation = data['SMS'].str.replace(r'[^\w\s]', '', regex=True)

# Convert to lower case, then split on whitespace so that every message becomes
# a list of word tokens.
data['SMS'] = sms_without_punctuation.str.lower().str.split()

data.head()

Unnamed: 0,Label,SMS
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [None]:
# Train/test split (same procedure as the Multinomial NB classifier)

# Shuffle all rows; the fixed random_state makes the shuffle repeatable.
randomized_data = data.sample(frac=1, random_state=1)

# Row position of the cut: the first 80% of the shuffled rows form the training set.
training_test_index = round(0.8 * len(randomized_data))

# Slice by position and renumber each part from zero.
training_data = randomized_data.iloc[:training_test_index].reset_index(drop=True)
testing_data = randomized_data.iloc[training_test_index:].reset_index(drop=True)

for split_name, split_frame in (("Training Data:", training_data), ("Testing Data:", testing_data)):
    print(split_name, split_frame.shape)

Training Data: (4458, 2)
Testing Data: (1114, 2)


In [None]:
# Vocabulary: every distinct word in the corpus, as a sorted list.
# - A set comprehension is used instead of `data['SMS'].sum()`, which adds the
#   token lists together one at a time and re-copies the growing list at every step.
# - Sorting fixes the column order, so results do not depend on the iteration
#   order of a set of strings.
# NOTE(review): the vocabulary is drawn from `data` (training + testing messages),
# not only from `training_data`, so words that occur only in test messages still
# get a model column — confirm this is intended.
vocabulary = sorted({word for sms in data['SMS'] for word in sms})

# dictionary: each key-value pair will correspond to a dataframe column
# (one all-zero list per word, with one entry per training message)
word_counts_per_sms = {unique_word: [0] * len(training_data['SMS']) for unique_word in vocabulary}

# The major difference between the Bernoulli and the Multinomial implementation is
# how the P_k,xi matrix is built for each class and each feature:
# - Bernoulli records only WHETHER a word occurs in a sample:
#     P_k,xi = (#1s in the column) / (number of rows)
# - Multinomial records HOW MANY TIMES a word occurs in a sample:
#     P_k,xi = (sum of occurrences of event i) / (sum of the table)
for index, sms in enumerate(training_data['SMS']):
    for word in sms:
        # presence flag only: repeated occurrences in one message still store 1
        word_counts_per_sms[word][index] = 1

word_counts = pd.DataFrame(word_counts_per_sms)

# Combine the presence columns with the training data (rows align on the 0..n-1 index)
training_data_clean = pd.concat([training_data, word_counts], axis=1)
training_data_clean.head()

Unnamed: 0,Label,SMS,challenging,hospitals,regular,snowboarding,11,rebooting,male,worldgnun,...,newest,83110,aspects,influx,admiti,sufficient,knew,peaceful,like,weaknesses
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, theres, a, c...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Building the Bernoulli NB

# Isolating spam and ham messages
spam_messages = training_data_clean[training_data_clean['Label'] == 'spam']
ham_messages = training_data_clean[training_data_clean['Label'] == 'ham']

# Computing total number of samples in each class
n_spam = len(spam_messages)
n_ham = len(ham_messages)

# Compute Prior: P(Spam) and P(Ham)
p_spam = n_spam / len(training_data_clean)
p_ham = n_ham / len(training_data_clean)

# Laplace smoothing
alpha = 1


def _word_presence_likelihoods(class_messages, words, smoothing):
    """Return {word: P(word present | class)} for one class, with additive smoothing.

    Parameters:
        class_messages: DataFrame holding the rows of a single class, with one 0/1
            presence column per entry of `words`.
        words: iterable of column names (the vocabulary).
        smoothing: additive smoothing constant (1 gives Laplace smoothing).

    Each word is a binary feature (present / absent), so the smoothing constant is
    added once to the numerator and once per outcome — i.e. twice — to the
    denominator: (count + smoothing) / (n_messages + 2 * smoothing).
    """
    denominator = len(class_messages) + 2 * smoothing
    return {
        word: (class_messages[word].sum() + smoothing) / denominator
        for word in words
    }


# Constructing the P_k,xi matrix: one likelihood per vocabulary word and class
parameters_spam = _word_presence_likelihoods(spam_messages, vocabulary, alpha)
parameters_ham = _word_presence_likelihoods(ham_messages, vocabulary, alpha)

In [None]:
# classify function
def classify(message):
  """Label a tokenised message as 'ham' or 'spam' with the Bernoulli Naive Bayes model.

  Parameters:
    message: iterable of word tokens (e.g. the list stored in the 'SMS' column).

  Returns:
    'ham' if the ham score is greater than or equal to the spam score
    (ties go to 'ham'), otherwise 'spam'.

  Reads the module-level names `vocabulary`, `p_spam`, `p_ham`,
  `parameters_spam` and `parameters_ham`.

  Unlike the Multinomial model, the Bernoulli model scores EVERY vocabulary word:
  a word that is present contributes P(word | class) and a word that is absent
  contributes 1 - P(word | class). The scores are accumulated as sums of
  logarithms rather than products of probabilities, because multiplying one
  factor per vocabulary word can underflow to 0.0 for both classes, which
  would make the comparison meaningless.
  """
  import math  # local import: keeps the function usable without touching the import cell

  def safe_log(probability):
    # log(0) is undefined; score an impossible event as -inf so comparisons still work
    return math.log(probability) if probability > 0 else float('-inf')

  # set membership is O(1); testing against the token list would rescan it per word
  words_in_message = set(message)

  log_p_spam_given_message = safe_log(p_spam)   # spam prior
  log_p_ham_given_message = safe_log(p_ham)     # ham prior

  for word in vocabulary:
    if word in words_in_message:
      # success probability: the word occurs in the message
      log_p_spam_given_message += safe_log(parameters_spam[word])
      log_p_ham_given_message += safe_log(parameters_ham[word])
    else:
      # failure probability: the word does not occur in the message
      log_p_spam_given_message += safe_log(1 - parameters_spam[word])
      log_p_ham_given_message += safe_log(1 - parameters_ham[word])

  if log_p_ham_given_message >= log_p_spam_given_message:
    return 'ham'
  return 'spam'

In [None]:
# Evaluate on the held-out messages: predict a label for every message and
# flag each prediction as correct (1) or incorrect (0).
testing_data['Predicted'] = testing_data['SMS'].apply(classify)
testing_data['Correct'] = testing_data['Predicted'].eq(testing_data['Label']).astype(int)
testing_data.head()

# Accuracy = correctly labelled messages / all test messages
total = len(testing_data)
correct = testing_data['Correct'].sum()

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1089
Incorrect: 25
Accuracy: 0.9775583482944344
