In [47]:
#### downloading spam dataset #####

!wget https://lazyprogrammer.me/course_files/spam.csv


--2023-05-08 22:25:50--  https://lazyprogrammer.me/course_files/spam.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [text/csv]
Saving to: ‘spam.csv.2’


2023-05-08 22:25:50 (9.63 MB/s) - ‘spam.csv.2’ saved [503663/503663]



In [22]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [8]:
spam_df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [10]:
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
spam_df = spam_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [17]:
##### all unique labels in the dataset ######
set(spam_df['v1'])


{'ham', 'spam'}

In [48]:
map_dict = {'ham': 0, 'spam': 1}
spam_df['labels'] = spam_df['v1'].map(map_dict)

In [19]:
spam_df.head()

Unnamed: 0,v1,v2,labels
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [20]:
input_text = spam_df['v2']
label = spam_df['labels']

In [25]:
##### dividing into test and train #######

train_text, test_text, Ytrain, Ytest = train_test_split(input_text, label)

In [26]:
len(Ytrain), len(Ytest)

(4179, 1393)

In [49]:
##### starting point for word indexing #####
##### we are setting aside 0 for unknown values, i.e. the tokens that are present in 
##### test but not in train. 

idx = 1
word2idx = {'<unk>': 0}



In [28]:


# populate word2idx
for text in train_text:
    tokens = text.split()
    for token in tokens:
      if token not in word2idx:
        word2idx[token] = idx
        idx += 1



In [29]:
len(word2idx)

13086

In [30]:


# convert data into integer format
train_text_int = []
test_text_int = []

for text in train_text:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens = text.split()
  line_as_int = [word2idx.get(token, 0) for token in tokens]
  test_text_int.append(line_as_int)



In [31]:
### looking at sample converted data ###

train_text_int[100:105]

[[176, 114, 123, 924, 32, 236],
 [925,
  59,
  58,
  549,
  926,
  136,
  9,
  927,
  928,
  929,
  206,
  930,
  931,
  220,
  6,
  58,
  814,
  932,
  14,
  104,
  933,
  24,
  311,
  934,
  51,
  935,
  644,
  936,
  32,
  937,
  51,
  938],
 [939, 690, 14, 181, 940],
 [140, 166, 941, 181, 942, 32, 943, 944, 945],
 [620, 7, 26, 946, 9, 947, 948, 351, 502, 51]]

In [32]:
# initialize A and pi matrices - for both classes. The number of A and pi matrices depends on the number of classes or categories we have #
V = len(word2idx)

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [33]:


# compute counts for A and pi
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        # it's the first word in a sentence
        pi[idx] += 1
      else:
        # the last word exists, so count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx


compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 1], A1, pi1)



In [34]:


# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown before
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()



In [35]:


# log A and pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)



In [36]:
# compute priors for both categories #


count0 = sum(y == 0 for y in Ytrain)
count1 = sum(y == 1 for y in Ytrain)
total = len(Ytrain)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.8676716917922948, 0.1323283082077052)

In [37]:


# build a classifier
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions



In [50]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [39]:
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == Ytrain)}")

Train acc: 0.9978463747307968


In [40]:


Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == Ytest)}")



Test acc: 0.9576453697056713


In [41]:
from sklearn.metrics import confusion_matrix, f1_score

In [42]:
cm = confusion_matrix(Ytrain, Ptrain)
cm

array([[3626,    0],
       [   9,  544]])

In [43]:
cm = confusion_matrix(Ytest, Ptest)
cm

array([[1199,    0],
       [  59,  135]])

In [44]:
f1_score(Ytrain, Ptrain)

0.99179580674567

In [45]:
f1_score(Ytest, Ptest)

0.8206686930091185