In [269]:
import numpy as np
import pandas as pda
from sklearn.model_selection import train_test_split as tts
import re
import warnings
warnings.simplefilter('ignore')

In [271]:
Msg = pda.read_csv('/SMSSpamCollection',sep = '\t', header = None, names=['spam/ham','SMS'])

In [272]:
def Training_Test_set(Msg):
  train_set,test_set = tts(Msg,test_size = 0.2, random_state = 60)
  train_set['SMS'] = train_set['SMS'].str.replace('\W', ' ')
  train_set['SMS'] = train_set['SMS'].str.lower()
  test_set['SMS'] = test_set['SMS'].str.replace('\W', ' ')
  test_set['SMS'] = test_set['SMS'].str.lower()
  return test_set,train_set

In [273]:
test,train=Training_Test_set(Msg)

In [274]:
def make_words(train):
  vocabulary = []
  for msg in train['SMS']:
      for word in msg:
          vocabulary.append(word)
  vocabulary = list(set(vocabulary))
  return vocabulary

In [275]:
train['SMS'] = train['SMS'].str.split()
vocabulary=make_words(train)

In [276]:
wcp_msg = {uniqueWord: [0]*len(train['SMS']) for uniqueWord in vocabulary}
for idx, msg in enumerate(train['SMS']):
    for word in msg:
        wcp_msg[word][idx] +=1
wc = pda.DataFrame(wcp_msg)

In [277]:
final_train = pda.concat([train,wc],axis = 1)

# MULTINOMIAL NAIVE BAYES CLASSIFIER

In [278]:
spamMsgs = final_train[final_train['spam/ham']=='spam']
hamMsgs = final_train[final_train['spam/ham']=='ham']

In [279]:
pSpam = len(spamMsgs)/len(final_train)
# print(pSpam)
pHam = len(hamMsgs)/len(final_train)

In [280]:
noOfWordsPerSpamMsgs = spamMsgs['SMS'].apply(len)
noSpam = noOfWordsPerSpamMsgs.sum()
noOfWordsPerHamMsgs = hamMsgs['SMS'].apply(len)
noHam = noOfWordsPerHamMsgs.sum()
noOfvocabulary = len(vocabulary)
alpha = 1

In [281]:
paramSpam = {uniqueWord: 0 for uniqueWord in vocabulary}
paramHam = {uniqueWord: 0 for uniqueWord in vocabulary}

In [282]:
for word in vocabulary:
    noOfWordsGivenSpam = spamMsgs[word].sum()
    probOfWordsGivenSpam = (noOfWordsGivenSpam + alpha)/(noSpam + alpha*noOfvocabulary)
    paramSpam[word] = probOfWordsGivenSpam
    noOfWordsGivenHam = hamMsgs[word].sum()
    probOfWordsGivenHam = (noOfWordsGivenHam + alpha)/(noHam + alpha*noOfvocabulary)
    paramHam[word] = probOfWordsGivenHam

In [283]:
def classifytest(msg):
    msg = re.sub('\W', ' ', msg)
    msg = msg.lower().split()
    
    probOfSpamGivenMsg = pSpam
    probOfHamGivenMsg = pHam
    
    for word in msg:
        if word in paramSpam:
            probOfSpamGivenMsg *= paramSpam[word]
            
        if word in paramHam:
            probOfHamGivenMsg *= paramHam[word]
            
        if probOfSpamGivenMsg > probOfHamGivenMsg:
            return 'spam'
        elif probOfHamGivenMsg > probOfSpamGivenMsg:
            return 'ham'
        else:
            return 'need human classification'

In [284]:
test['pred'] = test['SMS'].apply(classifytest)

In [285]:
right = 0
tot = test.shape[0]

for row in test.iterrows():
    row = row[1]
    if(row['spam/ham']==row['pred']):
        right +=1
        
print('Correct: ', right)
print('Wrong: ',tot-right)
print('Accuracy: ',right/tot)
print('Error: ', 1 -(right/tot))

Right:  959
Wrong:  156
Accuracy:  0.8600896860986547
Error:  0.13991031390134534


# Gaussian Discriminant Analysis 

In [287]:
trainingY = final_train['spam/ham'].copy()
trainingX = final_train[final_train.columns[2:]]

In [288]:
trainingY = trainingY.to_numpy()
trainingX = trainingX.to_numpy()

In [289]:
X_one = []
X_zero = []

for i in range(len(trainingY)):
    if trainingY[i] == 'spam':
        X_one.append(trainingX[i])
    elif trainingY[i] == 'ham':
        X_zero.append(trainingX[i])

In [290]:
phi = float(len(X_one)/(len(X_one)+len(X_zero)))
mu0 = np.sum(np.matrix(X_zero),axis = 0)/len(X_zero)
mu1 = np.sum(np.matrix(X_one),axis = 0)/len(X_one)
print("Phi = ", phi)
print("mu0 = ", mu0)
print("mu1 = ", mu1)

Phi =  0.13260040385909805
mu0 =  [[nan nan nan ... nan nan nan]]
mu1 =  [[nan nan nan ... nan nan nan]]


In [291]:
sigma = np.zeros((trainingX.shape[1],trainingX.shape[1]))
sigma0 = np.zeros((trainingX.shape[1],trainingX.shape[1]))
sigma1 = np.zeros((trainingX.shape[1],trainingX.shape[1]))


In [None]:
for i in range(len(trainingX)):
   if(trainingY[i] == 'spam'):
       sigma1 += np.dot(np.transpose(trainingX[i]-mu1),trainingX[i]-mu1)
   elif trainingY[i]=='ham':
       sigma0 += np.dot(np.transpose(trainingX[i]-mu0),trainingX[i]-mu0)

In [None]:
sigma = (sigma1 + sigma0)/(len(X_one)+len(X_zero))
sigma0 /= len(X_one)
sigma1 /= len(X_zero)

In [None]:
def probFunction(x,mu,sigma):
   m = len(x)
   if m == mu.shape[1] and (m,m) == sigma.shape:
       deter = np.linalg.det(sigma)
       assert deter!=0, "matrix cannot be singular"
        
       temp = 1.0/(np.power((2*np.pi),float(m)/2)*np.power(deter,1.0/2))
       xmu = np.matrix(x-mu)
       siginv = inv(sigma)
       res = np.power(np.e,-0.5*(np.dot(np.dot(xmu,siginv),np.transpose(xmu))))
       return res*temp

In [None]:
test['SMS'] = test['SMS'].str.split()

vocabulary = []
for msg in test['SMS']:
    for word in msg:
        vocabulary.append(word)
        
vocabulary = list(set(vocabulary))

In [None]:
wcp_msg = {uniqueWord: [0]*len(test['SMS']) for uniqueWord in vocabulary}

for idx, msg in enumerate(test['SMS']):
    for word in msg:
        wcp_msg[word][idx] +=1

In [None]:
wc = pda.DataFrame(wcp_msg)

In [None]:
testFinal = pda.concat([test,wc],axis = 1)

In [None]:
testingY = testFinal['spam/ham'].copy()
testingX = testFinal[testFinal.columns[2:]]

In [None]:
testingY = testingY.to_numpy()
testingX = testingX.to_numpy()

In [None]:
predictedY = []
for x in testingX:
   pa = probFunction(x,np.squeeze(mu0),np.matrix(sigma0))
   pc = probFunction(x,np.squeeze(mu1),np.matrix(sigma1))
   if (pa<pc):
       predictedY.append('spam')
   else:
       predictedY.append('ham')

In [None]:
count = 0
for i in range(len(testingY)):
   if(predictedY[i]==testingY[i]):
       count = count + 1
        
accuracy = np.multiply(np.divide(count,len(testingY)),100)
print(accuracy)