In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random

from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
train_feature = pd.read_csv("spam_polluted/train_feature.txt", delim_whitespace=True, header=None) 
test_feature = pd.read_csv("spam_polluted/test_feature.txt", delim_whitespace=True, header=None) 
train_label = pd.read_csv("spam_polluted/train_label.txt", delim_whitespace=True, header=None) 
y_test = pd.read_csv("spam_polluted/test_label.txt", delim_whitespace=True, header=None) 

In [3]:
WholeDf = pd.concat([train_feature, test_feature])

In [4]:
WholeDf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.041051,0.068515,0.014376,0.068351,0.032469,0.014087,0.034152,0.051189,0.063388,0.043658
1,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.062413,0.050598,0.071449,0.034827,0.037157,0.051147,0.067859,0.05222,0.004742,0.009583
2,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.019171,0.045824,0.011757,0.03153,0.03275,0.073789,0.0459,0.07275,0.040348,0.029986
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.010594,0.013711,0.054859,0.004493,0.053976,0.029885,0.063413,0.070903,0.02612,0.008427
4,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.055207,0.071441,0.058734,0.058241,0.034914,0.018111,0.019574,0.009803,0.065727,0.058667


In [5]:
pca = PCA(n_components=101)

In [6]:
principalComponents = pca.fit_transform(WholeDf)
principalDf = pd.DataFrame(data = principalComponents)

In [7]:
principalDf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,742.633046,-74.001388,-0.568021,-1.23888,0.584801,-0.142957,-0.20346,1.893682,0.499048,-0.054548,...,-0.027573,0.181044,0.095589,0.121102,-0.132679,0.058079,0.009815,0.030656,0.016074,0.230822
1,2019.854515,102.077282,-23.77719,0.14438,-0.382718,-0.156707,-0.436914,-0.252668,-0.306026,-0.231057,...,-0.26962,0.116605,-0.292031,-0.014378,0.010521,0.0933,0.079933,0.033213,0.185213,0.057727
2,-93.047269,3.008532,-1.043626,-1.761758,0.806475,-0.363198,0.145956,1.253429,0.217177,-0.039102,...,0.182783,0.10589,-0.072829,0.013142,-0.023765,0.064243,0.157913,0.204409,-0.060775,-0.036216
3,-93.047352,3.008512,-1.044157,-1.618446,0.828785,-0.31529,-0.025347,1.236253,0.214051,-0.0325,...,-0.208732,-0.150122,-0.134568,0.06609,-0.133646,-0.030801,0.027016,0.030771,0.05559,0.002208
4,-232.290285,0.835958,-0.049379,-1.69411,0.530321,-0.23403,0.456226,-1.350935,-1.315274,-0.169406,...,0.086954,-0.177824,-0.049839,-0.12271,-0.02768,-0.07074,0.093258,-0.199852,-0.086048,-0.090583


In [8]:
X_train = principalDf.iloc[0:4140]
X_test = principalDf.iloc[4140:]

In [9]:
y_train = train_label.rename(columns={0: "label"})

In [10]:
finalDf = pd.concat([X_train, y_train], axis = 1)

In [11]:
num_spam = finalDf['label'][finalDf['label'] == 1].count()
num_non_spame = finalDf['label'][finalDf['label'] == 0].count()
total = len(finalDf)

print('Spam:',num_spam)
print('Non-spam ',num_non_spame)
print('Total: ',total)

Spam: 1631
Non-spam  2509
Total:  4140


In [12]:
prob_spam = num_spam/total
print('Probability spam: ',prob_spam)

prob_non_spam = num_non_spame/total
print('Probability non-spam: ',prob_non_spam)

Probability spam:  0.3939613526570048
Probability non-spam:  0.6060386473429952


In [13]:

data_mean = finalDf.groupby('label').mean()

data_variance = finalDf.groupby('label').var()*(1/6)


In [14]:
def prob_x_y(x, mean_y, variance_y):
    prob = 1/(np.sqrt(2*np.pi*variance_y)) * np.exp((-(x-mean_y)**2)/(2*variance_y))
    return prob

In [15]:
y_pred = []

In [16]:
for row in range(0,len(X_train)):
        prod_0 = prob_non_spam
        prod_1 = prob_spam
        for col in X_train.columns:   
            prod_0 *= prob_x_y(X_train[col].iloc[row], data_mean[col][0], data_variance[col][0])
            prod_1 *= prob_x_y(X_train[col].iloc[row], data_mean[col][1], data_variance[col][1])
    

        if prod_0 > prod_1:
            y_pred.append(0)
        else:
            y_pred.append(1)


In [17]:
np.array(y_train)

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [18]:
np.mean(y_pred== y_train)

label    0.828261
dtype: float64