## Notebook Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Constants

In [2]:
# Directory shared by every file referenced below; change it here if the
# data is moved.
TESTING_DIR = 'SpamData/03_Testing'

# Per-token probability files
TOKEN_SPAM_PROB_FILE = TESTING_DIR + '/prob-spam.txt'
TOKEN_HAM_PROB_FILE = TESTING_DIR + '/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = TESTING_DIR + '/prob-all-tokens.txt'

# Test set: feature matrix and target values
TEST_FEATURE_MATRIX = TESTING_DIR + '/test-feature.txt'
TEST_TARGET_FILE = TESTING_DIR + '/test-target.txt'

# NOTE(review): presumably the number of token columns in the feature
# matrix -- confirm against the preprocessing notebook.
VOCAB_SIZE = 2500

## Load the Data

In [3]:
# Test set: feature matrix first, then the matching target values
X_test, y_test = (
    np.loadtxt(path, delimiter=' ')
    for path in (TEST_FEATURE_MATRIX, TEST_TARGET_FILE)
)

# Token probabilities: spam, non-spam, and all tokens (one array per file)
prob_token_spam, prob_token_ham, prob_all_tokens = (
    np.loadtxt(path, delimiter=' ')
    for path in (TOKEN_SPAM_PROB_FILE, TOKEN_HAM_PROB_FILE, TOKEN_ALL_PROB_FILE)
)

In [4]:
# Peek at the first five rows of the test feature matrix
X_test[:5]

array([[0., 0., 1., ..., 0., 0., 0.],
       [6., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 2., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Calculating the Joint Probability

### The Dot Product

In [5]:
# Two small example vectors used to illustrate the dot product
a, b = np.array([1, 2, 3]), np.array([0, 5, 4])

print('a = ', a)
print('b = ', b)

a =  [1 2 3]
b =  [0 5 4]


In [6]:
# Dot product of a and b: multiply matching entries, then sum
a.dot(b)

22

In [7]:
# The same dot product written out by hand, entry by entry
1*0 + 2*5 + 3*4

22

In [8]:
# A 3x2 matrix: its three rows line up with the three entries of `a`
c = np.array([
    [0, 6],
    [3, 0],
    [5, 1],
])

print('Shape of c is', c.shape)
print(c)

Shape of c is (3, 2)
[[0 6]
 [3 0]
 [5 1]]


In [9]:
# Vector (3,) dotted with matrix (3, 2) gives one value per matrix column
a_dot_c = a.dot(c)

print(a_dot_c)
print('Shape of dot product is', a_dot_c.shape)

[21  9]
Shape of dot product is (2,)


In [10]:
# Dimensions of the test feature matrix (rows, columns)
X_test.shape

(1724, 2500)

In [11]:
# Length of the spam token-probability vector; it has to equal the column
# count of X_test for the dot product in the next cell to be defined
prob_token_spam.shape

(2500,)

In [12]:
# Each row of X_test is dotted with the probability vector, so the result
# holds one value per row
print('Shape of dot product is', X_test.dot(prob_token_spam).shape)

Shape of dot product is (1724,)


## Set the Prior

$$P(Spam \, | \, X) = \frac{P(X \, | \, Spam \,) \, P(Spam)} {P(X)}$$

In [17]:
# Prior P(Spam) used in Bayes' rule above.
# NOTE(review): 0.3116 is hard-coded with no source given -- presumably the
# share of spam in the training data; confirm and consider moving it to the
# Constants cell.
PROB_SPAM = 0.3116
# Complementary prior: P(Ham) = 1 - P(Spam)
PROB_HAM = 1 - PROB_SPAM

In [18]:
# Element-wise natural log of the spam token probabilities; the cells below
# work with these logs rather than with the raw probabilities
np.log(prob_token_spam)

array([-4.42147148, -5.26555906, -5.0005684 , ..., -9.80458944,
       -9.62226789, -9.21680278])

## Joint probability in log format

In [19]:
# Per-token log ratio: log P(token | Spam) - log P(token)
log_ratio_spam = np.log(prob_token_spam) - np.log(prob_all_tokens)

# Weight the log ratios by each row of X_test, then add the log prior
joint_log_spam = X_test.dot(log_ratio_spam) + np.log(PROB_SPAM)

In [20]:
# Log-scale spam scores for the first five test rows
joint_log_spam[:5]

array([22.33592563,  1.96010191, 17.80251413, 18.20771333, 19.52044099])

$$P(Ham \, | \, X) = \frac{P(X \, | \, Ham \,) \, (1 - P(Spam))} {P(X)}$$

In [21]:
# Per-token log ratio: log P(token | Ham) - log P(token)
log_ratio_ham = np.log(prob_token_ham) - np.log(prob_all_tokens)

# Weight the log ratios by each row of X_test, then add the log prior
joint_log_ham = X_test.dot(log_ratio_ham) + np.log(PROB_HAM)

In [22]:
# Log-scale ham scores for the first five test rows
joint_log_ham[:5]

array([-58.92961631, -10.85232723, -34.74870535, -59.01385325,
       -53.0736597 ])

In [23]:
# Sanity check: one score per row of X_test
joint_log_spam.shape

(1724,)

## Making Predictions

### Checking for the higher joint probability

$$P(Spam \, | \, X) \, > \, P(Ham \, | \, X)$$
<br>
<center><b>OR</b></center>
<br>
$$P(Spam \, | \, X) \, < \, P(Ham \, | \, X)$$

In [24]:
# Boolean array: True wherever the spam score is higher than the ham score
prediction = joint_log_spam > joint_log_ham

In [25]:
# Predictions for the last five test rows
prediction[-5:]

array([ True, False, False, False, False])

In [27]:
# Target values for the same last five rows, to compare with the predictions
# NOTE(review): presumably 1 = spam and 0 = non-spam -- confirm the encoding
y_test[-5:]

array([0., 0., 0., 0., 0.])

## Simplification

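Dropping the denominator changes the value of each expression, but $P(X)$ is the same for both classes, so comparing the numerators alone gives the same prediction:

$$P(X \, | \, Spam) \, P(Spam) \neq \frac{P(X \, | \, Spam) \, P(Spam)}{P(X)}$$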

In [28]:
# Same scores without the P(X) term.
# NOTE: this rebinds joint_log_spam / joint_log_ham from the earlier cells.
log_prior_spam, log_prior_ham = np.log(PROB_SPAM), np.log(PROB_HAM)

joint_log_spam = X_test.dot(np.log(prob_token_spam)) + log_prior_spam
joint_log_ham = X_test.dot(np.log(prob_token_ham)) + log_prior_ham