# Notebook Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Constants

In [2]:

# Test-set inputs: feature matrix and target labels.
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Per-token probability files: given spam, given non-spam, and overall.
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

# Presumably the number of tokens in the vocabulary (one feature column per token).
VOCAB_SIZE = 2500

# Load the Data

In [3]:
# Test features and their target labels, read as space-delimited text.
X_test, y_test = (
    np.loadtxt(path, delimiter=' ')
    for path in (TEST_FEATURE_MATRIX, TEST_TARGET_FILE)
)

# Per-token probabilities: given spam, given ham, and overall.
prob_token_spam, prob_token_ham, prob_all_tokens = (
    np.loadtxt(path, delimiter=' ')
    for path in (TOKEN_SPAM_PROB_FILE, TOKEN_HAM_PROB_FILE, TOKEN_ALL_PROB_FILE)
)

# Calculating the Joint Probability

### The Dot Product

In [4]:
# Dimensions of the test feature matrix as (rows, columns).
X_test.shape

(1724, 2500)

In [5]:
# Length of the spam token-probability vector; it must equal the number of
# columns of X_test for the dot product to be defined.
prob_token_spam.shape

(2500,)

In [6]:
# Shape check for the matrix-vector product. X_test is (1724, 2500) and
# prob_token_spam is (2500,), so the 2500 columns of the matrix line up with
# the 2500 entries of the vector, giving one value per row: (1724,).
# The reverse order, prob_token_spam.dot(X_test), does not work: the vector's
# length (2500) would have to match the matrix's row count (1724).
np.dot(X_test, prob_token_spam).shape

(1724,)

## Set the Prior

## $$ P(Spam \, | \, X) =  \frac{P(X \, | \, Spam \,) \, P(Spam)} {P(X)}$$

<p> A prior is a belief or guess about a quantity, made before looking at the evidence. In our case the "guess" has already been calculated: the probability of spam, P(Spam) = 0.3109. </p>

In [7]:
# Prior probability that an email is spam, P(Spam); the ham prior is 1 - PROB_SPAM.
PROB_SPAM = 0.3109

In [8]:
# Element-wise natural log of the spam token probabilities (preview only;
# the result is displayed, not stored).
np.log(prob_token_spam)

array([ -4.40768141,  -5.25375622,  -4.99015865, ...,  -9.79171765,
        -9.52935339, -10.70800838])

## Joint Probability in Log Format

In [53]:
# Bayes' rule in log space: products become sums and the division becomes a
# subtraction. Dividing by P(X) turns into subtracting log(prob_all_tokens)
# from log(prob_token_spam), and multiplying by P(Spam) turns into adding
# log(PROB_SPAM).
joint_log_spam = np.log(PROB_SPAM) + np.dot(
    X_test, np.log(prob_token_spam) - np.log(prob_all_tokens)
)
joint_log_spam[:5]

array([24.26609396,  2.15741368, 20.58593663, 17.73868148, 20.49790994])

## $$ P(Ham \, | \, X) =  \frac{P(X \, | \, Ham \,) \, (1 - P(Spam) )} {P(X)}$$

In [16]:
# Same log-space calculation for the ham class: the token log probabilities
# given ham minus log(prob_all_tokens), plus the log of the ham prior
# (1 - PROB_SPAM).
joint_log_ham = np.log(1 - PROB_SPAM) + np.dot(
    X_test, np.log(prob_token_ham) - np.log(prob_all_tokens)
)
joint_log_ham[:5]

array([-60.9615197 , -11.00803242, -37.96485145, -59.12449324,
       -53.78250218])

In [18]:
# Number of ham scores; should equal the number of rows in X_test.
joint_log_ham.size

1724

In [19]:
# Number of spam scores; should equal joint_log_ham.size.
joint_log_spam.size

1724

# Making Predictions

### Checking for the Higher Joint Probability

$$ P(Spam \, | \, X) >  P(Ham \, | \, X) $$
<br>
<center><b> OR </b></center>
<br>
$$ P(Spam \, | \, X) <  P(Ham \, | \, X) $$

In [43]:
# Classify every test email with one vectorized comparison:
#   1 (spam) where the spam joint log probability is strictly greater,
#   0 (ham)  otherwise.
# The length follows the score arrays instead of a hard-coded email count,
# and every email gets exactly one label: a tie (or a NaN score) is labelled
# 0 rather than being left out, so `prediction` always stays aligned
# element-for-element with y_test.
prediction = (joint_log_spam > joint_log_ham).astype(int)


In [44]:
# Make sure the predictions are a NumPy array so they can be sliced and
# compared element-wise with y_test.
prediction = np.array(prediction)

In [45]:
# Sanity check: there should be one prediction per test email.
len(prediction)

1724

In [46]:
# Last five predicted labels (1 = spam, 0 = ham).
prediction[-5:]

array([1, 0, 0, 0, 0])

In [47]:
# Target labels of the last five test emails, for comparison with the
# predictions for those same emails.
y_test[-5:]

array([0., 0., 0., 0., 0.])

### Simplifying

In [54]:
# Simplified scores without the P(X) term. Both classes previously subtracted
# the same log(prob_all_tokens), so dropping it does not change which of the
# two scores is larger.
# NOTE: this overwrites the joint_log_spam / joint_log_ham values computed earlier.
joint_log_spam = np.log(PROB_SPAM) + np.dot(X_test, np.log(prob_token_spam))
joint_log_ham = np.log(1 - PROB_SPAM) + np.dot(X_test, np.log(prob_token_ham))

<p> The simplification above is valid because we only need to decide whether an email is spam or not, i.e. which of the two joint probabilities is larger. P(X), which was represented by "- np.log(prob_all_tokens)", is the same term in both the spam and the ham calculation, so it does not affect the comparison. Removing that part of the code therefore won't break the prediction. </p>