# Notebook Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Constants

In [100]:
TRAINING_DATA_FILE = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/02_Training/train-data.txt'
TEST_DATA_FILE = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/02_Training/test-data.txt'

TOKEN_SPAM_PROB_FILE = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/03_Testing/prob-all.txt'

TEST_FEATURE_MATRIX = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/03_Testing/test-features.txt'
TEST_TARGET_MATRIX = '/content/drive/MyDrive/Machine Learning /Naive Bayes Spam Filter/SpamData/03_Testing/test-target.txt'

VOCAB_SIZE = 2500

# Read and Load Features from .txt files into Numpy arrays

In [4]:
# load and save training data into numpy arrays

sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, # name of data file
                               delimiter=' ', # character that sets boundaries between plain text data, we are using a single whitespace
                               dtype=int) # type of character, in our case, its integer

In [5]:
sparse_train_data.shape

(260878, 4)

In [6]:
# load and save test data into numpy arrays

sparse_test_data = np.loadtxt(TEST_DATA_FILE, # name of data file
                               delimiter=' ', # character that sets boundaries between plain text data, we are using a single whitespace
                               dtype=int) # type of character, in our case, its integer

My data here is a bit messed up so you can ignore the next few cells where I am fixing the order in my test numpy array

In [7]:
sparse_test_data.shape

(193872, 5)

In [8]:
sparse_train_data[:5]

array([[ 0,  0,  1,  3],
       [ 0,  2,  1,  1],
       [ 0,  6,  1,  1],
       [ 0,  9,  1,  1],
       [ 0, 11,  1,  1]])

In [9]:
sparse_train_data[-5:]

array([[5795,  331,    0,    1],
       [5795,  344,    0,    1],
       [5795,  367,    0,    1],
       [5795,  388,    0,    1],
       [5795,  499,    0,    1]])

In [10]:
sparse_test_data[:5]

array([[   0,    0, 4675,    1, 1099],
       [   1,    0, 4675,    1, 2244],
       [   2,    0, 4675,    1,  138],
       [   3,    0, 4675,    1, 1099],
       [   4,    0, 4675,    1,  893]])

In [11]:
sparse_test_data = np.delete(sparse_test_data, 0, 1)

In [12]:
sparse_test_data

array([[   0, 4675,    1, 1099],
       [   0, 4675,    1, 2244],
       [   0, 4675,    1,  138],
       ...,
       [   0, 4354,    1,    5],
       [   0, 4354,    1,    2],
       [   0, 4354,    1,    0]])

In [13]:
sparse_test_data[:, [0, 1]] = sparse_test_data[:, [1, 0]] 

In [14]:
sparse_test_data[:, [1, 3]] = sparse_test_data[:, [3, 1]]

In [15]:
sparse_test_data[:, [2, 3]] = sparse_test_data[:, [3, 2]]

In [16]:
sparse_test_data

array([[4675, 1099,    0,    1],
       [4675, 2244,    0,    1],
       [4675,  138,    0,    1],
       ...,
       [4354,    5,    0,    1],
       [4354,    2,    0,    1],
       [4354,    0,    0,    1]])

In [17]:
sparse_test_data.shape

(193872, 4)

Everything is okay now!

In [18]:
print("Number of Rows in training file: ", sparse_train_data.shape[0])
print("Number of Rows in test file: ", sparse_test_data.shape[0])

Number of Rows in training file:  260878
Number of Rows in test file:  193872


In [19]:
print("Number of emails in training file", np.unique(sparse_train_data[:, 0]).size)
print("Number of emails in test file", np.unique(sparse_test_data[:, 0]).size)

Number of emails in training file 4017
Number of emails in test file 1722


# From Sparse Matrix to Full Matrix

## Initialize an Empty DataFrame

In [20]:
column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))

In [21]:
column_names[:5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [22]:
len(column_names) # should be 2500

2502

In [23]:
index_names = np.unique(sparse_train_data[:, 0])

In [24]:
index_names

array([   0,    1,    2, ..., 5791, 5794, 5795])

In [25]:
full_train_data = pd.DataFrame(index=index_names,
                               columns=column_names)

full_train_data.fillna(value=0, inplace=True) # fill all empty cells with 0

In [26]:
full_train_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Create Full Matrix from Sparse Matrix

In [27]:
def make_full_matrix(sparse_matrix, num_of_words, doc_idx=0, 
                     word_idx=1, cat_idx=2, freq_idx=3):
  
  '''
  Form a full matrix from a sparse matrix

  Keyword arguments:

  sparse_matrix: REQUIRED
                 numpy array sparse matrix

  num_of_words: REQUIRED
                max size of our vocab data (i.e. 2500)

  doc_idx: OPTIONAL
           position of doc_id column in sparse matrix
           Default: doc_idx=0

  word_idx: OPTIONAL
            position of word_id column in sparse matrix
            Default: word_idx=1

  cat_idx: OPTIONAL
           position of cat_id column in sparse matrix
           Default: cat_idx=2

  freq_idx: OPTIONAL
            position of occurence column in sparse matrix
            Default: freq_idx=3

  Return value: pandas DataFrame
  '''

  # first initialize an empty full matrix dataFrame 
  column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
  doc_id_names = np.unique(sparse_matrix[:, 0])

  full_matrix = pd.DataFrame(index=doc_id_names, columns=column_names)
  full_matrix.fillna(value=0, inplace=True)


  for i in range(sparse_matrix.shape[0]): # going row by row

    doc_id = sparse_matrix[i][doc_idx]  # [row i][column doc_idx] 
    word_id = sparse_matrix[i][word_idx] # [row i][column word_idx] 
    label = sparse_matrix[i][cat_idx] # [row i][column cat_idx]
    occurence = sparse_matrix[i][freq_idx] # [row i][column freq_idx]

    # row number will correspond to doc_id
    full_matrix.at[doc_id, 'DOC_ID'] = doc_id # doc_id will go under DOC_ID column
    full_matrix.at[doc_id, 'CATEGORY'] = label # label will go under CATEGORY column
    full_matrix.at[doc_id, word_id] = occurence # occurence will go under WORD_ID column

  full_matrix.set_index('DOC_ID', inplace=True) # set index
  return full_matrix



In [28]:
%%time

full_train_data = make_full_matrix(sparse_matrix=sparse_train_data,
                                   num_of_words=VOCAB_SIZE)

CPU times: user 11.7 s, sys: 139 ms, total: 11.8 s
Wall time: 11.8 s


In [29]:
full_train_data.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,1,3,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,2,0,1,2,0,0,0,0,2,0,1,0,0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,3,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,4,0,1,1,0,1,1,0,0,0,0,0,0,0,0,3,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,8,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,2,5,2,0,0,4,0,0,0,0,3,0,0,0,0,0,6,2,2,0,0,0,2,2,4,0,2,0,2,0,0,0,0,2,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,4,0,0,0,0,1,0,0,0,0,2,2,0,0,0,4,2,0,0,2,0,0,0,2,0,2,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
full_train_data.CATEGORY[5789]

0

# Training the Naive Bayes Model 


## Calculating the Probability of Spam

In [31]:

# Calculating probability of spam

ham_count = full_train_data.CATEGORY.value_counts()[0] # total non-spam emails
spam_count = full_train_data.CATEGORY.value_counts()[1] # total spam emails
total_count = full_train_data.CATEGORY.size # total emails

# spam / (spam + non-spam)

prob_spam = spam_count / total_count

In [32]:
prob_spam

0.3116753796365447

## Total Number of Words (Tokens)

In [33]:
# select all features except CATEGORY
full_train_features = full_train_data.loc[:, full_train_data.columns != 'CATEGORY']

In [34]:
full_train_features

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,3,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,2,0,1,2,0,0,0,0,2,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4,0,1,1,0,1,1,0,0,0,0,0,0,0,0,3,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,8,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,5,2,0,0,4,0,0,0,0,3,0,0,0,0,0,6,2,2,0,0,0,2,2,4,0,2,0,2,0,0,0,0,2,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,4,0,0,0,0,1,0,0,0,0,2,2,0,0,0,4,2,0,0,2,0,0,0,2,0,2,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5789,1,0,1,1,1,0,0,0,0,1,0,0,0,0,2,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,2,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5790,1,1,1,0,0,1,0,0,0,0,0,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5791,2,1,1,1,1,3,0,0,1,4,2,0,2,1,6,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,1,0,0,0,3,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5794,4,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
# Way 1) column wise sum

full_train_features[1].sum()

total_words = 0
total_words_in_spam = 0

for i in range(VOCAB_SIZE):

  total_words += full_train_features[i].sum()

print(total_words)

440099


In [36]:
# Way 2) row wise sum

# sum across columns
emails_length = full_train_features.sum(axis=1)

emails_length.shape



(4017,)

In [38]:
emails_length[:5]

# output

'''
output

0  tokenCount or row 0
1  tokenCount or row 1
2  tokenCount or row 2
3  tokenCount or row 3
.  tokenCount or row 4
.  tokenCount or row 5
n  tokenCount or row n
'''

'\noutput\n\n0  tokenCount or row 0\n1  tokenCount or row 1\n2  tokenCount or row 2\n3  tokenCount or row 3\n.  tokenCount or row 4\n.  tokenCount or row 5\nn  tokenCount or row n\n'

In [39]:
total_words2 = emails_length.sum()

In [40]:
total_words2

440099

In [41]:
spam_emails_length = full_train_data[full_train_data.CATEGORY == 1].sum(axis=1)

In [42]:
ham_emails_length = full_train_data[full_train_data.CATEGORY == 0].sum(axis=1)

In [43]:
spam_emails_length.shape

(1252,)

In [44]:
total_words_in_spam = spam_emails_length.sum()

In [45]:
total_words_in_spam

189162

In [46]:
'''Since we already know the total number of words and 
   total number of words in spam emails, we can just subtract the two
   to get the total number of words in non-spam emails'''

total_words_in_ham = total_words - total_words_in_spam

In [47]:
total_words_in_ham

250937

In [48]:
avg_words_in_spam_emails = total_words_in_spam // spam_emails_length.shape[0]
avg_words_in_ham_emails = total_words_in_ham // ham_emails_length.shape[0]

In [49]:
print("Average number of words in spam emails: ", avg_words_in_spam_emails)
print("Average number of words in non-spam emails: ", avg_words_in_ham_emails)

Average number of words in spam emails:  151
Average number of words in non-spam emails:  90


## Summing the Tokens Occuring in Spam

In [50]:
full_train_features.shape

(4017, 2500)

In [51]:
# extract all emails that are spam messages
train_spam_tokens = full_train_features.loc[full_train_data.CATEGORY == 1]

In [None]:
train_spam_tokens.head()

In [53]:
train_spam_tokens.tail()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1884,3,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1885,3,3,0,1,1,0,0,5,0,0,0,0,1,1,0,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1887,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,2,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1890,2,0,0,0,2,0,0,3,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1895,0,0,1,0,2,1,1,0,3,2,0,3,0,0,0,2,0,1,1,2,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
train_spam_tokens.shape

(1252, 2500)

In [55]:
# Sum each column 
summed_spam_tokens = train_spam_tokens.sum(axis=0)

In [56]:
# LaPlace Smoothing 

'''

LaPlace Smoothing will prevent Zero Probability from occuring
if a word does not exist, we will add 1 so it has AT LEAST a likelihood of 1/n

'''
summed_spam_tokens += 1

In [57]:
summed_spam_tokens.head()

0    2031
1     939
2    1385
3    2078
4    1243
dtype: int64

## Summing the Tokens Occuring in Non-Spam

In [58]:
# extract all emails that are non-spam messages
train_ham_tokens = full_train_features.loc[full_train_data.CATEGORY == 0]

In [59]:
train_ham_tokens.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1896,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1898,2,0,3,1,2,4,0,1,0,2,0,0,2,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,2,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1899,0,2,0,0,0,0,0,0,2,1,3,3,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1900,0,1,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1901,1,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [60]:
train_ham_tokens.shape

(2765, 2500)

In [61]:
summed_ham_tokens = train_ham_tokens.sum(axis=0)

In [62]:
summed_ham_tokens += 1

In [63]:
summed_ham_tokens.tail()

2495    18
2496    18
2497     8
2498     7
2499     2
dtype: int64

## P(Token | Spam) - Probability of Token Occuring Given that Email is Spam

In [64]:
'''

because we implemented LaPlace Smoothing by adding 1, we must also smooth our 
word counts by adding the VOCAB_SIZE. 

increase of 1 in summed tokens = increase of 1 fold of VOCAB_SIZE in total word count
'''

smoothed_spam_wordcount = total_words_in_spam + VOCAB_SIZE

In [65]:
'''
probability of word given that 
it is spam = total spam token count / total words in spam emails
'''

probs_token_spam = summed_spam_tokens / smoothed_spam_wordcount

In [66]:
probs_token_spam[:5]

0    0.010597
1    0.004899
2    0.007226
3    0.010842
4    0.006485
dtype: float64

In [67]:
# probability of all should = 1

probs_token_spam.sum()

0.9934676670388498

## P(Token | Non-Spam) - Probability of Token Occuring Given that Email is Not Spam

In [68]:
'''

because we implemented LaPlace Smoothing by adding 1, we must also smooth our 
word counts by adding the VOCAB_SIZE. 

increase of 1 in summed tokens = increase of 1 fold of VOCAB_SIZE in total word count
'''

smoothed_ham_wordcount = total_words_in_ham + VOCAB_SIZE

In [69]:
'''
probability of word given that 
it is ham = total ham token count / total words in ham emails
'''

probs_token_ham = summed_ham_tokens / smoothed_ham_wordcount

In [70]:
probs_token_ham[:5]

0    0.021623
1    0.010425
2    0.008006
3    0.003472
4    0.006270
dtype: float64

In [71]:
probs_token_ham.sum()

1.0049400837288953

## P(Token) - Probability of Token Occuring

In [72]:
prob_tokens_all = full_train_features.sum(axis=0) / total_words

In [73]:
prob_tokens_all

0       0.017062
1       0.008132
2       0.007753
3       0.006717
4       0.006430
          ...   
2495    0.000045
2496    0.000070
2497    0.000077
2498    0.000075
2499    0.000061
Length: 2500, dtype: float64

In [74]:
prob_tokens_all.head()

0    0.017062
1    0.008132
2    0.007753
3    0.006717
4    0.006430
dtype: float64

In [75]:
prob_tokens_all.sum()

1.0

In [79]:
probs_token_spam

0       0.010597
1       0.004899
2       0.007226
3       0.010842
4       0.006485
          ...   
2495    0.000021
2496    0.000078
2497    0.000146
2498    0.000146
2499    0.000141
Length: 2500, dtype: float64

0       0.021623
1       0.010425
2       0.008006
3       0.003472
4       0.006270
          ...   
2495    0.000071
2496    0.000071
2497    0.000032
2498    0.000028
2499    0.000008
Length: 2500, dtype: float64

# Save the Trained Model 

In [85]:
np.savetxt(TOKEN_SPAM_PROB_FILE,
           probs_token_spam)

np.savetxt(TOKEN_HAM_PROB_FILE,
           probs_token_ham)

np.savetxt(TOKEN_ALL_PROB_FILE,
           prob_tokens_all)

# Prepare Test Data

In [87]:
%%time

full_test_data = make_full_matrix(sparse_matrix=sparse_test_data,
                                   num_of_words=VOCAB_SIZE)

CPU times: user 7.72 s, sys: 108 ms, total: 7.83 s
Wall time: 7.75 s


In [89]:
full_test_data.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
12,1,0,0,1,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14,1,1,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15,1,1,1,0,1,1,0,0,1,1,1,1,1,1,0,1,1,0,1,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,1,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,1,1,1,1,1,0,1,0,1,1,0,1,1,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
19,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [92]:
full_test_data.tail()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,2460,2461,2462,2463,2464,2465,2466,2467,2468,2469,2470,2471,2472,2473,2474,2475,2476,2477,2478,2479,2480,2481,2482,2483,2484,2485,2486,2487,2488,2489,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
5783,0,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5786,0,1,0,0,0,1,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5788,0,1,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5792,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5793,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [94]:
X_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']

In [98]:
y_test = full_test_data.CATEGORY

In [99]:
y_test

DOC_ID
12      1
14      1
15      1
17      1
19      1
       ..
5783    0
5786    0
5788    0
5792    0
5793    0
Name: CATEGORY, Length: 1722, dtype: int64

In [101]:
np.savetxt(TEST_FEATURE_MATRIX,
           X_test)

np.savetxt(TEST_TARGET_MATRIX,
           y_test)