## Imports

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix
import pickle
import math

## load csv into dense matrix, then converts it to sparse matrix
#### If "res/sparse_training.data" exists, DO NOT run cell as it will take awhile

In [None]:
dense_matrix = np.zeros(shape=(21, 61188), dtype=np.int16)
with open('../res/training.csv', 'r') as train_strem:
    i=0
    for line in train_strem:
        line_int = np.array(list(map(int, line.split(','))), dtype=np.int16)
        doc_label = line_int[-1]
        dense_matrix[doc_label] += line_int[1:-1]
        i += 1
        print(i)
    sparse_training = sparse.csr_matrix(dense_matrix)
    sparse.save_npz('../res/sparse_training.data',sparse_training)
    print(sparse_training)

## load the sparse matrix

In [7]:
sparse_training_data = sparse.load_npz('../res/sparse_training.data.npz')
print(sparse_training_data)
    

  (1, 0)	9
  (1, 1)	53
  (1, 2)	237
  (1, 3)	11
  (1, 4)	48
  (1, 5)	36
  (1, 6)	7
  (1, 7)	1
  (1, 8)	31
  (1, 9)	127
  (1, 10)	5
  (1, 11)	4336
  (1, 12)	19
  (1, 13)	24
  (1, 14)	46
  (1, 15)	597
  (1, 16)	267
  (1, 17)	12
  (1, 18)	9
  (1, 19)	15
  (1, 20)	2
  (1, 21)	3
  (1, 22)	2985
  (1, 23)	4
  (1, 24)	325
  :	:
  (20, 61146)	1
  (20, 61147)	1
  (20, 61148)	1
  (20, 61149)	2
  (20, 61150)	1
  (20, 61151)	1
  (20, 61152)	1
  (20, 61153)	1
  (20, 61154)	1
  (20, 61169)	2
  (20, 61170)	2
  (20, 61171)	2
  (20, 61172)	3
  (20, 61173)	4
  (20, 61174)	2
  (20, 61175)	3
  (20, 61176)	6
  (20, 61177)	1
  (20, 61178)	2
  (20, 61179)	2
  (20, 61183)	2
  (20, 61184)	2
  (20, 61185)	2
  (20, 61186)	2
  (20, 61187)	2


## Creating global vars

In [None]:
set_list = [set() for x in range(0, 20)]
class_row_dict = dict(zip(list(range(1, 21)), set_list))

## Counting Priors and words

In [14]:
word_id_ranges = list(range(1, 61189))
column_names =  ['doc_id'] + word_id_ranges + ['label']
prior_counts = np.zeros(21, dtype=np.int16)
i = 0
for data_chunk in pd.read_csv('../res/training.csv', header=None, chunksize=200, names=column_names, usecols=['label']): 
    for _, row in data_chunk.iterrows():
        current_label = row['label']
        class_row_dict[current_label].add(i)
        i += 1

for j in range(1, 21):
    prior_counts[j] = len(class_row_dict[j])

prior_counts = prior_counts / prior_counts.sum()
print(prior_counts) 

[0.         0.04025    0.052      0.05183333 0.05358333 0.05016667
 0.0525     0.0515     0.05116667 0.05408333 0.05233333 0.05383333
 0.05325    0.05216667 0.05175    0.05308333 0.05425    0.04833333
 0.04941667 0.03891667 0.03558333]


## Naive Bayes formula from the proj2 PDF

In [15]:
uniq_vocab = 61188
total_vocab = sparse_training_data.sum()
alpha = 1 + 1/total_vocab
denom = np.zeros(21, dtype = np.float64)
for i in range(1, 21):
    denom[i] = sparse_training_data[i].sum() + ((alpha - 1) * total_vocab)
 
do_naive_debug = False 
def do_naive(row):
    max_prob = -math.inf
    max_doc_class = -1
    non_zero_indices = row.nonzero()[0]
    for doc_label in range(1, 21):
        running_sum = 0
        for word in non_zero_indices:
            running_sum += math.log2((sparse_training_data[doc_label,word] + alpha - 1)/denom[doc_label])
        new_prob = running_sum + math.log2(prior_counts[doc_label])
        if new_prob > max_prob:
            max_prob = new_prob
            max_doc_class = doc_label
    assert(max_doc_class != -1)
    if do_naive_debug:
        print(f"{max_doc_class}: {max_prob}")
    return max_doc_class

# Testing


# With training data

In [20]:
with open('../res/training.csv', 'r') as test_stream:
    correct = 0
    
    for line in test_stream:
        test_array = np.array(list(map(int,line.split(','))))
        doc_id = test_array[0]
        doc_label = test_array[-1]
        predicted_label = do_naive(test_array[1:-1])
        if i % 125 == 0:
            print(f"i={i}; pred: {predicted_label}; true: {doc_label}")
        if predicted_label == doc_label:
            correct += 1
    print(f"accuracy: {(correct / 12000) * 100}%")

i=0; pred: 14; true: 14
i=100; pred: 16; true: 16
i=200; pred: 9; true: 9
i=300; pred: 1; true: 1
i=400; pred: 1; true: 1
i=500; pred: 6; true: 6
i=600; pred: 10; true: 10
i=700; pred: 6; true: 6
i=800; pred: 10; true: 10
i=900; pred: 3; true: 3
i=1000; pred: 4; true: 4
i=1100; pred: 11; true: 11
i=1200; pred: 16; true: 16
i=1300; pred: 6; true: 6
i=1400; pred: 9; true: 9
i=1500; pred: 8; true: 8
i=1600; pred: 7; true: 7
i=1700; pred: 1; true: 1
i=1800; pred: 10; true: 10
i=1900; pred: 5; true: 5
i=2000; pred: 15; true: 15
i=2100; pred: 11; true: 11
i=2200; pred: 9; true: 9
i=2300; pred: 12; true: 12
i=2400; pred: 11; true: 11
i=2500; pred: 20; true: 20
i=2600; pred: 1; true: 1
i=2700; pred: 2; true: 2
i=2800; pred: 16; true: 16
i=2900; pred: 5; true: 5
i=3000; pred: 2; true: 2
i=3100; pred: 1; true: 1
i=3200; pred: 2; true: 2
i=3300; pred: 16; true: 16
i=3400; pred: 19; true: 19
i=3500; pred: 4; true: 4
i=3600; pred: 4; true: 4
i=3700; pred: 1; true: 1
i=3800; pred: 13; true: 13
i=390

# With testing data

In [17]:
i = 0
with open('../res/testing.csv', 'r') as test_stream, open('../results/out.csv', 'w') as out_stream:
    out_stream.write("id,class\n")
    for line in test_stream:
        test_array = np.array(list(map(int, line.split(','))))
        doc_id = test_array[0]
        predicted_label = do_naive(test_array[1:])    
        out_stream.write(f"{doc_id},{predicted_label}\n")
print("File written")