## Imports

In [3]:
import pandas as pd
import numpy as np
from scipy import sparse
import math

## load csv into dense matrix, then converts it to sparse matrix
If "res/sparse_training.data" exists, **DO NOT** run cell as it will take awhile

In [None]:
dense_matrix = np.zeros(shape=(21, 61188), dtype=np.int16)
with open('../res/training.csv', 'r') as train_stream:
    for i, line in enumerate(train_stream):
        line_int = np.array(list(map(int, line.split(','))), dtype=np.int16)
        doc_label = line_int[-1]
        dense_matrix[doc_label] += line_int[1:-1]
        print(i)
    sparse_training = sparse.csr_matrix(dense_matrix)
    sparse.save_npz('../res/sparse_training.data',sparse_training)
    print(sparse_training)

## load the sparse matrix

In [4]:
sparse_training_data = sparse.load_npz('../res/sparse_training.data.npz')
# print(sparse_training_data[21, 3])  
print(sparse_training_data)

  (1, 0)	9
  (1, 1)	53
  (1, 2)	237
  (1, 3)	11
  (1, 4)	48
  (1, 5)	36
  (1, 6)	7
  (1, 7)	1
  (1, 8)	31
  (1, 9)	127
  (1, 10)	5
  (1, 11)	4336
  (1, 12)	19
  (1, 13)	24
  (1, 14)	46
  (1, 15)	597
  (1, 16)	267
  (1, 17)	12
  (1, 18)	9
  (1, 19)	15
  (1, 20)	2
  (1, 21)	3
  (1, 22)	2985
  (1, 23)	4
  (1, 24)	325
  :	:
  (20, 61146)	1
  (20, 61147)	1
  (20, 61148)	1
  (20, 61149)	2
  (20, 61150)	1
  (20, 61151)	1
  (20, 61152)	1
  (20, 61153)	1
  (20, 61154)	1
  (20, 61169)	2
  (20, 61170)	2
  (20, 61171)	2
  (20, 61172)	3
  (20, 61173)	4
  (20, 61174)	2
  (20, 61175)	3
  (20, 61176)	6
  (20, 61177)	1
  (20, 61178)	2
  (20, 61179)	2
  (20, 61183)	2
  (20, 61184)	2
  (20, 61185)	2
  (20, 61186)	2
  (20, 61187)	2


## Creating global vars and consts

In [5]:
UNIQUE_VOCAB = 61188
TOTAL_VOCAB = sparse_training_data.sum()
BETA = 1/TOTAL_VOCAB
ALPHA = 1 + BETA

set_list = [set() for x in range(0, 20)]
class_row_dict = dict(zip(list(range(1, 21)), set_list))

## Counting Priors and words

In [None]:
word_id_ranges = list(range(1, 61189))
column_names =  ['doc_id'] + word_id_ranges + ['label']
prior_counts = np.zeros(21, dtype=np.int16)
i = 0
for data_chunk in pd.read_csv('../res/training.csv', header=None, chunksize=200, names=column_names, usecols=['label']): 
    for _, row in data_chunk.iterrows():
        current_label = row['label']
        class_row_dict[current_label].add(i)
        i += 1

for j in range(1, 21):
    prior_counts[j] = len(class_row_dict[j])

prior_counts = prior_counts / prior_counts.sum()
print(prior_counts) 

## Naive Bayes
formula from the proj2 PDF

In [None]:
map_denom = np.zeros(21, dtype = np.float64)
for i in range(1, 21):
    map_denom[i] = sparse_training_data[i].sum() + ((ALPHA - 1) * TOTAL_VOCAB)

do_naive_debug = False 
def multinomial_naive_bayes(row):
    max_prob = -math.inf
    max_doc_class = -1
    non_zero_indices = row.nonzero()[0]
    for doc_label in range(1, 21):
        running_sum = 0
        for word in non_zero_indices:
            running_sum += math.log2((sparse_training_data[doc_label, word] + ALPHA - 1)/map_denom[doc_label])
        new_prob = running_sum + math.log2(prior_counts[doc_label])
        if new_prob > max_prob:
            max_prob = new_prob
            max_doc_class = doc_label
    assert(max_doc_class != -1)
    if do_naive_debug:
        print(f"{max_doc_class}: {max_prob}")
    return max_doc_class

In [8]:
def bernoulli_naive_bayes(row, row_total_words)->int:
    print(row_total_words, row)
    # num_zeros = UNIQUE_VOCAB - len(row)
    # k = np.zeros(21, dtype=np.int16) # priors
    # for j in range(1, 21):
    #     k[j] = len(class_row_dict[j])
    # 
    # denom = len(row)
    # prob_not_appearing = (0+k) / denom
    # # prob_of_doc = 1/20  # Not needed because all will be multiplied by it
    # unique_words  = 0
    # cx = 0
    # for doc_label in range(1, 21):
    #     if new_prob > max_prob:
    #         max_prob = new_prob
    #         max_doc_class = doc_label
    return 1

# Testing
## With training data
### multinomial

In [None]:
with open('../res/training.csv', 'r') as test_stream:
    correct = 0

    for line in test_stream:
        test_array = np.array(list(map(int, line.split(','))))
        doc_id = test_array[0]
        doc_label = test_array[-1]
        predicted_label = multinomial_naive_bayes(test_array[1:-1])
        if i % 125 == 0:
            print(f"i={i}; pred={predicted_label}; true={doc_label}")
        if predicted_label == doc_label:
            correct += 1
    print(f"accuracy: {(correct / 12000) * 100}%")

### Bernoulli

In [15]:
testing_data_sparse = sparse.load_npz('../res/nb_training_data.npz')
testing_data_coo = testing_data_sparse.tocoo()

row_total_words = testing_data_coo.A.sum(axis=1)

correct = 0
row = []
for row_i, word_i, val in zip(testing_data_coo.row, testing_data_coo.col, testing_data_coo.data):
    if row_i == 2:
        break
    if word_i != 61188:
        row.append((word_i, val))
    else:
        classification = bernoulli_naive_bayes(row, row_total_words[row_i] - val)
        correct += 1 if classification == val else 1
        row.clear()
print("Finished")

180 [(11, 5), (22, 2), (28, 13), (29, 4), (32, 10), (41, 5), (47, 2), (50, 1), (59, 4), (71, 1), (80, 4), (101, 1), (103, 5), (115, 1), (121, 1), (143, 1), (232, 1), (238, 3), (250, 3), (303, 2), (354, 1), (405, 1), (418, 1), (455, 1), (464, 1), (465, 2), (466, 2), (472, 2), (475, 1), (483, 1), (490, 1), (491, 1), (535, 1), (573, 2), (574, 1), (597, 1), (660, 1), (687, 1), (707, 1), (708, 3), (743, 1), (747, 1), (748, 3), (760, 1), (766, 1), (788, 2), (827, 1), (849, 1), (887, 1), (921, 1), (968, 1), (1002, 1), (1027, 1), (1028, 1), (1029, 1), (1035, 1), (1041, 1), (1055, 1), (1076, 1), (1088, 2), (1127, 1), (1174, 1), (1241, 1), (1434, 3), (1557, 1), (1558, 1), (1702, 1), (1820, 1), (1985, 1), (2184, 1), (2242, 1), (2578, 1), (2580, 1), (2713, 1), (3129, 1), (3148, 1), (3215, 1), (3456, 1), (4288, 1), (4312, 1), (4586, 1), (4756, 1), (4843, 1), (5372, 1), (5958, 1), (6142, 1), (6246, 1), (6480, 1), (7254, 1), (8374, 1), (9069, 1), (9514, 1), (9554, 1), (11282, 1), (11460, 1), (11903, 

## With testing data
### multinomial

In [None]:
i = 0

with open('../res/testing.csv', 'r') as test_stream, open('../results/out.csv', 'w') as out_stream:
    out_stream.write("id,class\n")
    for line in test_stream:
        test_array = np.array(list(map(int, line.split(','))))
        doc_id = test_array[0]
        predicted_label = multinomial_naive_bayes(test_array[1:])    
        out_stream.write(f"{doc_id},{predicted_label}\n")

### Bernoulli

In [14]:
testing_data_sparse = sparse.load_npz('../res/nb_testing_data.npz')
testing_data_coo = testing_data_sparse.tocoo()

with open('../results/bernoulli_NB_results.csv', 'w') as out_stream:
    out_stream.write("id,class\n")

    row_total_words = testing_data_coo.A.sum(axis=1)
    row_offset = 12001
    current_row = 12001
    row = []
    for row_i, word_i, num_words_at_i in zip(testing_data_coo.row + row_offset, testing_data_coo.col, testing_data_coo.data):
        if row_i == row_offset+3:
            break
        if row_i == current_row:
            row.append((word_i, num_words_at_i))
        else:
            predicted_label = bernoulli_naive_bayes(row, row_total_words[row_i - row_offset])
            out_stream.write(f'{row_i},{predicted_label}\n')
            row.clear()
            current_row = row_i
print("File written")

94 [(11, 4), (22, 5), (26, 1), (28, 13), (29, 5), (32, 3), (41, 1), (43, 1), (47, 2), (51, 1), (72, 1), (80, 2), (82, 2), (83, 2), (130, 1), (134, 1), (135, 1), (138, 2), (143, 1), (159, 2), (232, 3), (298, 1), (300, 4), (311, 1), (339, 2), (360, 1), (387, 1), (455, 1), (472, 1), (473, 3), (629, 1), (643, 8), (720, 1), (721, 1), (765, 2), (766, 1), (769, 1), (774, 2), (777, 2), (788, 1), (807, 1), (813, 1), (827, 1), (831, 1), (849, 2), (858, 1), (862, 1), (876, 1), (886, 1), (906, 1), (909, 1), (911, 1), (921, 2), (929, 2), (941, 1), (994, 3), (1032, 1), (1035, 1), (1224, 1), (1234, 1), (1409, 1), (1444, 1), (1543, 1), (1570, 2), (1640, 1), (1782, 1), (1917, 1), (1952, 1), (2122, 1), (2378, 1), (2382, 1), (2523, 1), (2538, 1), (2891, 1), (2955, 1), (3234, 1), (3244, 1), (3386, 1), (3687, 1), (4049, 2), (4091, 1), (4819, 1), (5327, 1), (5485, 1), (5534, 1), (5944, 1), (5960, 1), (6320, 1), (6478, 1), (6744, 1), (6922, 1), (6976, 2), (6997, 1), (8146, 1), (8428, 1), (8647, 1), (9030, 1)

In [12]:
with open('../res/testing.csv', 'r') as train_stream:
    count = 0
    for i, line in enumerate(train_stream):
        current_line = np.array(list(map(int, line.split(','))), dtype=np.int16)
        print(len(current_line))
        if i == 2:
            break
    print(i)

61189
61189
61189
2
