## Imports

In [53]:
import pandas as pd
import numpy as np
from scipy import sparse
import math
import timeit

## load csv into dense matrix, then converts it to sparse matrix
If "res/sparse_training.data" exists, **DO NOT** run cell as it will take awhile

In [None]:
dense_matrix = np.zeros(shape=(21, 61188), dtype=np.int16)
with open('../res/training.csv', 'r') as train_stream:
    for i, line in enumerate(train_stream):
        line_int = np.array(list(map(int, line.split(','))), dtype=np.int16)
        doc_label = line_int[-1]
        dense_matrix[doc_label] += line_int[1:-1]
        print(i)
    sparse_training = sparse.csr_matrix(dense_matrix)
    sparse.save_npz('../res/sparse_training.data',sparse_training)
    print(sparse_training)

## load the sparse matrix

In [4]:
sparse_training_data = sparse.load_npz('../res/sparse_training.data.npz')
# print(sparse_training_data[21, 3])  
print(sparse_training_data)

  (1, 0)	9
  (1, 1)	53
  (1, 2)	237
  (1, 3)	11
  (1, 4)	48
  (1, 5)	36
  (1, 6)	7
  (1, 7)	1
  (1, 8)	31
  (1, 9)	127
  (1, 10)	5
  (1, 11)	4336
  (1, 12)	19
  (1, 13)	24
  (1, 14)	46
  (1, 15)	597
  (1, 16)	267
  (1, 17)	12
  (1, 18)	9
  (1, 19)	15
  (1, 20)	2
  (1, 21)	3
  (1, 22)	2985
  (1, 23)	4
  (1, 24)	325
  :	:
  (20, 61146)	1
  (20, 61147)	1
  (20, 61148)	1
  (20, 61149)	2
  (20, 61150)	1
  (20, 61151)	1
  (20, 61152)	1
  (20, 61153)	1
  (20, 61154)	1
  (20, 61169)	2
  (20, 61170)	2
  (20, 61171)	2
  (20, 61172)	3
  (20, 61173)	4
  (20, 61174)	2
  (20, 61175)	3
  (20, 61176)	6
  (20, 61177)	1
  (20, 61178)	2
  (20, 61179)	2
  (20, 61183)	2
  (20, 61184)	2
  (20, 61185)	2
  (20, 61186)	2
  (20, 61187)	2


## Creating global vars and consts

In [5]:
UNIQUE_VOCAB = 61188
TOTAL_VOCAB = sparse_training_data.sum()
BETA = 1/TOTAL_VOCAB
ALPHA = 1 + BETA

set_list = [set() for x in range(0, 20)]
class_row_dict = dict(zip(list(range(1, 21)), set_list))

## Counting Priors and words

In [33]:
word_id_ranges = list(range(1, 61189))
column_names =  ['doc_id'] + word_id_ranges + ['label']
prior_counts = np.zeros(21, dtype=np.int16)
i = 0
for data_chunk in pd.read_csv('../res/training.csv', header=None, chunksize=200, names=column_names, usecols=['label']): 
    for _, row in data_chunk.iterrows():
        current_label = row['label']
        class_row_dict[current_label].add(i)
        i += 1

for j in range(1, 21):
    prior_counts[j] = len(class_row_dict[j])

prior_counts = prior_counts / prior_counts.sum()
print(prior_counts) 

[0.         0.04025    0.052      0.05183333 0.05358333 0.05016667
 0.0525     0.0515     0.05116667 0.05408333 0.05233333 0.05383333
 0.05325    0.05216667 0.05175    0.05308333 0.05425    0.04833333
 0.04941667 0.03891667 0.03558333]


## Naive Bayes
formula from the proj2 PDF

In [39]:
def multinomial_naive_bayes(row, row_total_words, debug_prints = False)->int:
    if debug_prints: print(row_total_words, row)
        
    map_denom = np.zeros(21, dtype = np.float64)
    for i in range(1, 21):
        map_denom[i] = sparse_training_data[i].sum() + ((ALPHA - 1) * TOTAL_VOCAB)
        
    k = ALPHA - 1
    max_prob_class = [-math.inf, -1]
    for doc_label in range(1, 21):
        running_sum = 0
        for word_i, num_words_at_i in row:
            running_sum += math.log2((sparse_training_data[doc_label, word_i] + k)/map_denom[doc_label])
        posterior = running_sum + math.log2(prior_counts[doc_label])
        if posterior > max_prob_class[0]:
            max_prob_class[0] = posterior
            max_prob_class[1] = doc_label

    if debug_prints: print(max_prob_class)
    return max_prob_class[1]

In [35]:
def bernoulli_naive_bayes(row, row_total_words, debug_prints = False)->int:
    if debug_prints: print(row_total_words, row)

    num_zeros = UNIQUE_VOCAB - len(row)
    max_prob_class = [-math.inf, -1]
    k = ALPHA - 1

    # prob_not_appearing = (0+k) / denom
    # # prob_of_doc = 1/20  # Not needed because all will be multiplied by it
    # unique_words  = 0
    for doc_label in range(1, 21):
       
        posterior = 0
        if posterior > max_prob_class[0]:
            max_prob_class[0] = posterior
            max_prob_class[1] = doc_label

    if debug_prints: print(max_prob_class)
    return max_prob_class[1]

### Training Test

In [56]:
def test_training(bayes_function):
    sparse_matrix = sparse.load_npz('../res/nb_training_data.npz')
    training_data_coo = sparse_matrix.tocoo()
    
    row_total_words = training_data_coo.A.sum(axis=1)
    correct = 0
    row = []
    for row_i, word_i, val in zip(training_data_coo.row, training_data_coo.col, training_data_coo.data):
        if word_i != 61188:
            row.append((word_i, val))
        else:
            classification = bayes_function(row, row_total_words[row_i] - val)
            correct += 1 if classification == val else 0
            row.clear()
            if not row_i % 200:
                print('At row:', row_i)
    print("Finished")
    print(f"accuracy: {(correct / row_i) * 100}%")

## Classification and Writing

In [57]:
def test_and_write(bayes_function, write_path):
    sparse_matrix = sparse.load_npz('../res/nb_testing_data.npz')
    testing_data_coo = sparse_matrix.tocoo()
    with open(write_path, 'w') as out_stream:
        out_stream.write("id,class\n")
    
        row_total_words = testing_data_coo.A.sum(axis=1)
        row_offset = 12000
        current_row = 12000
        row = []
        for row_i, word_i, num_words_at_i in zip(testing_data_coo.row + row_offset, testing_data_coo.col, testing_data_coo.data):
            if row_i == current_row:
                row.append((word_i, num_words_at_i))
            else:
                predicted_label = bayes_function(row, row_total_words[row_i - row_offset])
                out_stream.write(f'{row_i},{predicted_label}\n')
                row.clear()
                current_row = row_i
                if not current_row % 200:
                    print('At row:', current_row)
    print("File written")

## Testing
- multinomial

In [50]:
start = timeit.default_timer()

test_training(multinomial_naive_bayes)

stop = timeit.default_timer()
print('Time: ', (stop - start) / 60, 'minutes.') 

Finished
accuracy: 99.02491874322861%


In [58]:
start = timeit.default_timer()

test_and_write(multinomial_naive_bayes, '../results/multinomial_NB_results.csv')

stop = timeit.default_timer()
print('Time: ', (stop - start) / 60, 'minutes.') 

At row: 12200
At row: 12400
At row: 12600
At row: 12800
At row: 13000
At row: 13200
At row: 13400
At row: 13600
At row: 13800
At row: 14000
At row: 14200
At row: 14400
At row: 14600
At row: 14800
At row: 15000
At row: 15200
At row: 15400
At row: 15600
At row: 15800
At row: 16000
At row: 16200
At row: 16400
At row: 16600
At row: 16800
At row: 17000
At row: 17200
At row: 17400
At row: 17600
At row: 17800
At row: 18000
At row: 18200
At row: 18400
At row: 18600
File written
Time:  966.1413919999977


- Bernoulli

In [None]:
start = timeit.default_timer()

test_training(bernoulli_naive_bayes)

stop = timeit.default_timer()
print('Time: ', (stop - start) / 60, 'minutes.') 

In [None]:
start = timeit.default_timer()

test_and_write(bernoulli_naive_bayes, '../results/bernoulli_NB_results.csv')

stop = timeit.default_timer()
print('Time: ', (stop - start) / 60, 'minutes.') 

In [12]:
# with open('../res/testing.csv', 'r') as train_stream:
#     count = 0
#     for i, line in enumerate(train_stream):
#         current_line = np.array(list(map(int, line.split(','))), dtype=np.int16)
#         print(len(current_line))
#         if i == 2:
#             break
#     print(i)

61189
61189
61189
2
