## Imports

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix
import pickle
import math

## Creating global containers for holding stats

In [2]:
# Per-class bookkeeping. Class labels are the keys 1..20, so index/row 0 of the
# label-indexed arrays in this notebook is unused padding.
set_list = [set() for _ in range(20)]
class_row_dict = dict(zip(range(1, 21), set_list))
# One row per class label, one column per word (61188 of them).
# int32 rather than int16: the training cell adds every document of a class into
# that class's row, and an int16 total would silently wrap around past 32767.
dense_matrix = np.zeros(shape=(21, 61188), dtype=np.int32)

## This has already been run, so we can simply load the saved sparse matrix (see below)
### This loads the CSV into a dense matrix, converts it to a sparse matrix, and saves it

In [None]:
# Start from a clean accumulator so that re-running this cell does not add the
# training counts on top of a previous run.
dense_matrix[:] = 0
with open('training.csv', 'r') as train_stream:
    for i, line in enumerate(train_stream, start=1):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        # Row layout used below: doc_id, word counts..., class label.
        line_int = np.array(list(map(int, line.split(','))), dtype=np.int16)
        doc_label = line_int[-1]
        # Accumulate this document's word counts into its class's row.
        dense_matrix[doc_label] += line_int[1:-1]
        if i % 1000 == 0:
            print(i)  # coarse progress instead of one output line per document

# The file is fully consumed at this point, so convert and save outside the `with`.
sparse_training = sparse.csr_matrix(dense_matrix)
# save_npz appends '.npz', so this is written to 'sparse_training.data.npz'.
sparse.save_npz('sparse_training.data', sparse_training)
print(sparse_training)

In [None]:
# turning the testing data into sparse_matrix

## loading the sparse matrix

In [3]:
# Reload the class-by-word count matrix that sparse.save_npz wrote earlier.
sparse_training_data = sparse.load_npz('sparse_training.data.npz')
# Spot check: the stored count at class row 1, word column 11.
print(sparse_training_data[1, 11])

4336


## Counting Priors and words

In [4]:
# Column names for training.csv: doc_id, one column per word id, then the label.
word_id_ranges = list(range(1,61189))
column_names = ['doc_id'] + word_id_ranges + ['label']

# Record which training rows (0-based position in the file) belong to each class.
# Only the label column is parsed, and its values are walked directly rather than
# through DataFrame.iterrows(), which builds a Series object for every row.
i = 0
label_chunks = pd.read_csv('training.csv', header=None, chunksize=200,
                           names=column_names, usecols=['label'])
for data_chunk in label_chunks:
    for current_label in data_chunk['label'].tolist():
        class_row_dict[current_label].add(i)
        i += 1

# Class priors: the fraction of training documents carrying each label.
# Index 0 stays 0 because the labels start at 1.
class_doc_counts = np.zeros(21, dtype=np.int64)
for j in range(1, 21):
    class_doc_counts[j] = len(class_row_dict[j])
prior_counts = class_doc_counts / class_doc_counts.sum()
print(prior_counts)

[0.         0.04025    0.052      0.05183333 0.05358333 0.05016667
 0.0525     0.0515     0.05116667 0.05408333 0.05233333 0.05383333
 0.05325    0.05216667 0.05175    0.05308333 0.05425    0.04833333
 0.04941667 0.03891667 0.03558333]


In [None]:
# This cell implements the naive Bayes formula from the PDF.
# The statements below precompute the smoothing terms that do_naive reads.

uniq_vocab = 61188                          # number of word columns in the count matrix
total_vocab = sparse_training_data.sum()    # sum of every count in the training matrix
# NOTE(review): the pseudo-count (alpha - 1) is 1/total_vocab here; if the PDF
# defines it as 1/(vocabulary size) this should use uniq_vocab instead -- confirm.
alpha = 1 + 1/total_vocab

# denom[k] is the normaliser of P(word | class k). do_naive adds the pseudo-count
# (alpha - 1) to the count of every word, so summed over the vocabulary the class
# total grows by (alpha - 1) * uniq_vocab. The previous (alpha - 1) * total_vocab
# term left the per-class word probabilities unnormalised.
class_word_totals = np.asarray(sparse_training_data.sum(axis=1)).ravel()
denom = np.zeros(21, dtype = np.float64)
denom[1:21] = class_word_totals[1:21] + ((alpha - 1) * uniq_vocab)

# Set to True to make do_naive print the winning class and its score.
do_naive_debug = False
def do_naive(row, use_counts=False):
    """Predict the class label (1..20) of one document with naive Bayes.

    Reads the notebook globals ``sparse_training_data``, ``alpha``, ``denom``,
    ``prior_counts`` and ``do_naive_debug``.

    Parameters
    ----------
    row : 1-D numpy array
        Per-word counts of the document; its positions are used as column
        indices into ``sparse_training_data``.
    use_counts : bool, optional
        False (default, the original behaviour): every word present in the
        document contributes its log-likelihood exactly once.
        True: each word's log-likelihood is multiplied by its count in ``row``.
        NOTE(review): confirm which variant the assignment formula expects.

    Returns
    -------
    int
        The label with the highest log2 score; ties go to the lowest label.

    Raises
    ------
    AssertionError
        If no class obtains a score greater than -inf.
    """
    non_zero_indices = row.nonzero()[0]

    # Fetch the training counts of all 20 classes for just the words present in
    # this document as one small dense block. Looking the entries up one at a
    # time with sparse_training_data[label, word] inside a Python double loop is
    # far slower on a CSR matrix.
    class_counts = sparse_training_data[1:21][:, non_zero_indices].toarray()
    log_likelihood = np.log2((class_counts + alpha - 1) / denom[1:21, np.newaxis])
    if use_counts:
        log_likelihood = log_likelihood * row[non_zero_indices]

    # A class with prior 0 scores -inf here (math.log2 used to raise on it).
    with np.errstate(divide='ignore'):
        log_prior = np.log2(prior_counts[1:21])
    scores = log_likelihood.sum(axis=1) + log_prior

    best = int(np.argmax(scores))   # first maximum, like a strict '>' scan
    max_prob = float(scores[best])
    max_doc_class = best + 1        # position 0 of scores is label 1
    assert max_prob > -math.inf
    if do_naive_debug:
        print(f"{max_doc_class}: {max_prob}")
    return max_doc_class

In [None]:
# Smoke test: classify the first ten rows of the training file and print
# "<true label>:<predicted label>" for each of them.
with open('training.csv', 'r') as test_stream:
    for line_no, line in enumerate(test_stream, start=1):
        fields = [int(token) for token in line.split(',')]
        test_array = np.array(fields)
        doc_id, doc_label = test_array[0], test_array[-1]
        # Strip the leading doc id and the trailing label before classifying.
        predicted_label = do_naive(test_array[1:-1])
        print(f"{doc_label}:{predicted_label}")
        if line_no >= 10:
            break

In [None]:
# Actual testing: classify every row of testing.csv and write the predictions
# to out.csv as "id,class" lines under a header row.
with open('testing.csv', 'r') as test_stream, open('out.csv', 'w') as out_stream:
    out_stream.write("id,class\n")
    for line in test_stream:
        if not line.strip():
            continue  # a blank line (e.g. at EOF) would otherwise crash int('')
        test_array = np.array(list(map(int, line.split(','))))
        doc_id = test_array[0]
        # Unlike the training rows, no trailing label is stripped here:
        # everything after the doc id is passed to the classifier.
        predicted_label = do_naive(test_array[1:])
        out_stream.write(f"{doc_id},{predicted_label}\n")

## The cells below are for experiments only. DO NOT RUN THEM: THEY MIGHT BLOW UP YOUR LAPTOP

In [None]:
# Collapse each class onto a single "leader" row of sparse_data: one row index is
# taken out of the class's set and every remaining row of that class is added
# into it.
# NOTE(review): sparse_row_leader and sparse_data are not defined in the cells
# shown here; this cell needs them to exist before it can run.
for class_id, member_rows in class_row_dict.items():
    print(f"class {class_id} starting")
    row_leader = member_rows.pop()  # this also removes it from class_row_dict
    sparse_row_leader[class_id] = row_leader
    for member in member_rows:
        sparse_data[row_leader] += sparse_data[member]

In [None]:
# Building a sparse matrix in which each class accumulates into one "leader" row.
# NOTE(review): sparse_data and sparse_row_leader are not defined in the cells
# shown here; sparse_row_leader is expected to hold -1 for classes not seen yet.
with open('training.csv', 'r') as train_stream:
    for i, line in enumerate(train_stream):
        line_int = np.array(list(map(int, line.split(','))), dtype=np.int16)
        doc_label = line_int[-1]
        row_leader = sparse_row_leader[doc_label]
        if row_leader == -1:
            # First document of this class: its own row becomes the leader.
            sparse_row_leader[doc_label] = i
            sparse_data[i,:] = line_int[1:-1]
        else:
            # Later documents are added into the class's leader row.
            sparse_data[row_leader] += line_int[1:-1]
        print(i + 1)

# Saving the sparse matrix to file
sparse_data = sparse_data.tocsr()
sparse.save_npz('sparse.data',sparse_data)
# Persist the whole label -> leader-row mapping. The scalar `row_leader` only
# holds the value looked up for the last line read, which cannot be used to find
# the class rows again after reloading.
with open('row_leader.obj', 'wb') as pickle_stream:
    pickle.dump(sparse_row_leader, pickle_stream)

In [None]:
# I use index 1 -> 61188, so every counts array has 61189 slots.
count_data = [
    {'label': label, 'counts': np.zeros(shape=(61189), dtype=np.int32)}
    for label in range(1, 21)
]
# Index the frame by the class label itself so .loc[label, ...] works directly.
count_df = pd.DataFrame(count_data, index=[entry['label'] for entry in count_data])

print("this is what the df looks like")
print(count_df.head())

print('This is how we index!')
print(count_df.loc[1,'label'])

print('accessing the numpuy array')
first_class_counts = count_df.loc[1,'counts']
print(type(first_class_counts[0]))

In [None]:
# DO NOT RUN THIS CELL BUT KEEP IT

# Column names for training.csv: doc_id, one column per word id, then the label.
word_id_ranges = list(range(1,61189))
column_names =  ['doc_id'] + word_id_ranges + ['label']

# prior_counts[k] ends up as the number of training rows with label k
# (index 0 is unused).
# NOTE: this rebinds prior_counts to a plain list of raw counts, replacing the
# normalised prior array computed in the "Counting Priors and words" cell.
prior_counts = [0] * 21
label_chunks = pd.read_csv('training.csv', header=None, chunksize=200,
                           names=column_names, usecols=['label'])
for data_chunk in label_chunks:
    # Walk the label values directly; DataFrame.iterrows() would build a Series
    # for every row just to read one field.
    for current_label in data_chunk['label'].tolist():
        # So we have seen one more document of this type!
        prior_counts[current_label] += 1

print(prior_counts)