In [3]:
# import libraries
import numpy as np
import pandas as pd

#### (c) Calculate Pi, the fraction of documents that belong to each class j

In [4]:
# read labels for train data; the code below parses the first token of each line as a class id
# `with` guarantees the file handle is closed even if parsing fails below
with open('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/train.label', 'r') as train_label:
    # skip blank lines so they neither crash the parsing nor inflate the document total
    lines = [line for line in train_label if line.strip()]

# define dictionary for pi_j where j is each class 1,2,..., 20
pi = {j: 0 for j in range(1, 21)}

# count the occurrence of each class j
for line in lines:
    pi[int(line.split()[0])] += 1

# divide each class count by the total number of documents
n_docs = len(lines)
for j in pi:
    pi[j] /= n_docs

In [5]:
# show the fraction of training documents in each class j
pi

{1: 0.04259472890229834,
 2: 0.05155736977549028,
 3: 0.05075871860857219,
 4: 0.05208980388676901,
 5: 0.051024935664211554,
 6: 0.052533498979501284,
 7: 0.051646108794036735,
 8: 0.052533498979501284,
 9: 0.052888455053687104,
 10: 0.0527109770165942,
 11: 0.05306593309078002,
 12: 0.0527109770165942,
 13: 0.05244475996095483,
 14: 0.0527109770165942,
 15: 0.052622237998047744,
 16: 0.05315467210932647,
 17: 0.04836276510781791,
 18: 0.05004880646020055,
 19: 0.04117490460555506,
 20: 0.033365870973467035}

#### (c) Create dataframe for training data and training labels

In [6]:
# read train data and labels
train_data = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/train.data', delimiter=' ', names=['docIdx', 'wordIdx', 'count'])
train_labels = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/train.label', names=['classIdx'])

# get array for documents and class indexes
docIdx = train_data['docIdx']
classIdx = train_labels['classIdx']

# match data and label size: one label per (document, word) row.
# Row k (0-based) of the label file is used as the label of docIdx k + 1.
# NOTE(review): assumes docIdx is 1-based and refers to label-file rows - confirm
# against the dataset description. Looking labels up by docIdx (instead of
# counting document changes row by row) stays aligned even if some document
# has no rows in the data file, and avoids a slow Python-level loop.
doc_pos = docIdx.to_numpy() - 1
if len(doc_pos) and (doc_pos.min() < 0 or doc_pos.max() >= len(classIdx)):
    raise ValueError('docIdx values must lie in 1..len(train_labels)')
new_train_labels = classIdx.to_numpy()[doc_pos]

# create dataframe with both train and label
# (assign returns a new frame, so train_data itself is left unmodified)
df = train_data.assign(classIdx=new_train_labels)

In [7]:
# preview the first rows of the combined word-count / class-label frame
df.head()

Unnamed: 0,docIdx,wordIdx,count,classIdx
0,1,1,4,1
1,1,2,2,1
2,1,3,10,1
3,1,4,4,1
4,1,5,2,1


#### (c) Use Laplace Smoothing to calculate the probability of each word per class, Pjw

In [8]:
def Prob_word_per_class(df, alpha=0.001, vocab_size=61188):
    """Estimate smoothed per-class word probabilities.

    For every class j and word w occurring in ``df`` the estimate is

        (count of w in class j + alpha) / (total count in class j + vocab_size + 1)

    A word that never occurs in class j (but occurs in some other class)
    gets ``alpha / (total count in class j + vocab_size + 1)``.

    NOTE(review): the numerator is smoothed with ``alpha`` while the
    denominator adds ``vocab_size + 1`` rather than ``alpha * vocab_size``,
    so the values for one class do not sum to 1. The formula is kept
    unchanged so previously reported results stay reproducible.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'classIdx', 'wordIdx' and 'count'.
    alpha : float, default 0.001
        Smoothing value added to every (class, word) count.
    vocab_size : int, default 61188
        Vocabulary size used in the denominator.

    Returns
    -------
    dict
        ``{wordIdx: {classIdx: probability}}`` covering every word and every
        class that appears in ``df``.
    """
    # (class x word) table of summed counts; pairs never observed become 0,
    # which turns into alpha / denominator below
    word_counts = (
        df.groupby(['classIdx', 'wordIdx'])['count']
        .sum()
        .unstack(fill_value=0)
    )

    # Per-class denominator, computed once instead of once per class iteration
    class_totals = df.groupby('classIdx')['count'].sum()
    denominators = class_totals + vocab_size + 1

    # Divide each class row by that class's denominator
    Pr = word_counts.add(alpha).div(denominators, axis='index')

    # Convert to dictionary for greater lookup speed
    return Pr.to_dict()

In [9]:
# build the {wordIdx: {classIdx: probability}} lookup from the training frame
Pr_dict = Prob_word_per_class(df)
# number of distinct words that received a probability entry
len(Pr_dict)

53975

### (d) Define Multinomial Naive Bayes

In [14]:
def Multinomial_NB(df, word_probs=None, priors=None):
    """Predict a class for every document with multinomial Naive Bayes.

    The score of class j for a document is

        log(priors[j]) + sum over words w of count(w) * log(word_probs[w][j])

    and the predicted class is the one with the highest score (on ties the
    smallest class id wins). Words that have no entry in ``word_probs`` are
    ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'docIdx', 'wordIdx' and 'count'. If the same
        (docIdx, wordIdx) pair appears more than once, the last count is used.
    word_probs : dict, optional
        ``{wordIdx: {classIdx: probability}}``. Defaults to the notebook-level
        ``Pr_dict``.
    priors : dict, optional
        ``{classIdx: prior probability}``. Defaults to the notebook-level
        ``pi``.

    Returns
    -------
    list
        Predicted class ids for docIdx 1, 2, ..., max(docIdx), in that order.
        A document id in that range with no rows in ``df`` is scored with the
        priors alone, so position k always corresponds to docIdx k + 1.
        An empty ``df`` yields an empty list.
    """
    # Fall back to the notebook-level tables so existing calls keep working
    if word_probs is None:
        word_probs = Pr_dict
    if priors is None:
        priors = pi

    # doc_words = {docIdx: {wordIdx: count}, ...}
    # Iterating the columns directly does not depend on df's index labels.
    doc_words = {}
    rows = zip(df['docIdx'].tolist(), df['wordIdx'].tolist(), df['count'].tolist())
    for doc_id, word_id, count in rows:
        doc_words.setdefault(doc_id, {})[word_id] = count

    if not doc_words:
        return []

    classes = sorted(priors)
    log_priors = {c: np.log(priors[c]) for c in classes}

    prediction = []
    for doc_id in range(1, int(max(doc_words)) + 1):
        # Keep only words with a probability entry; resolve each word's
        # per-class table once instead of once per class.
        known_words = [
            (word_probs[word_id], count)
            for word_id, count in doc_words.get(doc_id, {}).items()
            if word_id in word_probs
        ]

        score_dict = {}
        for c in classes:
            score = 0.0
            for class_probs, count in known_words:
                score += count * np.log(class_probs[c])
            # Adding the log prior is the log-space form of multiplying by pi
            score_dict[c] = score + log_priors[c]

        # Class with the highest log score for this document
        prediction.append(max(score_dict, key=score_dict.get))

    return prediction

#### (e) Evaluate the performance of the model with test data

In [15]:
def predict_error(predict, labels):
    """Return the percentage of predictions that differ from the true labels.

    Parameters
    ----------
    predict : sequence
        Predicted class ids, one per document.
    labels : sequence
        True class ids, in the same order and of the same length as ``predict``.

    Returns
    -------
    float
        ``100 * (1 - correct / len(labels))`` rounded to 4 decimal places.

    Raises
    ------
    ValueError
        If ``labels`` is empty or the two sequences differ in length.
    """
    n_labels = len(labels)
    if n_labels == 0:
        raise ValueError('labels must not be empty')
    if len(predict) != n_labels:
        # zip() would silently drop the unmatched tail and skew the rate
        raise ValueError('predict and labels must have the same length')

    # Compare against the `labels` argument, not a notebook-level variable
    correct = sum(1 for guess, truth in zip(predict, labels) if guess == truth)

    perc_error = 100 * (1 - (correct / n_labels))

    return round(perc_error, 4)

In [16]:
# Load the test word counts into columns docIdx, wordIdx and count
test_data = pd.read_csv(
    '/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/test.data',
    delimiter=' ',
    names=['docIdx', 'wordIdx', 'count'],
)

# Load the test class ids and keep them as a plain Python list
test_labels = pd.read_csv(
    '/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/test.label',
    names=['classIdx'],
)['classIdx'].tolist()

# Classify every test document
predict = Multinomial_NB(test_data)

In [17]:
# percentage of test documents whose predicted class differs from the true label
predict_error(predict, test_labels)

22.2385