In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, operator
%matplotlib inline

###Class Distribution

#### Calculate fraction of documents in each class

In [4]:
#Training label
train_label = open('/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.label')

#pi is the fraction of each class
pi = {}

#Set a class index for each document as key
for i in range(1,21):
    pi[i] = 0
    
#Extract values from training labels
lines = train_label.readlines()

#Get total number of documents
total = len(lines)

#Count the occurence of each class
for line in lines:
    val = int(line.split()[0])
    pi[val] += 1

#Divide the count of each class by total documents 
for key in pi:
    pi[key] /= total
    
print("Probability of each class:")
print("\n".join("{}: {}".format(k, v) for k, v in pi.items()))

Probability of each class:
1: 0.04259472890229834
2: 0.05155736977549028
3: 0.05075871860857219
4: 0.05208980388676901
5: 0.051024935664211554
6: 0.052533498979501284
7: 0.051646108794036735
8: 0.052533498979501284
9: 0.052888455053687104
10: 0.0527109770165942
11: 0.05306593309078002
12: 0.0527109770165942
13: 0.05244475996095483
14: 0.0527109770165942
15: 0.052622237998047744
16: 0.05315467210932647
17: 0.04836276510781791
18: 0.05004880646020055
19: 0.04117490460555506
20: 0.033365870973467035


In [5]:
#Check if sum of the probabilities is 1
np.sum(list(pi.values()))

1.0

###Probability Distribution over V

####Dataframe

In [6]:
#Training data
train_data = open('/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.data')
df = pd.read_csv(train_data, delimiter=' ', names=['docIdx', 'wordIdx', 'count'])

#Training label
label = []
train_label = open('/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.label')
lines = train_label.readlines()
for line in lines:
    label.append(int(line.split()[0]))

#Increase label length to match docIdx
docIdx = df['docIdx'].values
i = 0
new_label = []
for index in range(len(docIdx)-1):
    new_label.append(label[i])
    if docIdx[index] != docIdx[index+1]:
        i += 1
new_label.append(label[i]) #for-loop ignores last value

#Add label column
df['label'] = new_label

df.head()

Unnamed: 0,docIdx,wordIdx,count,label
0,1,1,4,1
1,1,2,2,1
2,1,3,10,1
3,1,4,4,1
4,1,5,2,1


####Probability of each word per class

For calculating our probability, we will find the average of each word for a given class.

For class j and word i, the average is given by:

$$P(i|j) = \frac{count_{ij}}{count_j}$$


However, since some words will have 0 counts, we will perform a Laplace Smoothing:



$$ P(i|j) = log(\frac{count_{ij}+1}{count_j+|V|})$$

In [7]:
#Calculate probability of each word based on class
pb_ij = df.groupby(['label','wordIdx'])
pb_j = df.groupby(['label'])
Pr =  (pb_ij['count'].sum() + 1) / (pb_j['count'].sum() + 16689)    

#Unstack series
Pr = Pr.unstack()

#Replace NaN or columns with 0 as word count with 1/(|V|+1)
Pr = Pr.fillna(1/16689)
Pr

wordIdx,1,2,3,4,5,6,7,8,9,10,...,53966,53967,53968,53969,53970,53971,53972,53973,53974,53975
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.5e-05,0.000387,0.001668,6e-05,0.000502,0.000254,4.2e-05,1.2e-05,0.000211,0.000852,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
2,0.00048,0.000472,6e-05,0.000142,0.000118,0.000464,8.7e-05,5.5e-05,0.001362,3.1e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
3,0.000112,0.000651,6e-05,0.000168,0.000205,0.000326,2.8e-05,2.8e-05,0.001349,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
4,7.8e-05,0.000276,6e-05,6e-05,9.5e-05,0.000423,2.6e-05,1.7e-05,0.000423,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
5,6.8e-05,0.00033,6e-05,1.9e-05,1.9e-05,0.000467,1.9e-05,6e-05,0.000467,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
6,0.000283,0.001315,6e-05,0.000472,9.4e-05,0.000313,0.00013,2.4e-05,0.001404,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
7,6e-05,0.000373,6e-05,3.9e-05,3.9e-05,0.000424,6e-05,6e-05,0.000373,5.1e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
8,7.6e-05,0.000421,6e-05,6e-05,0.000107,0.000665,6.1e-05,3.1e-05,0.000145,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
9,0.000126,0.00057,6e-05,4.2e-05,4.2e-05,0.000704,3.4e-05,1.7e-05,4.2e-05,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
10,1.6e-05,0.000273,6e-05,2.4e-05,1.6e-05,0.002432,1.6e-05,6e-05,2.4e-05,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05


###Multinomial Naive Bayes Classifier

Combining probability distribution of P with fraction of documents belonging to each class

$$Pr(i|j)=\pi_j * P_{ij} $$

In [8]:
for i in range(1,21):
    Pr[:][i]=Pr[:][1]*pi[i]
    
Pr

wordIdx,1,2,3,4,5,6,7,8,9,10,...,53966,53967,53968,53969,53970,53971,53972,53973,53974,53975
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.603158e-06,1.857693e-07,1.828917e-07,1.876878e-07,1.838509e-07,1.892865e-07,1.860891e-07,1.892865e-07,1.905654e-07,1.89926e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
2,2.045132e-05,1.054416e-06,1.038083e-06,1.065305e-06,1.043527e-06,1.074379e-06,1.056231e-06,1.074379e-06,1.081639e-06,1.078009e-06,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
3,4.756707e-06,2.452433e-07,2.414444e-07,2.47776e-07,2.427107e-07,2.498865e-07,2.456654e-07,2.498865e-07,2.515749e-07,2.507307e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
4,3.309471e-06,1.706276e-07,1.679845e-07,1.723897e-07,1.688655e-07,1.738581e-07,1.709213e-07,1.738581e-07,1.750328e-07,1.744454e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
5,2.898192e-06,1.494232e-07,1.471085e-07,1.509663e-07,1.478801e-07,1.522522e-07,1.496803e-07,1.522522e-07,1.532809e-07,1.527665e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
6,1.205973e-05,6.217682e-07,6.121366e-07,6.281892e-07,6.153471e-07,6.3354e-07,6.228383e-07,6.3354e-07,6.378207e-07,6.356804e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
7,2.552264e-06,1.31588e-07,1.295496e-07,1.329469e-07,1.302291e-07,1.340793e-07,1.318145e-07,1.340793e-07,1.349853e-07,1.345323e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
8,3.256702e-06,1.67907e-07,1.65306e-07,1.69641e-07,1.66173e-07,1.710859e-07,1.68196e-07,1.710859e-07,1.722419e-07,1.716639e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
9,5.354684e-06,2.760734e-07,2.717969e-07,2.789245e-07,2.732224e-07,2.813003e-07,2.765486e-07,2.813003e-07,2.83201e-07,2.822506e-07,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
10,6.837749e-07,3.525363e-08,3.470754e-08,3.56177e-08,3.488957e-08,3.592109e-08,3.531431e-08,3.592109e-08,3.61638e-08,3.604244e-08,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05


#### Generating function

In [36]:
def MNB(df):
    doc = df.groupby(['docIdx','wordIdx'])
    doc['count'].appy(lambda x[i][j]: Pr[i][j] * np.log(x[i][j] + 1))

SyntaxError: invalid syntax (<ipython-input-36-85fb829fdb0e>, line 3)

In [35]:
a = MNB(df)

AttributeError: 'SeriesGroupBy' object has no attribute 'appy'