In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, operator
%matplotlib inline

###Class Distribution

#### Calculate fraction of documents in each class

$$\pi_j = \frac{class_{j}}{\sum\limits_{j=1}^{20} class_{j} }$$

In [2]:
#Training label
train_label = open('/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.label')

#pi is the fraction of each class
pi = {}

#Set a class index for each document as key
for i in range(1,21):
    pi[i] = 0
    
#Extract values from training labels
lines = train_label.readlines()

#Get total number of documents
total = len(lines)

#Count the occurence of each class
for line in lines:
    val = int(line.split()[0])
    pi[val] += 1

#Divide the count of each class by total documents 
for key in pi:
    pi[key] /= total
    
print("Probability of each class:")
print("\n".join("{}: {}".format(k, v) for k, v in pi.items()))

Probability of each class:
1: 0.04259472890229834
2: 0.05155736977549028
3: 0.05075871860857219
4: 0.05208980388676901
5: 0.051024935664211554
6: 0.052533498979501284
7: 0.051646108794036735
8: 0.052533498979501284
9: 0.052888455053687104
10: 0.0527109770165942
11: 0.05306593309078002
12: 0.0527109770165942
13: 0.05244475996095483
14: 0.0527109770165942
15: 0.052622237998047744
16: 0.05315467210932647
17: 0.04836276510781791
18: 0.05004880646020055
19: 0.04117490460555506
20: 0.033365870973467035


In [3]:
#Check if sum of the probabilities is 1
np.sum(list(pi.values()))

1.0

###Probability Distribution over V

####Dataframe

In [4]:
#Training data
train_data = open('/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.data')
df = pd.read_csv(train_data, delimiter=' ', names=['docIdx', 'wordIdx', 'count'])

#Training label
label = []
train_label = open('/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.label')
lines = train_label.readlines()
for line in lines:
    label.append(int(line.split()[0]))

#Increase label length to match docIdx
docIdx = df['docIdx'].values
i = 0
new_label = []
for index in range(len(docIdx)-1):
    new_label.append(label[i])
    if docIdx[index] != docIdx[index+1]:
        i += 1
new_label.append(label[i]) #for-loop ignores last value

#Add label column
df['classIdx'] = new_label

df.head()

Unnamed: 0,docIdx,wordIdx,count,classIdx
0,1,1,4,1
1,1,2,2,1
2,1,3,10,1
3,1,4,4,1
4,1,5,2,1


####Probability of each word per class

For calculating our probability, we will find the average of each word for a given class.

For class j and word i, the average is given by:

$$P(i|j) = \frac{word_{ij}}{word_j}$$


However, since some words will have 0 counts, we will perform a Laplace Smoothing:



$$ P(i|j) = log(\frac{word_{ij}+1}{word_j+|V|})$$

where $V$ is an array of all the words in the vocabulary

In [5]:
#Calculate probability of each word based on class
pb_ij = df.groupby(['classIdx','wordIdx'])
pb_j = df.groupby(['classIdx'])
Pr =  (pb_ij['count'].sum() + 1) / (pb_j['count'].sum() + 16689)    

#Unstack series
Pr = Pr.unstack()

#Replace NaN or columns with 0 as word count with 1/(|V|+1)
Pr = Pr.fillna(1/16689)
Pr

wordIdx,1,2,3,4,5,6,7,8,9,10,...,53966,53967,53968,53969,53970,53971,53972,53973,53974,53975
classIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8.5e-05,0.000387,0.001668,6e-05,0.000502,0.000254,4.2e-05,1.2e-05,0.000211,0.000852,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
2,0.00048,0.000472,6e-05,0.000142,0.000118,0.000464,8.7e-05,5.5e-05,0.001362,3.1e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
3,0.000112,0.000651,6e-05,0.000168,0.000205,0.000326,2.8e-05,2.8e-05,0.001349,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
4,7.8e-05,0.000276,6e-05,6e-05,9.5e-05,0.000423,2.6e-05,1.7e-05,0.000423,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
5,6.8e-05,0.00033,6e-05,1.9e-05,1.9e-05,0.000467,1.9e-05,6e-05,0.000467,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
6,0.000283,0.001315,6e-05,0.000472,9.4e-05,0.000313,0.00013,2.4e-05,0.001404,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
7,6e-05,0.000373,6e-05,3.9e-05,3.9e-05,0.000424,6e-05,6e-05,0.000373,5.1e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
8,7.6e-05,0.000421,6e-05,6e-05,0.000107,0.000665,6.1e-05,3.1e-05,0.000145,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
9,0.000126,0.00057,6e-05,4.2e-05,4.2e-05,0.000704,3.4e-05,1.7e-05,4.2e-05,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05
10,1.6e-05,0.000273,6e-05,2.4e-05,1.6e-05,0.002432,1.6e-05,6e-05,2.4e-05,6e-05,...,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05,6e-05


###Multinomial Naive Bayes Classifier

Combining probability distribution of P with fraction of documents belonging to each class (for class j, word i at a word frequency of f)

$$Pr(j) = \pi_j \prod\limits_{i=1}^n Pr(i|j)^f$$

One issue is that, if a word appears again, the probability of it appearing again goes up. In order to smooth this, we take the log of the frequency

$$Pr(j) = \pi_j \prod\limits_{i=1}^n Pr(i|j)log(1+f)$$


In [7]:
#Convert to dictionary for greater speed
Pr_dict = Pr.to_dict()

#### Generating function

In [13]:
def MNB(df, log = False):
    '''
    Multinomial Naive Bayes classifier
    :param df [Pandas Dataframe]: Dataframe of data
    :param log [bool]: Apply frequency smoothing if True
    :return predict [list]: Predicted class ID
    '''
    #Avoid damaging the original df. Use deep copy to avoid new_df pointing at df.
    new_df = df.copy(deep=True)
    
    #Creating a probability row for each class
    for i in range(1,21):
        #Check for frequency smoothing
        if log:
            new_df[i] = new_df['wordIdx'].apply(lambda x: Pr_dict[x][i])  
            new_df[i] *= new_df['count'].apply(lambda x: np.log(x+1))
            new_df[i] *= pi[i]
        else:
            new_df[i] = new_df['wordIdx'].apply(lambda x: Pr_dict[x][i]).pow(new_df['count'])
            new_df[i] *= pi[i]
            
    #Drop count column to increase speed of cumsum. Remove classIdx if available
    try: 
        new_df.drop(['count','classIdx'], axis=1, inplace=True)
    except:
        new_df.drop('count', axis=1, inplace=True)
        
    #Changing index (similar to groupby) for cumsum 
    classes = [int(i) for i in range(1,21)]
    predict_prob = new_df.reset_index()
    predict_prob = predict_prob.set_index(['docIdx', 'wordIdx', 'index'])
    
    #cumsum needs the dataframe size to remain the same
    predict_prob = predict_prob.groupby(level=[0,1,2]).sum().groupby(level=[0,1]).cumsum()
    
    #Get class ID with the largest probability
    predict_class = preict_prob.idxmax(axis=1)
    return predict_class.values

In [None]:
MNB(df)