In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, operator
%matplotlib inline

### Class Distribution

#### Calculate the fraction of documents in each class

In [5]:
# Estimate the class priors: pi[c] is the fraction of training documents
# whose label is class c (classes are numbered 1..20).

# Training labels: each line starts with the integer class index of one document
train_label_path = '/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.label'

# Read all label lines; the context manager closes the handle even on error
with open(train_label_path) as train_label:
    lines = train_label.readlines()

# Total number of documents (one label line per document)
total = len(lines)

# Start every class at zero so classes absent from the file still get an entry
pi = {class_idx: 0 for class_idx in range(1, 21)}

# Count the occurrences of each class
for line in lines:
    pi[int(line.split()[0])] += 1

# Turn the raw counts into fractions of the whole training set
for key in pi:
    pi[key] /= total

print('Created dictionary with', len(pi), 'classes')

Created dictionary with 20 classes


### Probability Distribution over V

#### Average count of each word per class

In [6]:
# Load the training word counts and attach the class label of each document.

# Training data: space-separated rows of (docIdx, wordIdx, count)
train_data_path = '/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.data'
with open(train_data_path) as train_data:
    df = pd.read_csv(train_data, delimiter=' ', names=['docIdx', 'wordIdx', 'count'])

# Training labels: the first token of each line is a document's class index
train_label_path = '/home/sadat/Downloads/HW2_210/20news-bydate/matlab/train.label'
with open(train_label_path) as train_label:
    lines = train_label.readlines()
label = [int(line.split()[0]) for line in lines]

# Expand the per-document labels to one label per data row.
# Assumes docIdx is the 1-based line number of the document in train.label
# (TODO confirm against the dataset README). Looking each row up directly keeps
# labels aligned even if some document id never appears in train.data, whereas
# advancing a counter on every docIdx change would shift all later labels.
docIdx = df['docIdx'].values
if len(docIdx) and (docIdx.min() < 1 or docIdx.max() > len(label)):
    raise ValueError('docIdx values fall outside the range of available labels')
new_label = np.asarray(label, dtype=int)[docIdx - 1]

# Add label column
df['label'] = new_label

df.head()

Unnamed: 0,docIdx,wordIdx,count,label
0,1,1,4,1
1,1,2,2,1
2,1,3,10,1
3,1,4,4,1
4,1,5,2,1


#### Inverse Document Frequency

Since the calculation using all 61188 words takes a long time to run, we will pick the top 100 words using IDF.

In [28]:
# Score every vocabulary word by log(rows containing the word / number of documents).
# Note this is the log of the document frequency itself (not its inverse), so a
# higher score means the word appears in more rows of the training data.

# Largest word index scored (word indices run from 1 to VOCAB_SIZE inclusive)
VOCAB_SIZE = 61188

totDoc = len(df['docIdx'].unique())
if totDoc == 0:
    raise ValueError('training data contains no documents')

# One pass over the data: number of rows per word index. Reindexing onto the
# full vocabulary gives words that never occur a count of 0.
word_ids = range(1, VOCAB_SIZE + 1)
row_counts = df['wordIdx'].value_counts().reindex(word_ids, fill_value=0)

# log(0) = -inf for words that never occur; that is intended (they rank last),
# so silence the divide-by-zero warning NumPy would otherwise emit.
with np.errstate(divide='ignore'):
    log_freq = np.log(row_counts.to_numpy() / totDoc)

# Keys are inserted in ascending word index order
idf = dict(zip(word_ids, log_freq))

print('TF-IDF complete!')

TF-IDF complete!


In [45]:
# Rank the (wordIdx, score) pairs from highest score to lowest
ranked_pairs = sorted(idf.items(), key=lambda pair: pair[1], reverse=True)

# Keep only the 100 highest-scoring pairs
sorted_idf = ranked_pairs[:100]

# imp_words holds the word indices of the retained pairs, best first
imp_words = [pair[0] for pair in sorted_idf]

#### Average count of each word per class

In [46]:
# P[c] is a list, ordered like imp_words, of smoothed relative frequencies of
# each selected word within class c.
P = {}

for class_idx in range(1, 21):
    # Rows belonging to this class; filtered once instead of once per word
    class_rows = df[df['label'] == class_idx]

    # Total word count in the class (NaN counts, if any, are ignored)
    class_total = np.nansum(class_rows['count'])

    # Summed count of every word in the class, then aligned to imp_words;
    # selected words that never occur in the class get a count of 0
    word_totals = class_rows.groupby('wordIdx')['count'].sum()
    selected_counts = word_totals.reindex(imp_words, fill_value=0)

    # Additive smoothing: (count + 1) / (class total + 2).
    # NOTE(review): the denominator adds 2 rather than the vocabulary size;
    # confirm this is the intended smoothing.
    P[class_idx] = ((selected_counts + 1) / (class_total + 2)).tolist()

P



{1: [0.048140631929791552,
  0.028048436303036006,
  0.018062816670474553,
  0.018331608585213755,
  0.027436934697004315,
  0.023337857997231443,
  0.015374897523082506,
  0.021106885104896046,
  0.0078823228997271753,
  0.0081511148144663807,
  0.0053825580926525731,
  0.0078084051231738947,
  0.013110325641404706,
  0.0087222976332871904,
  0.0046165011356458395,
  0.010650879621540984,
  0.0086819788460763096,
  0.0057790261668929001,
  0.006209093230475627,
  0.0037899659978227855,
  0.0059537409114733828,
  0.0047576168908839223,
  0.0068340344322442778,
  0.0042939508379587939,
  0.0027551171260768478,
  0.0030843872216323731,
  0.0037563670084803848,
  0.005133925571518809,
  0.0043275498273011945,
  0.0049457712312013652,
  0.0028424744983670891,
  0.0036690096361901435,
  0.0050600077949655275,
  0.0046635397207252004,
  0.0033867781257139787,
  0.0048852930503850448,
  0.004549303156961039,
  0.0034472563065302995,
  0.0042805112422218336,
  0.0042536320507479139,
  0.002439

### Final Classifier

#### Combining probability distribution of P with fraction of documents belonging to each class

In [48]:
# Weight each class's word probabilities by the class prior pi[label]
classifier = {}
for label in range(1, 21):
    prior = pi[label]
    classifier[label] = [prior * word_prob for word_prob in P[label]]

In [49]:
# Display the classifier: class index -> list of prior-weighted word probabilities
classifier

{1: [0.0020505371662347983,
  0.0011947155404612017,
  0.00076938077929077867,
  0.00078082989803022465,
  0.0011686687953289619,
  0.00099406973455240868,
  0.00065488959189631752,
  0.00089904204901500588,
  0.00033574540703425719,
  0.00034719452577370333,
  0.00022926860275740839,
  0.00033259689938090953,
  0.00055843076651648403,
  0.00037152390309502629,
  0.00019663861434998694,
  0.00045367133005055216,
  0.00036980653528410939,
  0.00024615605289809138,
  0.00026447464288120514,
  0.00016143257422619015,
  0.00025359798007873133,
  0.00020264940168819616,
  0.00029109384395041734,
  0.0001828996718626516,
  0.00011735346707932265,
  0.00013137863753514412,
  0.00016000143438375941,
  0.00021867816792342073,
  0.00018433081170508237,
  0.00021066378480580843,
  0.00012107443066964262,
  0.00015628047079343943,
  0.00021552966027007305,
  0.00019864221012939,
  0.000144258896117021,
  0.00020808773308943307,
  0.00019377633466512544,
  0.00014683494783339636,
  0.00018232721592

In [53]:
# Store the classifier in a JSON file.
# Open in write mode so re-running the cell replaces the previous dump; append
# mode would concatenate several JSON documents into a file that cannot be parsed.
# json.dump writes the integer class keys as strings, so loaders must convert them back.
with open('classified.json', 'w') as f:
    json.dump(classifier, f)