# Build an IDF lookup dictionary from a corpus

The Reuters corpus (from NLTK) is used below.

In [1]:
import nltk
import math
from nltk.corpus import reuters,brown
# Lemmatizer used to normalise tokens.
wnl = nltk.WordNetLemmatizer()
# File ids of the Reuters corpus, one per document.
cfileids = reuters.fileids()

Build a unigram (1-gram) dictionary that maps each lemmatized word to its document frequency, i.e. the number of documents it occurs in.

In [2]:
%%time 

document_freq = {} # Dictionary: key: word,  value:# of documents word occurs in 

for fname in cfileids:
    corptext = [wrd.lower() for wrd in reuters.words(fname) if wrd.isalpha()]
    corptextlemma = [wnl.lemmatize(wrd.lower()) for wrd in corptext]
    corptextlemmaset = set(corptextlemma)
    for wrd in corptextlemmaset:
        document_freq[wrd] = document_freq.get(wrd,0) + 1 

print('Total number of unique lemmatized words found: ',len(document_freq),' in a total of ',len(cfileids),' documents')

totaldocs = len(cfileids)

Total number of unique lemmatized words found:  26723  in a total of  10788  documents
Wall time: 15.3 s


In [3]:
# IDF table: key = word, value = log(totaldocs) - log(1 + document frequency).
idf_dict = {
    lemma: math.log(totaldocs) - math.log(1 + doc_count)
    for lemma, doc_count in document_freq.items()
}

In [4]:
def get_idf(wrd):
    """Return the IDF value of ``wrd`` in the corpus.

    The word is normalised the same way the keys of ``idf_dict`` were
    built (lower-cased, then lemmatized with ``wnl``), so inflected forms
    resolve to the entry of their lemma instead of being treated as unseen.

    Words that are not in ``idf_dict`` get ``math.log(totaldocs)``, the
    value the IDF formula yields for a document frequency of zero.
    """
    lemma = wnl.lemmatize(wrd.lower())
    return idf_dict.get(lemma, math.log(totaldocs))

In [5]:
# Spot-check a few words: the rarer the word in the corpus, the larger its IDF.
for sample_word in ('long', 'memory', 'fun'):
    print(get_idf(sample_word))

3.1355869158133407
6.2904574107056295
9.28618968425962


This definitely needs a bigger corpus, ideally one from the science and technology domain (Reuters consists of news articles).
The plan is to build the IDF dictionary from a Wikipedia dump instead.

In [6]:
import pickle

# Persist the IDF table to disk. The `with` block closes the file on exit,
# so no explicit close() call is needed afterwards.
with open('idf_dict.pickle', 'wb') as f:
    pickle.dump(idf_dict, f)