In [87]:
import os
import pandas as pd
import numpy
import xml.etree.ElementTree as ET 
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Accessing required Files

In [88]:
# Collect the corpus files: every entry of the "business" directory whose name
# ends in "utf8". os.listdir returns entries in arbitrary, platform-dependent
# order, so the names are sorted to make index-based access reproducible
# across machines and re-runs.
all_files = sorted(name for name in os.listdir("business") if name.endswith("utf8"))
files = all_files

In [89]:
# Directory prefix (with trailing slash) prepended to every corpus file name.
basePath = "business/"
def getFileName(num):
    """Return the path of the corpus file at position ``num`` in ``files``.

    The path is ``basePath`` concatenated with the file name.
    """
    file_name = files[num]
    return basePath + file_name

## Computing TF, IDF, TF-IDF Scores

In [90]:
def computeTF(wordDict, words):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> raw occurrence count in the document.
        words: The document's token list; its length is the normaliser.

    Returns:
        A dict mapping each key of ``wordDict`` to ``count / len(words)``.
        For an empty ``words`` list every frequency is 0.0 (instead of
        raising ZeroDivisionError).
    """
    wordCount = len(words)
    if wordCount == 0:
        # No tokens at all: define every term frequency as zero.
        return {word: 0.0 for word in wordDict}
    total = float(wordCount)
    return {word: count / total for word, count in wordDict.items()}

In [91]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: List of per-document dicts mapping word -> raw count.
            A word counts as present in a document when its count is > 0.

    Returns:
        A dict mapping each word found in any document dict to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        the number of documents containing the word. Words that occur in no
        document (``df == 0``) get 0.0 rather than raising
        ZeroDivisionError. An empty ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)

    # Document frequency per word. The vocabulary is gathered from every
    # document (not only the first), so differing key sets cannot cause a
    # KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, df in docFreq.items():
        idfDict[word] = math.log10(N / float(df)) if df > 0 else 0.0

    return idfDict

In [92]:
def computeTFIDF(tfBow, idfs):
    """Combine term frequencies with IDF weights.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; must contain
            every key of ``tfBow``.

    Returns:
        A dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

## Extract Text using XML parser

In [93]:
def getText(xmlFilePath):
    """Return the text of the second child element of an XML document's root.

    Args:
        xmlFilePath: Path of the XML file to parse.

    Returns:
        The ``.text`` of ``root[1]``; ``None`` if that element has no text.
        (Presumably the second child holds the article body -- verify against
        the corpus layout.)

    Raises:
        IndexError: If the root element has fewer than two children.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Use a context manager so the file handle is closed even when parsing fails.
    with open(xmlFilePath) as xmlFile:
        tree = ET.parse(xmlFile)
    root = tree.getroot()
    return root[1].text

## Pre-processing (Tokenization)

In [94]:
def preprocess_part1(text):
    """Tokenize ``text`` into a list of word tokens.

    Surrounding whitespace is stripped first; the text is then split with an
    NLTK ``RegexpTokenizer`` that keeps runs of word characters, so
    punctuation is dropped.
    """
    word_tokenizer = nltk.RegexpTokenizer(r"\w+")
    return word_tokenizer.tokenize(text.strip())

## Pre-processing (Stop-word removal & Lemmatization)

In [95]:
def preprocess_part2(text):
    """Tokenize, lower-case, remove English stop words and lemmatize ``text``.

    Args:
        text: Raw document text.

    Returns:
        List of lemmatized, lower-cased tokens with stop words removed, in
        their original order.
    """
    # Load the stop-word list once and keep it in a set: the corpus reader
    # would otherwise be invoked again for every single token, and set
    # membership is O(1).
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lowered = (token.lower() for token in preprocess_part1(text))
    return [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]

## Processing five files by my approach

In [96]:
# Common pre-processing as done in assignment 1, applied to the first five
# corpus files (indices 0-4), one token list per document.
text1, text2, text3, text4, text5 = (
    preprocess_part2(getText(getFileName(idx))) for idx in range(5)
)

In [97]:
# Shared vocabulary: every distinct token that occurs in any of the five documents.
word_union = set().union(text1, text2, text3, text4, text5)

In [98]:
# One independent, zero-initialised count table per document, each keyed by
# the shared vocabulary.
wordDict1, wordDict2, wordDict3, wordDict4, wordDict5 = (
    dict.fromkeys(word_union, 0) for _ in range(5)
)

In [99]:
# Forming Word Dictionary by counting Frequency of each word

# Pair every count table with its document's tokens and tally occurrences.
for counts, tokens in (
    (wordDict1, text1),
    (wordDict2, text2),
    (wordDict3, text3),
    (wordDict4, text4),
    (wordDict5, text5),
):
    for word in tokens:
        counts[word] += 1

In [100]:
# Raw term counts: one row per document, one column per vocabulary word.
count_tables = [wordDict1, wordDict2, wordDict3, wordDict4, wordDict5]
pd.DataFrame(count_tables)

Unnamed: 0,various,exchange,lanka,anup,announced,malaysia,superb,entrepolis,banking,earlier,...,2007,slr,association,moving,model,generate,capability,9,procurement,minimise
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,1,0,1,0,0,1
2,0,3,0,1,1,0,1,0,0,1,...,0,0,1,0,0,1,0,1,1,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,1,0,1,1,0,0,0,3,...,0,0,0,1,0,0,0,1,0,0


In [101]:
# Term frequency of every vocabulary word within each of the five documents.
tf1, tf2, tf3, tf4, tf5 = (
    computeTF(counts, tokens)
    for counts, tokens in (
        (wordDict1, text1),
        (wordDict2, text2),
        (wordDict3, text3),
        (wordDict4, text4),
        (wordDict5, text5),
    )
)

In [102]:
# Inverse document frequency of every vocabulary word over the five documents.
corpus_counts = [wordDict1, wordDict2, wordDict3, wordDict4, wordDict5]
idfs = computeIDF(corpus_counts)

In [103]:
# TF-IDF score of every vocabulary word for each document (tf * idf).
tfidf1, tfidf2, tfidf3, tfidf4, tfidf5 = (
    computeTFIDF(term_freqs, idfs) for term_freqs in (tf1, tf2, tf3, tf4, tf5)
)

In [104]:
# TF-IDF scores: one row per document, one column per vocabulary word.
tfidf_tables = [tfidf1, tfidf2, tfidf3, tfidf4, tfidf5]
pd.DataFrame(tfidf_tables)

Unnamed: 0,various,exchange,lanka,anup,announced,malaysia,superb,entrepolis,banking,earlier,...,2007,slr,association,moving,model,generate,capability,9,procurement,minimise
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002589,0.0,0.0,...,0.002589,0.002589,0.0,0.0,0.002589,0.0,0.001474,0.0,0.0,0.002589
2,0.0,0.004809,0.0,0.001603,0.000913,0.0,0.001603,0.0,0.0,0.000913,...,0.0,0.0,0.001603,0.0,0.0,0.001603,0.0,0.000913,0.001603,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003584,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002041,0.0,0.0,0.0
4,0.002219,0.0,0.002219,0.0,0.001263,0.002219,0.0,0.0,0.0,0.00379,...,0.0,0.0,0.0,0.002219,0.0,0.0,0.0,0.001263,0.0,0.0


## TF-IDF Vectorization using sklearn

In [105]:
def listToString(s):
    """Concatenate the strings in ``s``, each preceded by a single space.

    The result therefore starts with a space when ``s`` is non-empty and is
    the empty string when ``s`` is empty.
    """
    return "".join(" " + ele for ele in s)

In [106]:
# Re-join each document's processed tokens into one string, the input format
# passed to the vectorizer.
t1, t2, t3, t4, t5 = (
    listToString(tokens) for tokens in (text1, text2, text3, text4, text5)
)
list_raw_text = [t1, t2, t3, t4, t5]

In [107]:
# Fit scikit-learn's TF-IDF vectorizer on the five pre-processed documents and
# show the scores as a documents x terms table.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(list_raw_text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
dataframe = pd.DataFrame(denselist, columns=feature_names)
dataframe

Unnamed: 0,000,05,10,11,1164,15,170,180,181,187,...,welcomed,well,west,witnessed,world,would,yadav,yesterday,yogendra,yojna
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.043725,0.0,0.054196,0.0,0.043725,0.0,0.108393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.072592,0.054196,0.0,0.0,0.0
2,0.053792,0.0,0.066978,0.0,0.033337,0.0,0.0,0.0,0.0,0.0,...,0.033337,0.053792,0.0,0.0,0.033337,0.066978,0.0,0.0,0.033337,0.033337
3,0.099673,0.0,0.041369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.049836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.031328,0.026005,0.0,0.0,0.031328,0.03883,0.0,0.03883,0.03883,...,0.0,0.0,0.077659,0.03883,0.0,0.052009,0.0,0.03883,0.0,0.0


## My approach vs sklearn

In [112]:
# Comparing keywords of 5th document

import operator
import itertools

# Rank the fifth document's terms by the hand-computed tf-idf score, highest
# first (ties keep their original dictionary order), then keep the top ten.
by_score = operator.itemgetter(1)
ranked_items = sorted(tfidf5.items(), key=by_score, reverse=True)
sorted_d = dict(ranked_items)
print('My approach: Top 10 keywords by tf-idf score : ')
out = dict(itertools.islice(sorted_d.items(), 10))
out

My approach: Top 10 keywords by tf-idf score : 


{'bharat': 0.02440847634189272,
 'petro': 0.01775161915774016,
 'corporation': 0.01775161915774016,
 'refinery': 0.011094761973587601,
 'price': 0.011094761973587601,
 'oil': 0.00887580957887008,
 'kochi': 0.00887580957887008,
 'merger': 0.00887580957887008,
 'petroleum': 0.006656857184152561,
 'behuria': 0.006656857184152561}

In [113]:
# Same ranking for the fifth document (row label 4), using the sklearn scores.
rowData = dataframe.loc[ 4 , : ]
d = rowData.to_dict()
sklearn_ranked = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
sorted_d = dict(sklearn_ranked)
print('Sklearn: Top 10 keywords by tf-idf score : \n')
top_ten = itertools.islice(sorted_d.items(), 10)
out = dict(top_ten)
print(out)

Sklearn: Top 10 keywords by tf-idf score : 

{'bharat': 0.4271266676165389, 'corporation': 0.31063757644839196, 'petro': 0.31063757644839196, 'price': 0.19414848528024498, 'refinery': 0.19414848528024498, 'kochi': 0.15531878822419598, 'merger': 0.15531878822419598, 'oil': 0.15531878822419598, 'high': 0.12531022037266545, 'behuria': 0.11648909116814699}


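###  Conclusion: In both approaches, 9 of the top 10 keywords are the same, although their tf-idf scores differ. The difference in magnitude is expected: the custom implementation uses log10(N/df) with length-normalised term frequency, whereas sklearn's TfidfVectorizer by default uses a smoothed natural-log IDF and L2-normalises each document vector. Because the resulting rankings agree so closely, both approaches can be used for keyword ranking.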