<a href="https://colab.research.google.com/github/srikz4/study/blob/main/keyword_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Keyword Analysis using TF-IDF

import numpy as np
import pandas as pd
import math
from nltk.corpus import stopwords
import json
import ast
import nltk
from nltk.tokenize import WhitespaceTokenizer, word_tokenize
import unicodedata
import sys

nltk.download('punkt')
nltk.download('stopwords')
w_token = WhitespaceTokenizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
def Read(filename):
    contents = ''
    with open('articles/' + filename, encoding='utf8') as f:
        contents = f.read()
    return contents

def Get(category):
    data = ''
    for file in categories[category]:
        data += Read(file)
    return data

def cleanup(data):
    data = data.lower()
    data = data.replace('\n', ' ').replace('\r', ' ').replace('\s', ' ')
    punctuations = '''!()-[]{};:’'\"\\,<>./?@#$%^&*_~'''

    for word in data:
        if word in punctuations:
            data = data.replace(word, '')
    
    tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P'))
    data.translate(tbl)
    
    data = ''.join([i for i in data if not i.isdigit()])
    data = [word for word in word_tokenize(data) if not word in stopwords.words('english')]
    
    return data

In [3]:
def BagOfWords(token):    
    termFrequencies[token] = 0
        
    for word in corpus_tokens:
        if token == word:
            termFrequencies[token] += 1
  
    return termFrequencies[token]

def IDF(token):
    return math.log(float(len(corpus_tokens)) / termFrequencies[token])

def calculate(data):
    data['TF'] = data.apply(lambda x: BagOfWords(x.Term), axis=1)
    data['IDF'] = data.apply(lambda x: IDF(x.Term), axis=1)
    data['TFIDF'] = data.apply(lambda x: x.TF * x.IDF, axis=1)
    return data

In [4]:
_COVID = 'covid'
_ARYANKHAN = 'aryankhan'
_T20 = 't20'

files = ['c1a1', 'c1a2', 'c1a3', 'c2a1', 'c2a2', 'c2a3', 'c3a1', 'c3a2', 'c3a3']
categories = { 'covid' : files[:3], 'aryankhan' : files[3:6], 't20' : files[6:9] }

data_set = [ _COVID, _ARYANKHAN, _T20 ]

top_2 = { _COVID : None, _ARYANKHAN : None, _T20 : None }

In [11]:
for _dataset in data_set:
    
    termFrequencies = {}
    corpus = Get(_dataset)

    corpus_tokens = cleanup(corpus)
    corpus_set = set(corpus_tokens)

    df = pd.DataFrame(corpus_set, columns=['Term'])
    df.style.hide_index()

    print('Total Words  : ' + str(len(corpus_tokens)))
    print('Unique Words : ' + str(len(corpus_set)))

    print('[' + _dataset + '] Data (after cleanup)')
    display(df)

    df = calculate(df)
    print('[' + _dataset + '] Data (with TF, IDF, TF-IDF)')
    display(df)

    top_2[_dataset] = None
    df.sort_values(by=['TFIDF'], inplace=True, ascending=False)
    top_2[_dataset] = df.loc[:1, ['Term']]
    
    del df
    
    print('\n')

Total Words  : 1127
Unique Words : 633
[covid] Data (after cleanup)


Unnamed: 0,Term
0,equitable
1,orders
2,tests
3,vaccinations
4,strengthening
...,...
628,millions
629,immunization
630,around
631,revoked


[covid] Data (with TF, IDF, TF-IDF)


Unnamed: 0,Term,TF,IDF,TFIDF
0,equitable,1,7.027315,7.027315
1,orders,1,7.027315,7.027315
2,tests,1,7.027315,7.027315
3,vaccinations,1,7.027315,7.027315
4,strengthening,1,7.027315,7.027315
...,...,...,...,...
628,millions,1,7.027315,7.027315
629,immunization,1,7.027315,7.027315
630,around,2,6.334167,12.668335
631,revoked,1,7.027315,7.027315




Total Words  : 1774
Unique Words : 749
[aryankhan] Data (after cleanup)


Unnamed: 0,Term
0,ncb
1,charged
2,documents
3,johar
4,preplanned
...,...
744,asaduddin
745,stand
746,events
747,accused


[aryankhan] Data (with TF, IDF, TF-IDF)


Unnamed: 0,Term,TF,IDF,TFIDF
0,ncb,26,4.222896,109.795286
1,charged,1,7.480992,7.480992
2,documents,3,6.382380,19.147140
3,johar,1,7.480992,7.480992
4,preplanned,2,6.787845,13.575690
...,...,...,...,...
744,asaduddin,1,7.480992,7.480992
745,stand,1,7.480992,7.480992
746,events,1,7.480992,7.480992
747,accused,6,5.689233,34.135396




Total Words  : 908
Unique Words : 445
[t20] Data (after cleanup)


Unnamed: 0,Term
0,franchise
1,accommodate
2,big
3,leaders
4,probably
...,...
440,given
441,wait
442,events
443,consistent


[t20] Data (with TF, IDF, TF-IDF)


Unnamed: 0,Term,TF,IDF,TFIDF
0,franchise,1,6.811244,6.811244
1,accommodate,1,6.811244,6.811244
2,big,2,6.118097,12.236194
3,leaders,2,6.118097,12.236194
4,probably,3,5.712632,17.137896
...,...,...,...,...
440,given,3,5.712632,17.137896
441,wait,1,6.811244,6.811244
442,events,1,6.811244,6.811244
443,consistent,1,6.811244,6.811244






In [12]:
print('\n')
for _dataset in data_set:
    print('[' + _dataset + '] Data (Top 10 Keywords)')
    print('-'*(24+len(_dataset)))
    for i in range(0, 10):
        print(top_2[_dataset]['Term'].values[i])
    print('\n')



[covid] Data (Top 10 Keywords)
-----------------------------
covid
health
pandemic
billion
testing
collection
customers
percent
vaccine
kit


[aryankhan] Data (Top 10 Keywords)
---------------------------------
aryan
khan
october
court
bail
ncb
jail
drugs
case
said


[t20] Data (Top 10 Keywords)
---------------------------
india
world
cup
afghanistan
team
win
new
also
pakistan
namibia


