In [1]:
from nltk import tokenize
from operator import itemgetter
import math

In [2]:
doc = '''Widely used in knowledge-driven organizations, text mining is the process of examining large collections of documents to discover new information or help answer specific research questions.

Text mining identifies facts, relationships and assertions that would otherwise remain buried in the mass of textual big data. Once extracted, this information is converted into a structured form that can be further analyzed, or presented directly using clustered HTML tables, mind maps, charts, etc. Text mining employs a variety of methodologies to process the text, one of the most important of these being Natural Language Processing (NLP).

The structured data created by text mining can be integrated into databases, data warehouses or business intelligence dashboards and used for descriptive, prescriptive or predictive analytics.'''

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english'))

In [4]:
total_words = doc.split()
total_word_length = len(total_words)
print(total_word_length)

120


In [5]:
total_sentences = tokenize.sent_tokenize(doc)
total_sent_len = len(total_sentences)
print(total_sent_len)

5


In [6]:
tf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in tf_score:
            tf_score[each_word] += 1
        else:
            tf_score[each_word] = 1

# Dividing by total_word_length for each dictionary element
tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
print(tf_score)

{'Widely': 0.008333333333333333, 'used': 0.016666666666666666, 'knowledge-driven': 0.008333333333333333, 'organizations,': 0.008333333333333333, 'text': 0.016666666666666666, 'mining': 0.03333333333333333, 'process': 0.016666666666666666, 'examining': 0.008333333333333333, 'large': 0.008333333333333333, 'collections': 0.008333333333333333, 'documents': 0.008333333333333333, 'discover': 0.008333333333333333, 'new': 0.008333333333333333, 'information': 0.016666666666666666, 'help': 0.008333333333333333, 'answer': 0.008333333333333333, 'specific': 0.008333333333333333, 'research': 0.008333333333333333, 'questions': 0.008333333333333333, 'Text': 0.016666666666666666, 'identifies': 0.008333333333333333, 'facts,': 0.008333333333333333, 'relationships': 0.008333333333333333, 'assertions': 0.008333333333333333, 'would': 0.008333333333333333, 'otherwise': 0.008333333333333333, 'remain': 0.008333333333333333, 'buried': 0.008333333333333333, 'mass': 0.008333333333333333, 'textual': 0.008333333333

In [7]:
def check_sent(word, sentences): 
    final = [all([w in x for w in word]) for x in sentences] 
    sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
    return int(len(sent_len))

In [8]:
idf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in idf_score:
            idf_score[each_word] = check_sent(each_word, total_sentences)
        else:
            idf_score[each_word] = 1

# Performing a log and divide
idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

print(idf_score)

{'Widely': 1.6094379124341003, 'used': 0.0, 'knowledge-driven': 1.6094379124341003, 'organizations,': 1.6094379124341003, 'text': 0.0, 'mining': 0.0, 'process': 0.0, 'examining': 1.6094379124341003, 'large': 1.6094379124341003, 'collections': 1.6094379124341003, 'documents': 1.6094379124341003, 'discover': 1.6094379124341003, 'new': 1.6094379124341003, 'information': 0.0, 'help': 1.6094379124341003, 'answer': 1.6094379124341003, 'specific': 1.6094379124341003, 'research': 1.6094379124341003, 'questions': 1.6094379124341003, 'Text': 0.22314355131420976, 'identifies': 1.6094379124341003, 'facts,': 1.6094379124341003, 'relationships': 1.6094379124341003, 'assertions': 1.6094379124341003, 'would': 1.6094379124341003, 'otherwise': 1.6094379124341003, 'remain': 1.6094379124341003, 'buried': 1.6094379124341003, 'mass': 1.6094379124341003, 'textual': 1.6094379124341003, 'big': 1.6094379124341003, 'data': 0.0, 'Once': 1.6094379124341003, 'extracted,': 1.6094379124341003, 'converted': 1.60943791

In [9]:
tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
print(tf_idf_score)

{'Widely': 0.013411982603617503, 'used': 0.0, 'knowledge-driven': 0.013411982603617503, 'organizations,': 0.013411982603617503, 'text': 0.0, 'mining': 0.0, 'process': 0.0, 'examining': 0.013411982603617503, 'large': 0.013411982603617503, 'collections': 0.013411982603617503, 'documents': 0.013411982603617503, 'discover': 0.013411982603617503, 'new': 0.013411982603617503, 'information': 0.0, 'help': 0.013411982603617503, 'answer': 0.013411982603617503, 'specific': 0.013411982603617503, 'research': 0.013411982603617503, 'questions': 0.013411982603617503, 'Text': 0.0037190591885701628, 'identifies': 0.013411982603617503, 'facts,': 0.013411982603617503, 'relationships': 0.013411982603617503, 'assertions': 0.013411982603617503, 'would': 0.013411982603617503, 'otherwise': 0.013411982603617503, 'remain': 0.013411982603617503, 'buried': 0.013411982603617503, 'mass': 0.013411982603617503, 'textual': 0.013411982603617503, 'big': 0.013411982603617503, 'data': 0.0, 'Once': 0.013411982603617503, 'ex

In [10]:
def get_top_n(dict_elem, n):
    result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
    return result

In [11]:
print(get_top_n(tf_idf_score, 5))

{'Widely': 0.013411982603617503, 'knowledge-driven': 0.013411982603617503, 'organizations,': 0.013411982603617503, 'examining': 0.013411982603617503, 'large': 0.013411982603617503}
