In [108]:
# Importing Libraries
from nltk import tokenize
from operator import itemgetter
import math

In [None]:
# TF (Term Frequency) = Number of times a term t appears in the text / Total number of words in the document
# IDF (Inverse Document Frequency) = log(total number of sentences / Number of sentences with term t)
# TF-IDF = TF * IDF = More TF-IDF value, more important is the variable

In [None]:
# Document -> Vectorize -> Find TF -> Find IDF -> Find TF*IDF -> Keywords

In [90]:
# Document text
#doc = 'I am a graduate. I want to learn Python. I like learning Python. Python is easy. Python is interesting'
doc = 'Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented, and functional programming. Python is often described as a "batteries included" language due to its comprehensive standard library. Python was created in the late 1980s, and first released in 1991, by Guido van Rossum as a successor to the ABC programming language.'

In [91]:
# Remove stopwords
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

In [92]:
# Step 1 : Find total words in the document
total_words = doc.split()
total_word_length = len(total_words)
print(total_word_length)

58


In [93]:
# Step 2 : Find total number of sentences
total_sentences = tokenize.sent_tokenize(doc)
total_sent_len = len(total_sentences)
print(total_sent_len)

4


In [94]:
# Step 3: Calculate TF for each word
tf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in tf_score:
            tf_score[each_word] += 1
        else:
            tf_score[each_word] = 1
print(tf_score)

# Dividing by total_word_length for each dictionary element
tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())

print(tf_score)

{'paradigms,': 1, 'van': 1, 'Rossum': 1, 'ABC': 1, 'late': 1, '(particularly,': 1, 'It': 1, 'standard': 1, 'object-oriented,': 1, 'due': 1, 'successor': 1, 'library': 1, 'typed': 1, 'dynamically': 1, 'often': 1, 'described': 1, 'functional': 1, 'multiple': 1, '1991,': 1, 'first': 1, 'programming': 3, 'released': 1, 'Guido': 1, 'including': 1, '1980s,': 1, 'supports': 1, 'structured': 1, 'language': 2, 'created': 1, 'garbage-collected': 1, 'procedural),': 1, 'included"': 1, 'comprehensive': 1, 'Python': 3, '"batteries': 1}
{'paradigms,': 0.017241379310344827, 'van': 0.017241379310344827, 'Rossum': 0.017241379310344827, 'ABC': 0.017241379310344827, 'late': 0.017241379310344827, '(particularly,': 0.017241379310344827, 'It': 0.017241379310344827, 'standard': 0.017241379310344827, 'object-oriented,': 0.017241379310344827, 'due': 0.017241379310344827, 'successor': 0.017241379310344827, 'library': 0.017241379310344827, 'typed': 0.017241379310344827, 'dynamically': 0.017241379310344827, 'often

In [95]:
# Check if a word is there in sentence list
def check_sent(word, sentences): 
    final = [all([w in x for w in word]) for x in sentences] 
    sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
    return int(len(sent_len))


# Step 4: Calculate IDF for each word
idf_score = {}
for each_word in total_words:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in idf_score:
            idf_score[each_word] = check_sent(each_word, total_sentences)
        else:
            idf_score[each_word] = 1

print(idf_score)

{'paradigms,': 1, 'van': 1, 'Rossum': 1, 'ABC': 1, 'late': 1, '(particularly,': 1, 'It': 1, 'standard': 1, 'object-oriented,': 1, 'due': 1, 'successor': 1, 'library': 1, 'typed': 1, 'dynamically': 1, 'often': 1, 'described': 1, 'functional': 1, 'multiple': 1, '1991,': 1, 'first': 1, 'programming': 4, 'released': 1, 'Guido': 1, 'including': 1, '1980s,': 1, 'supports': 1, 'structured': 1, 'language': 3, 'created': 1, 'garbage-collected': 1, 'procedural),': 1, 'included"': 1, 'comprehensive': 1, 'Python': 3, '"batteries': 1}


In [109]:
# Step 5: Calculating TF*IDF
tf_idf_score = {key: math.log(tf_score[key] * idf_score.get(key, 0)) for key in tf_score.keys()} 
print(tf_idf_score)

{'paradigms,': -4.060443010546419, 'first': -4.060443010546419, 'supports': -4.060443010546419, 'ABC': -4.060443010546419, 'released': -4.060443010546419, '(particularly,': -4.060443010546419, 'It': -4.060443010546419, 'standard': -4.060443010546419, 'object-oriented,': -4.060443010546419, 'due': -4.060443010546419, 'typed': -4.060443010546419, 'dynamically': -4.060443010546419, 'often': -4.060443010546419, 'created': -4.060443010546419, 'successor': -4.060443010546419, 'programming': -1.575536360758419, 'multiple': -4.060443010546419, 'described': -4.060443010546419, 'functional': -4.060443010546419, 'van': -4.060443010546419, 'including': -4.060443010546419, 'library': -4.060443010546419, '1991,': -4.060443010546419, 'garbage-collected': -4.060443010546419, '1980s,': -4.060443010546419, 'Rossum': -4.060443010546419, 'structured': -4.060443010546419, 'language': -2.268683541318364, 'late': -4.060443010546419, 'Guido': -4.060443010546419, 'procedural),': -4.060443010546419, 'included"'

In [110]:
# Get top N important words in the document
def get_top_n(dict_elem, n):
    result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
    return result

In [111]:
print(get_top_n(tf_idf_score, 5))

{'paradigms,': -4.060443010546419, 'first': -4.060443010546419, 'Python': -1.8632184332102, 'programming': -1.575536360758419, 'language': -2.268683541318364}
