In [1]:
import nltk

import re

import heapq

import numpy as np

In [3]:
paragraph = """Thank you all so very much. Thank you to the Academy. 

               Thank you to all of you in this room. I have to congratulate 

               the other incredible nominees this year. The Revenant was 

               the product of the tireless efforts of an unbelievable cast

               and crew. First off, to my brother in this endeavor, Mr. Tom 

               Hardy. Tom, your talent on screen can only be surpassed by 

               your friendship off screen … thank you for creating a t

               ranscendent cinematic experience. Thank you to everybody at 

               Fox and New Regency … my entire team. I have to thank 

               everyone from the very onset of my career … To my parents; 

               none of this would be possible without you. And to my 

               friends, I love you dearly; you know who you are. And lastly,

               I just want to say this: Making The Revenant was about

               man's relationship to the natural world. A world that we

               collectively felt in 2015 as the hottest year in recorded

               history. Our production needed to move to the southern

               tip of this planet just to be able to find snow. Climate

               change is real, it is happening right now. It is the most

               urgent threat facing our entire species, and we need to work

               collectively together and stop procrastinating. We need to

               support leaders around the world who do not speak for the 

               big polluters, but who speak for all of humanity, for the

               indigenous people of the world, for the billions and 

               billions of underprivileged people out there who would be

               most affected by this. For our children’s children, and 

               for those people out there whose voices have been drowned

               out by the politics of greed. I thank you all for this 

               amazing award tonight. Let us not take this planet for 

               granted. I do not take tonight for granted. Thank you so very much."""

In [4]:
# Tokenize sentences

dataset = nltk.sent_tokenize(paragraph)

for i in range(len(dataset)):

    dataset[i] = dataset[i].lower()

    dataset[i] = re.sub(r'\W',' ',dataset[i])

    dataset[i] = re.sub(r'\s+',' ',dataset[i])

In [5]:
# Creating word histogram

word2count = {}

for data in dataset:

    words = nltk.word_tokenize(data)

    for word in words:

        if word not in word2count.keys():

            word2count[word] = 1

        else:

            word2count[word] += 1

In [6]:
# Selecting best 100 features

freq_words = heapq.nlargest(100,word2count,key=word2count.get)

In [7]:
# IDF Dictionary

word_idfs = {}

for word in freq_words:

    doc_count = 0

    for data in dataset:

        if word in nltk.word_tokenize(data):

            doc_count += 1

    word_idfs[word] = np.log(len(dataset)/(1+doc_count))

In [8]:
# TF Matrix

tf_matrix = {}

for word in freq_words:

    doc_tf = []

    for data in dataset:

        frequency = 0

        for w in nltk.word_tokenize(data):

            if word == w:

                frequency += 1

        tf_word = frequency/len(nltk.word_tokenize(data))

        doc_tf.append(tf_word)

    tf_matrix[word] = doc_tf

In [9]:
tf_matrix

{'the': [0.0,
  0.2,
  0.0,
  0.1,
  0.2,
  0.0,
  0.0,
  0.0,
  0.043478260869565216,
  0.0,
  0.1,
  0.06666666666666667,
  0.05263157894736842,
  0.0,
  0.05,
  0.10638297872340426,
  0.045454545454545456,
  0.0,
  0.0,
  0.0,
  0.0],
 'to': [0.0,
  0.2,
  0.1111111111111111,
  0.1,
  0.0,
  0.09090909090909091,
  0.0,
  0.08333333333333333,
  0.08695652173913043,
  0.07692307692307693,
  0.1,
  0.0,
  0.21052631578947367,
  0.0,
  0.05,
  0.02127659574468085,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'you': [0.16666666666666666,
  0.2,
  0.2222222222222222,
  0.0,
  0.0,
  0.0,
  0.043478260869565216,
  0.08333333333333333,
  0.043478260869565216,
  0.23076923076923078,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.1111111111111111,
  0.0,
  0.0,
  0.2],
 'of': [0.0,
  0.0,
  0.1111111111111111,
  0.0,
  0.13333333333333333,
  0.0,
  0.0,
  0.0,
  0.08695652173913043,
  0.0,
  0.0,
  0.0,
  0.05263157894736842,
  0.0,
  0.0,
  0.06382978723404255,
  0.045454545454545456,
  0.0,
 

In [10]:
# Creating the Tf-Idf Model

tfidf_matrix = []

for word in tf_matrix.keys():

    tfidf = []

    for value in tf_matrix[word]:

        score = value * word_idfs[word]

        tfidf.append(score)

    tfidf_matrix.append(tfidf)   

In [11]:
tfidf_matrix

[[0.0,
  0.1293254329850105,
  0.0,
  0.06466271649250525,
  0.1293254329850105,
  0.0,
  0.0,
  0.0,
  0.028114224561958803,
  0.0,
  0.06466271649250525,
  0.04310847766167016,
  0.034033008680265917,
  0.0,
  0.03233135824625263,
  0.06879012392819707,
  0.029392143860229657,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.11192315758708454,
  0.062179531992824735,
  0.05596157879354227,
  0.0,
  0.05087416253958388,
  0.0,
  0.046634648994618555,
  0.04866224242916719,
  0.04304736830272482,
  0.05596157879354227,
  0.0,
  0.11781385009166792,
  0.0,
  0.027980789396771136,
  0.011906718892243035,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.12365622412156288,
  0.14838746894587548,
  0.16487496549541716,
  0.0,
  0.0,
  0.0,
  0.0322581454230164,
  0.06182811206078144,
  0.0322581454230164,
  0.171216310322164,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.08243748274770858,
  0.0,
  0.0,
  0.14838746894587548],
 [0.0,
  0.0,
  0.12206803207423442,
  0.0,
  0.1464816384890813,
  0.0,
  

In [12]:
# Finishing the Tf-Tdf model

X = np.asarray(tfidf_matrix)



X = np.transpose(X)

In [13]:
X

array([[0.        , 0.        , 0.12365622, ..., 0.        , 0.        ,
        0.        ],
       [0.12932543, 0.11192316, 0.14838747, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06217953, 0.16487497, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.14838747, ..., 0.        , 0.        ,
        0.        ]])