In [38]:
# Using Gensim

from gensim import corpora
from gensim.models import TfidfModel

# Load the whole corpus once: one document per line, lower-cased and split on
# whitespace.  The context manager closes the file as soon as reading is done
# instead of leaving the handle open for the lifetime of the kernel.
with open("mycorpus.txt") as f:
    text = [line.lower().split() for line in f]
        

# Gensim implementation of TfIdf
class MyCorpus(object):
    """Streamed corpus over ``mycorpus.txt``.

    Iterating yields one bag-of-words vector per line of the file.  The file
    is re-opened on every iteration, so the corpus can be traversed more than
    once without keeping the documents in memory.

    Token ids come from the module-level ``dictionary``, which must be bound
    before the corpus is iterated.
    """

    def __iter__(self):
        # ``with`` closes the handle when the generator is exhausted or
        # discarded, rather than leaving it to the garbage collector.
        with open("mycorpus.txt") as data_file:
            for line in data_file:
                yield dictionary.doc2bow(line.lower().split())


from six import iteritems
import itertools
# Build the vocabulary from the tokenized corpus, then prune it.
stoplist = set('for a of the and to in'.split())
dictionary = corpora.Dictionary(text)

# Ids of the stop words that actually occur in the vocabulary.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
# Ids of tokens whose document frequency is 1, i.e. that occur in a single
# document.  ``dict.items()`` behaves the same on Python 2 and 3 here, so no
# compatibility shim is needed.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # drop stop words and single-document tokens
dictionary.compactify()  # reassign ids so they are contiguous after filtering

# MyCorpus streams the file through the pruned dictionary on every iteration.
corpus = MyCorpus()

tfidf = TfidfModel(corpus)

# Print the TF-IDF vector of every document, in corpus order.
for bow in corpus:
    print(tfidf[bow])

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [48]:
# Using pure python

import math
# Reference TF-IDF computed by hand: weight = raw term count * log2(N / df),
# followed by L2 normalisation of each document vector.
documents = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey'],
]
n_docs = len(documents)  # derived from the data instead of a hard-coded 9

# words_freq: term -> [document frequency, index of the last document seen with it]
# occurences: (term, document index) -> raw count of the term in that document
words_freq = {}
occurences = {}
for doc_index, tokens in enumerate(documents):
    for token in tokens:
        key = (token, doc_index)
        if key in occurences:
            # Repeated term inside the same document: only the raw count grows.
            occurences[key] += 1
            continue
        occurences[key] = 1
        if token in words_freq:
            # First time this term shows up in this document.
            words_freq[token][0] += 1
            words_freq[token][1] = doc_index
        else:
            words_freq[token] = [1, doc_index]

# Unnormalised weights plus the squared length of every document vector.
norm = [0 for _ in range(n_docs)]
scores = {}
for key, count in occurences.items():
    term, doc_index = key
    idf = math.log(n_docs / float(words_freq[term][0]), 2.0)
    scores[key] = count * idf
    norm[doc_index] += scores[key] ** 2

for key in scores:
    length = math.sqrt(norm[key[1]])
    # A document whose terms all occur in every document has zero length;
    # report weight 0.0 for it instead of dividing by zero.
    print(key, scores[key] / float(length) if length else 0.0)


(('system', 1), 0.3244870206138555)
(('minors', 7), 0.695546419520037)
(('eps', 3), 0.49182558987264147)
(('trees', 7), 0.5080429008916749)
(('trees', 6), 0.7071067811865475)
(('user', 4), 0.45889394536615247)
(('interface', 2), 0.5710059809418182)
(('computer', 0), 0.5773502691896257)
(('eps', 2), 0.5710059809418182)
(('graph', 8), 0.45889394536615247)
(('minors', 8), 0.6282580468670046)
(('survey', 1), 0.44424552527467476)
(('response', 1), 0.44424552527467476)
(('time', 1), 0.44424552527467476)
(('human', 3), 0.49182558987264147)
(('system', 3), 0.7184811607083769)
(('user', 2), 0.4170757362022777)
(('response', 4), 0.6282580468670046)
(('interface', 0), 0.5773502691896257)
(('human', 0), 0.5773502691896257)
(('graph', 7), 0.5080429008916749)
(('time', 4), 0.6282580468670046)
(('system', 2), 0.4170757362022777)
(('trees', 5), 1.0)
(('user', 1), 0.3244870206138555)
(('survey', 8), 0.6282580468670046)
(('computer', 1), 0.44424552527467476)
(('graph', 6), 0.7071067811865475)


In [49]:
# Using Scikit Learn

from sklearn.feature_extraction.text import TfidfVectorizer

# Same nine documents as plain strings; TfidfVectorizer tokenizes them itself.
documents = [
    "human interface computer",
    "survey user computer system response time",
    "eps user interface system",
    "system human system eps",
    "user response time",
    "trees",
    "graph trees",
    "graph minors trees",
    "graph minors survey",
]

# `stoplist` is the stop-word set defined in the Gensim cell.  scikit-learn
# validates `stop_words` as 'english', a list, or None, so the set is turned
# into a (sorted, hence deterministic) list before being passed in.
vectorizer = TfidfVectorizer(stop_words=sorted(stoplist))

# Rows are documents, columns are vocabulary terms.
X = vectorizer.fit_transform(documents)
print(X.toarray())


    


[[0.57735027 0.         0.         0.57735027 0.57735027 0.
  0.         0.         0.         0.         0.         0.        ]
 [0.42593857 0.         0.         0.         0.         0.
  0.42593857 0.42593857 0.37034129 0.42593857 0.         0.37034129]
 [0.         0.53361154 0.         0.         0.53361154 0.
  0.         0.         0.46395983 0.         0.         0.46395983]
 [0.         0.44614767 0.         0.44614767 0.         0.
  0.         0.         0.77582505 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.6023681  0.         0.         0.6023681  0.         0.52374168]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         1.         0.        ]
 [0.         0.         0.70710678 0.         0.         0.
  0.         0.         0.         0.         0.70710678 0.        ]
 [0.         0.         0.54859115 0.         0.         0.63094809
  0.         0.         0.   