# Importance of TF IDF and usefulness of the vectors

In [1]:
corpus = [
...     'This is the first document',
...     'This document is the second document',
...     'And this is the third one',
...     'Is this the first document',
... ]
type(corpus)

list

In [2]:
corpus_listToStr = ' '.join([str(elem) for elem in corpus])

In [3]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(corpus_listToStr.lower())
print(tokens)

['this', 'is', 'the', 'first', 'document', 'this', 'document', 'is', 'the', 'second', 'document', 'and', 'this', 'is', 'the', 'third', 'one', 'is', 'this', 'the', 'first', 'document']


In [12]:
import numpy as np
document_tfidf_vectors = []
zero_vector = np.zeros([4, 4])

In [13]:
import copy
for doc in corpus:
    vec = copy.copy(zero_vector)

In [14]:
from collections import Counter
token_counts = Counter(tokens)
token_counts

Counter({'this': 4,
         'is': 4,
         'the': 4,
         'first': 2,
         'document': 4,
         'second': 1,
         'and': 1,
         'third': 1,
         'one': 1})

In [15]:
for key, value in token_counts.items():
    docs_containing_key = 0
    for _doc in corpus:
        if key in _doc:
            docs_containing_key += 1

In [16]:
lexicon = sorted(set(tokens))
tf = value/len(lexicon)
tf

0.1111111111111111

In [19]:
if docs_containing_key:
    idf = len(corpus)/docs_containing_key
else:
    idf = 0
    vec[key] = tf*idf
document_tfidf_vectors.append(vec)

In [11]:
idf = len(corpus)/docs_containing_key
idf

4.0

In [20]:
document_tfidf_vectors

[array([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])]

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
vectorizer = TfidfVectorizer()

In [27]:
X = vectorizer.fit_transform(corpus)

In [31]:
type(X)

scipy.sparse.csr.csr_matrix

In [33]:
import scipy
scipy.sparse.csr.csr_matrix.toarray(X)

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

### SEARCH A key word in corpus using TFIDF Vectors

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# our corpus
data = ['I like dog', 'I love cat', 'I interested in cat']

In [40]:
cv = CountVectorizer()

In [41]:
# convert text data into term-frequency matrix
data = cv.fit_transform(data)
data

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [42]:
scipy.sparse.csr.csr_matrix.toarray(data)

array([[0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

In [43]:
tfidf_transformer = TfidfTransformer()

In [44]:
# convert term-frequency matrix into tf-idf
tfidf_matrix = tfidf_transformer.fit_transform(data)
tfidf_matrix

<3x6 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [45]:
scipy.sparse.csr.csr_matrix.toarray(tfidf_matrix)

array([[0.        , 0.70710678, 0.        , 0.        , 0.70710678,
        0.        ],
       [0.60534851, 0.        , 0.        , 0.        , 0.        ,
        0.79596054],
       [0.4736296 , 0.        , 0.62276601, 0.62276601, 0.        ,
        0.        ]])

In [47]:
# create dictionary to find a tfidf word each word
word2tfidf = dict(zip(cv.get_feature_names(), tfidf_transformer.idf_))
word2tfidf

{'cat': 1.2876820724517808,
 'dog': 1.6931471805599454,
 'in': 1.6931471805599454,
 'interested': 1.6931471805599454,
 'like': 1.6931471805599454,
 'love': 1.6931471805599454}

In [48]:
for word, score in word2tfidf.items():
    print(word, score)

cat 1.2876820724517808
dog 1.6931471805599454
in 1.6931471805599454
interested 1.6931471805599454
like 1.6931471805599454
love 1.6931471805599454


In [52]:
test_text = ['cat']
test_text = cv.fit_transform(test_text)
tfidf_matrix = tfidf_transformer.fit_transform(test_text)
tfidf_matrix

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [53]:
scipy.sparse.csr.csr_matrix.toarray(tfidf_matrix)

array([[1.]])