In [1]:
# Vectorization technique 2: TF-IDF (Term Frequency - Inverse Document Frequency)

# TF refers to Term Frequency
# IDF refers to Inverse Document Frequency

# TF-IDF score: TF-IDF(d,w) = TF(d,w) * IDF(w)
# TF(d,w) : number of occurrences of word 'w' in document 'd'
# IDF(w)  : log(total number of documents / number of documents that contain word 'w')

# TF-IDF is a statistical way of understanding the importance of a word in a set of documents

# Note: scikit-learn's TfidfVectorizer (used below) defaults to a smoothed IDF,
# ln((1 + n) / (1 + df(w))) + 1, and L2-normalizes each document row, so its values
# differ from this textbook formula.

In [4]:
# For example: out of 10 documents, 5 contain the word 'python'.

# TF-IDF of the word 'python'

import math

# IDF(python) = log(total documents / documents containing 'python')
idf = math.log(10/5) # = 0.6931471805599453

# TF-IDF of 'python' in a document that contains it once (TF = 1).
# Reuse the computed `idf` instead of re-typing its value as a literal.
tfidf_1 = 1 * idf

# TF-IDF of 'python' in a document that contains it twice (TF = 2).
tfidf_2 = 2 * idf

print(tfidf_1)
print(tfidf_2)

0.6931471805599453
1.3862943611198906


In [6]:
# How the IDF score changes as the number of documents containing the word
# grows from 1 to 10, in a corpus of 10 documents.
for i in range(1, 11):
    idf_for_count = math.log(10 / i)
    print(i, ' : ', idf_for_count)

1  :  2.302585092994046
2  :  1.6094379124341003
3  :  1.2039728043259361
4  :  0.9162907318741551
5  :  0.6931471805599453
6  :  0.5108256237659907
7  :  0.3566749439387324
8  :  0.22314355131420976
9  :  0.10536051565782635
10  :  0.0


In [None]:
# IDF decreases if the word is present in more and more documents.

In [8]:
# Toy corpus of three movie reviews; each review is also bound to its own name
# because later cells refer to the reviews individually.
rev_list = [
    'the movie was good and liked it',
    'the movie was good but end was boring',
    'did not like the movie as it was too lengthy',
]

review1, review2, review3 = rev_list

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that lowercases the text, drops English stop words and
# keeps every term that appears in at least one document (min_df=1).
tfv = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=1,
)

In [11]:
# Learn the vocabulary and IDF weights from the reviews, then return the
# document-term matrix of TF-IDF weights.
tf_matrix = tfv.fit_transform(rev_list)

In [12]:
# Show the matrix summary (shape, dtype, storage format) rather than its values.
tf_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [13]:
# Check the concrete container type returned by fit_transform.
print(type(tf_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [15]:
# (number of documents, number of vocabulary terms)
tf_matrix.shape

(3, 8)

In [16]:
# Convert to a dense array to inspect the actual weights (fine for a tiny corpus).
tf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.54783215, 0.        ,
        0.        , 0.72033345, 0.42544054],
       [0.5844829 , 0.        , 0.5844829 , 0.44451431, 0.        ,
        0.        , 0.        , 0.34520502],
       [0.        , 0.54645401, 0.        , 0.        , 0.54645401,
        0.54645401, 0.        , 0.32274454]])

In [18]:
import pandas as pd

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement `get_feature_names_out()` when available and fall back
# to the old method on older releases.
if hasattr(tfv, 'get_feature_names_out'):
    tfidf_terms = tfv.get_feature_names_out()
else:
    tfidf_terms = tfv.get_feature_names()

# One row per review, one column per vocabulary term; values are TF-IDF weights.
df = pd.DataFrame(tf_matrix.toarray(), columns=tfidf_terms)

In [19]:
# Display the TF-IDF weights with the vocabulary terms as column headers.
df

Unnamed: 0,boring,did,end,good,lengthy,like,liked,movie
0,0.0,0.0,0.0,0.547832,0.0,0.0,0.720333,0.425441
1,0.584483,0.0,0.584483,0.444514,0.0,0.0,0.0,0.345205
2,0.0,0.546454,0.0,0.0,0.546454,0.546454,0.0,0.322745


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts for the three reviews with CountVectorizer's default settings,
# for comparison with the TF-IDF weights.
review_corpus = [review1, review2, review3]
cv = CountVectorizer()
cv_out = cv.fit_transform(review_corpus)

In [21]:
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back to the old method on older releases.
review_count_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# One row per review, one column per vocabulary term; values are raw counts.
pd.DataFrame(cv_out.toarray(), columns=review_count_terms)

Unnamed: 0,and,as,boring,but,did,end,good,it,lengthy,like,liked,movie,not,the,too,was
0,1,0,0,0,0,0,1,1,0,0,1,1,0,1,0,1
1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,2
2,0,1,0,0,1,0,0,1,1,1,0,1,1,1,1,1


In [22]:
# Second toy corpus: three short sentences that share the word 'processing'.
str1, str2, str3 = (
    'i love natural language processing but with python',
    'i like image processing',
    'i like data processing and image processing',
)

In [23]:
# Count vectorizer: raw term counts for the three example sentences.
# Note: this rebinds `cv` and `cv_out`, which earlier held the movie-review results.

from sklearn.feature_extraction.text import CountVectorizer

count_corpus = [str1, str2, str3]
cv = CountVectorizer()
cv_out = cv.fit_transform(count_corpus)

In [24]:
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back to the old method on older releases.
sentence_count_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# One row per sentence, one column per vocabulary term; values are raw counts.
pd.DataFrame(cv_out.toarray(), columns=sentence_count_terms)

Unnamed: 0,and,but,data,image,language,like,love,natural,processing,python,with
0,0,1,0,0,1,0,1,1,1,1,1
1,0,0,0,1,0,1,0,0,1,0,0
2,1,0,1,1,0,1,0,0,2,0,0


In [26]:
# TF-IDF weights for the three example sentences.
# Note: this rebinds `tfv`, which earlier held the vectorizer fitted on the movie reviews.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_corpus = [str1, str2, str3]
tfv = TfidfVectorizer(min_df=1)
tfv_out = tfv.fit_transform(tfidf_corpus)

In [27]:
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back to the old method on older releases.
sentence_tfidf_terms = (
    tfv.get_feature_names_out()
    if hasattr(tfv, 'get_feature_names_out')
    else tfv.get_feature_names()
)

# One row per sentence, one column per vocabulary term; values are TF-IDF weights.
pd.DataFrame(tfv_out.toarray(), columns=sentence_tfidf_terms)

Unnamed: 0,and,but,data,image,language,like,love,natural,processing,python,with
0,0.0,0.396875,0.0,0.0,0.396875,0.0,0.396875,0.396875,0.2344,0.396875,0.396875
1,0.0,0.0,0.0,0.619805,0.0,0.619805,0.0,0.0,0.481334,0.0,0.0
2,0.468699,0.0,0.468699,0.356457,0.0,0.356457,0.0,0.0,0.553642,0.0,0.0
