In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: four short sentences, each treated as one document.
data_set = (
    "Pizza is very tasty",
    "Pizza is famous all over the world",
    "Roganjosh is a cusine of Kashmir",
    "Pizza is famous in India as well, people love pizza",
)

### CountVectorizer

In [3]:
# Build a bag-of-words vectorizer that drops English stop words, then learn the
# vocabulary from the corpus and display the resulting document-term count matrix.
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit_transform(data_set)

<4x10 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [4]:
# Encode the corpus as term counts using the vectorizer's learned vocabulary.
sparse_matrix = vectorizer.transform(data_set)
# Printing a sparse matrix lists only the non-zero entries as "(row, column)  count".
print(sparse_matrix)

  (0, 6)	1
  (0, 8)	1
  (1, 1)	1
  (1, 6)	1
  (1, 9)	1
  (2, 0)	1
  (2, 3)	1
  (2, 7)	1
  (3, 1)	1
  (3, 2)	1
  (3, 4)	1
  (3, 5)	1
  (3, 6)	2


In [5]:
# Expand to a dense matrix so the zero counts are visible as well.
sparse_matrix.todense()

matrix([[0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 1, 0, 0, 1],
        [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 1, 1, 0, 1, 1, 2, 0, 0, 0]], dtype=int64)

In [6]:
# Show the counts as a table: one row per document, one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement and returns the same terms.
pd.DataFrame(sparse_matrix.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,cusine,famous,india,kashmir,love,people,pizza,roganjosh,tasty,world
0,0,0,0,0,0,0,1,0,1,0
1,0,1,0,0,0,0,1,0,0,1
2,1,0,0,1,0,0,0,1,0,0
3,0,1,1,0,1,1,2,0,0,0


### TF-IDF

In [7]:
from math import log

In [None]:
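# Smoothed inverse document frequency (natural logarithm), as used by scikit-learn
# when smooth_idf=True:
#
#   idf(t) = ln[(1 + D) / (1 + df(t))] + 1
#
# where D is the number of documents in the corpus and df(t) is the number of
# documents that contain the term t.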

In [9]:
# Worked example: IDF of the term "pizza".
# D = 4 documents in the corpus, and 3 of them contain the term.
x = 4 + 1  # 1 + D
y = 3 + 1  # 1 + document frequency of the term

z = 1 + log(x / y)
z

1.2231435513142097

In [8]:
# Fit the IDF weights on the raw count matrix (fit() returns the estimator itself)
# and print one weight per vocabulary term.
tfidf_transform = TfidfTransformer().fit(sparse_matrix)
print("IDF:", tfidf_transform.idf_)

IDF: [1.91629073 1.51082562 1.91629073 1.91629073 1.91629073 1.91629073
 1.22314355 1.91629073 1.91629073 1.91629073]


In [9]:
# TfidfVectorizer tokenises the raw text and fits the IDF weights in one step;
# fit() returns the estimator itself, so construction and fitting can be chained.
tfidf_vect = TfidfVectorizer(stop_words='english').fit(data_set)
print("IDF:", tfidf_vect.idf_)

IDF: [1.91629073 1.51082562 1.91629073 1.91629073 1.91629073 1.91629073
 1.22314355 1.91629073 1.91629073 1.91629073]


In [10]:
# Raw term counts for the second document ("Pizza is famous all over the world"),
# listed in vocabulary order.
tf = [0, 1, 0, 0, 0, 0, 1, 0, 0, 1]

In [11]:
# IDF weights learned by the fitted TfidfTransformer, one per vocabulary term.
idf = tfidf_transform.idf_

In [12]:
# Element-wise product of term counts and IDF weights: TF-IDF before normalisation.
tf_idf = tf * idf

In [13]:
# Display the un-normalised TF-IDF weights for the document.
tf_idf

array([0.        , 1.51082562, 0.        , 0.        , 0.        ,
       0.        , 1.22314355, 0.        , 0.        , 1.91629073])

### TfidfTransformer

In [14]:
# Apply the fitted IDF weights to the count matrix. Each row comes back L2-normalised
# (TfidfTransformer's default norm), so values differ from a plain tf * idf product.
tf_idf_matrix = tfidf_transform.transform(sparse_matrix)
# toarray() yields a regular ndarray; todense() returns the discouraged np.matrix type.
print(tf_idf_matrix.toarray())

[[0.         0.         0.         0.         0.         0.
  0.53802897 0.         0.84292635 0.        ]
 [0.         0.55349232 0.         0.         0.         0.
  0.44809973 0.         0.         0.70203482]
 [0.57735027 0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.         0.        ]
 [0.         0.34405055 0.43638449 0.         0.43638449 0.43638449
  0.55707714 0.         0.         0.        ]]


### TfidfVectorizer

In [19]:
# TfidfVectorizer goes from raw text straight to L2-normalised TF-IDF rows
# (its default norm), so it is given the corpus rather than the count matrix.
tf_idf_matrix = tfidf_vect.transform(data_set)
# toarray() yields a regular ndarray; todense() returns the discouraged np.matrix type.
print(tf_idf_matrix.toarray())

[[0.         0.         0.         0.         0.         0.
  0.53802897 0.         0.84292635 0.        ]
 [0.         0.55349232 0.         0.         0.         0.
  0.44809973 0.         0.         0.70203482]
 [0.57735027 0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.         0.        ]
 [0.         0.34405055 0.43638449 0.         0.43638449 0.43638449
  0.55707714 0.         0.         0.        ]]


### Normalization by hand

In [20]:
# Un-normalised TF-IDF weights for the first document ("Pizza is very tasty"):
# only "pizza" (index 6) and "tasty" (index 8) are non-zero.
tfidf_without_norm = [
    0.0, 0.0, 0.0, 0.0, 0.0,
    0.0, 1.22314355, 0.0, 1.91629073, 0.0,
]

In [21]:
from numpy import array
from numpy.linalg import norm

# Reuse the weights defined in the previous cell instead of retyping the literal,
# so the vector being normalised cannot drift from `tfidf_without_norm`.
a = array(tfidf_without_norm)
# Euclidean (L2) length of the vector; norm() defaults to the 2-norm for 1-D input.
l2 = norm(a)
print(l2)

2.2733786103046136


In [22]:
# Divide every weight by the vector's L2 length to obtain the unit-length TF-IDF row.
c = a / l2
c

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.53802897, 0.        , 0.84292635, 0.        ])