In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: two short sentences plus a third that combines both and adds
# extra words, so some terms occur in every document and others in only one.
corpus = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet,and one and one is two',
]

In [3]:
# Bag-of-words vectorizer with default settings; it is fitted on `corpus` in a later cell.
count = CountVectorizer()

In [4]:
# Learn the bag-of-words vocabulary from the corpus and, in the same call,
# transform its three sentences into sparse count vectors (one row per sentence).
bag = count.fit_transform(corpus)

In [5]:
# Bare expression: the notebook shows the sparse matrix's summary repr
# (format, dtype, number of stored elements, shape) rather than its contents.
bag

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 17 stored elements and shape (3, 9)>

In [6]:
# `vocabulary_` is a Python dict mapping each unique (lower-cased) token to an
# integer index; that index is the token's column in the count matrix `bag`.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [7]:
# Densify the sparse matrix to inspect it: each row is a sentence, each column a
# vocabulary term, and each entry counts how often that term occurs in that
# sentence. These counts are also called the raw term frequencies, tf(t, d).
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


#### Using `TfidfTransformer` from sklearn

`Purpose`: This transformer converts a count matrix (a matrix of term/token counts) to a normalized TF-IDF representation.

`Input`: It takes a document-term count matrix (e.g., the output of `CountVectorizer`), where each row is a document, each column is a term, and each element is the number of times that term occurs in that document.

`Use Case`: Use TfidfTransformer when you already have a count matrix and you want to transform it into TF-IDF features.

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# Print arrays with two decimals so the TF-IDF rows are easy to compare by eye.
# NOTE: this changes NumPy's print options globally, so later cells are affected too.
np.set_printoptions(precision=2)

# use_idf=True    -> weight term counts by inverse document frequency
# smooth_idf=True -> add one to every document frequency (avoids division by zero)
# norm='l2'       -> rescale each document row to unit Euclidean length
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

# Reuse the count matrix `bag` built earlier rather than re-fitting `count` on
# the same corpus; the counts are identical, so the TF-IDF output is unchanged.
print(tfidf.fit_transform(bag).toarray())


[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


#### Using `TfidfVectorizer` from sklearn

`Purpose`: This vectorizer directly converts a collection of raw documents to a matrix of TF-IDF features.

`Input`: It takes a list of raw documents (strings).

`Use Case`: Use TfidfVectorizer when you have raw text data and you want to directly convert it into TF-IDF features.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure TfidfVectorizer with the same IDF and normalisation options that
# were passed to TfidfTransformer, so the two results can be compared directly.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, norm='l2', smooth_idf=True)

# Fit on the raw sentences and produce the TF-IDF matrix in a single step.
print(tfidf_vectorizer.fit_transform(corpus).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


#### As observed, both classes give identical output when the same parameters are used: `TfidfVectorizer` is equivalent to `CountVectorizer` followed by `TfidfTransformer`.