In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

In [2]:
# Toy corpus: four short sentences sharing most of their vocabulary,
# used as input to every vectorizer in this notebook.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

<h3>Tokenizing</h3>
Tokenizing strings and giving an integer id for each possible token, 
for instance by using white-spaces and punctuation as token separators.
<h3>Counting </h3>
Counting the occurrences of tokens in each document.
<h3>Normalizing</h3>
Normalizing and weighting with diminishing importance tokens that occur in the majority of samples / documents.

### Bag of words
We call **vectorization** the general process of turning a collection of text documents into 
numerical feature vectors. 

This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation. Here *n* is the number of consecutive words that make up one token: a 1-gram (unigram) is a single word, while a 2-gram (bigram) is a pair of adjacent words taken together.

In [3]:
# Learn the vocabulary from the corpus and count term occurrences per document.
# X is a sparse document-term matrix: one row per document, one column per term.
cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [4]:
# Bare expression: show the sparse-matrix summary (shape, dtype, stored entries).
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [8]:
# Vocabulary learned by the CountVectorizer; one term per column of X.
print(cv.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [7]:
# (number of documents, number of vocabulary terms)
X.shape

(4, 9)

In [None]:
# Check which class is used to hold the count matrix.
type(X)

In [None]:
# Display the sparse count matrix summary again.
X

In [5]:
# Each row is a document, each column a vocabulary term, each value a raw count.
print(X.toarray())  # Convert sparse matrix to dense matrix (ndarray)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


## TfidfTransformer

Transform a count matrix to a normalized tf or tf-idf representation.
<p/>
Tf means term-frequency, while <b>tf-idf</b> means <b>term-frequency times inverse document-frequency</b>.

In [6]:
# Re-weight the raw counts in X as tf-idf scores; the result is stored separately as XT.
tf = TfidfTransformer()
XT = tf.fit_transform(X)
# The (documents, terms) layout is the same as the input count matrix.
XT.shape

(4, 9)

In [7]:
# Dense view of the tf-idf weights. With the default settings each row is
# scaled to unit Euclidean length, so values are comparable across documents.
XT.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

## TfidfVectorizer
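Convert a collection of raw documents to a matrix of TF-IDF features. This is equivalent to applying `CountVectorizer` followed by `TfidfTransformer`, so the result below matches the matrix computed in the previous section.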

In [12]:
# One-step alternative: tokenize, count and tf-idf weight the raw text directly.
# NOTE: this rebinds X. From here on X holds tf-idf weights, not the raw
# counts produced by the CountVectorizer cell.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [14]:
# Vocabulary learned by the TfidfVectorizer; one term per column of X.
print(vectorizer.get_feature_names_out())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [15]:
# (number of documents, number of vocabulary terms).
# Left as the cell's last expression so the notebook displays it, matching
# the other shape cells, instead of wrapping it in print().
X.shape

(4, 9)


In [16]:
# Dense view of the tf-idf matrix produced by the TfidfVectorizer.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [23]:
# ngram_range=(1, 2) keeps single words and also adds two-word sequences as features.
# NOTE: this rebinds both `vectorizer` and `X`, replacing the unigram-only
# vectorizer and matrix defined earlier in the notebook.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(corpus)

In [24]:
# (number of documents, number of unigram + bigram features)
X.shape

(4, 22)

In [25]:
# The vocabulary now mixes single words and two-word sequences.
print(vectorizer.get_feature_names_out())

['and' 'and this' 'document' 'document is' 'first' 'first document' 'is'
 'is the' 'is this' 'one' 'second' 'second document' 'the' 'the first'
 'the second' 'the third' 'third' 'third one' 'this' 'this document'
 'this is' 'this the']


In [27]:
# Bare expression: show the sparse-matrix summary for the unigram + bigram tf-idf matrix.
X

<4x22 sparse matrix of type '<class 'numpy.float64'>'
	with 39 stored elements in Compressed Sparse Row format>

In [None]:
# Dense view of the unigram + bigram tf-idf weights (one row per document).
X.toarray()

array([[0.        , 0.        , 0.3145322 , 0.        , 0.38850984,
        0.38850984, 0.25715068, 0.3145322 , 0.        , 0.        ,
        0.        , 0.        , 0.25715068, 0.38850984, 0.        ,
        0.        , 0.        , 0.        , 0.25715068, 0.        ,
        0.38850984, 0.        ],
       [0.        , 0.        , 0.45551258, 0.35682424, 0.        ,
        0.        , 0.18620569, 0.22775629, 0.        , 0.        ,
        0.35682424, 0.35682424, 0.18620569, 0.        , 0.35682424,
        0.        , 0.        , 0.        , 0.18620569, 0.35682424,
        0.        , 0.        ],
       [0.35700721, 0.35700721, 0.        , 0.        , 0.        ,
        0.        , 0.18630117, 0.22787308, 0.        , 0.35700721,
        0.        , 0.        , 0.18630117, 0.        , 0.        ,
        0.35700721, 0.35700721, 0.35700721, 0.18630117, 0.        ,
        0.28146859, 0.        ],
       [0.        , 0.        , 0.28293955, 0.        , 0.34948664,
        0.3494866