In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Toy corpus used throughout the notebook: four short sentences,
# one string per document.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

<h3>Tokenizing</h3>
Tokenizing strings and giving an integer id for each possible token, 
for instance by using whitespace and punctuation as token separators.
<h3>Counting </h3>
Counting the occurrences of tokens in each document.
<h3>Normalizing</h3>
Normalizing and weighting the counts, giving diminishing importance to tokens that occur in the majority of samples / documents.

### Bag of words
We call vectorization the general process of turning a collection of text documents into 
numerical feature vectors. 

This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation.

In [3]:
# Build a bag-of-words model with the default CountVectorizer settings:
# fitting learns the vocabulary from `corpus`, transforming counts each
# token per document. `X` is the resulting document-term count matrix.
cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [4]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list of str to keep the
# printed output identical to the original cell.
print(cv.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [5]:
# (documents, vocabulary terms): one row per entry in `corpus`,
# one column per token in the learned vocabulary.
X.shape

(4, 9)

In [6]:
# The counts come back as a SciPy sparse matrix, not a dense NumPy array.
type(X)

scipy.sparse.csr.csr_matrix

In [7]:
# The sparse-matrix repr summarises shape, dtype, storage format and the
# number of stored entries.
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [7]:
# Densify for display only -- fine here because the matrix is tiny (4 x 9).
# Columns are in the same order as the feature names printed earlier.
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


## TfidfTransformer

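Transform a count matrix to a normalized tf or tf-idf representation.

Tf means term-frequency, while tf-idf means term-frequency times inverse document-frequency. With the default settings used below, $\text{idf}(t) = \ln\frac{1 + n}{1 + \text{df}(t)} + 1$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents containing term $t$; each row is then scaled to unit Euclidean (L2) length.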

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
# Re-weight the raw counts in `X` (the CountVectorizer output) into tf-idf
# values using the default TfidfTransformer settings.
tf = TfidfTransformer()
XT = tf.fit_transform(X)
# Only the values change; the matrix keeps the shape of the count matrix.
XT.shape

(4, 9)

In [10]:
# Dense view of the tf-idf weights (small enough to display in full).
XT.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

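## TfidfVectorizer

`TfidfVectorizer` combines the two previous steps (`CountVectorizer` followed by `TfidfTransformer`) into a single estimator that works directly on raw text, so the matrix it produces below is identical to the one above.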

In [11]:
# Fit tf-idf directly on the raw text in one step.
# NOTE(review): this rebinds `X`, which until now held the CountVectorizer
# count matrix. Re-running the TfidfTransformer cell after this one would
# feed it tf-idf weights instead of counts -- re-run the CountVectorizer
# cell first, or consider a distinct name for this matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [12]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so convert to a plain list of str to keep the
# printed output identical to the original cell.
print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [13]:
# `X` is now the TfidfVectorizer output; same (documents, terms) shape as
# the count matrix built earlier.
print(X.shape)

(4, 9)


In [14]:
# Dense view of the TfidfVectorizer result; the values match the
# TfidfTransformer output shown earlier in the notebook.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])