In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

In [3]:
# Toy corpus: four short sentences, one document per list entry.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# TOKENIZING
### TOKENIZING A STRING AND GIVING AN INTEGER ID TO EACH POSSIBLE TOKEN, FOR INSTANCE BY USING WHITE-SPACES AND PUNCTUATION AS TOKEN SEPARATORS

# COUNTING
### COUNTING THE OCCURRENCES OF TOKENS IN EACH DOCUMENT

# NORMALIZING
### NORMALIZING AND WEIGHTING WITH DIMINISHING IMPORTANCE THE TOKENS THAT OCCUR IN THE MAJORITY OF SAMPLES/DOCUMENTS

# BAG OF WORDS
### WE CALL VECTORIZATION THE GENERAL PROCESS OF TURNING A COLLECTION OF TEXT DOCUMENTS INTO NUMERICAL FEATURE VECTORS

### THIS SPECIFIC STRATEGY (TOKENIZING, COUNTING AND NORMALIZING) IS CALLED THE BAG OF WORDS OR "BAG OF n-grams" REPRESENTATION. n IS THE NUMBER OF CONSECUTIVE WORDS PER TOKEN: A 1-GRAM IS A SINGLE WORD, A 2-GRAM IS A SEQUENCE OF TWO ADJACENT WORDS

In [4]:
# Learn the vocabulary from the corpus and build the document-term count
# matrix in one step (one row per document, one column per vocabulary term).
cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [5]:
# Show the learned vocabulary; position i names column i of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns an ndarray, so
# `.tolist()` keeps the printed output a plain Python list of strings.
print(cv.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [6]:
# (number of documents, vocabulary size)
X.shape

(4, 9)

In [7]:
# Check the container type: the vectorizer returns a SciPy sparse matrix,
# not a dense ndarray.
type(X)

scipy.sparse.csr.csr_matrix

In [8]:
# Display the sparse-matrix summary (shape, dtype, stored elements, format).
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [10]:
# Each row is a document, each column the count of one vocabulary term.
# Densifying is fine here only because the matrix is tiny.
print(X.toarray())  # CONVERT SPARSE MATRIX TO DENSE MATRIX (NDARRAY)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


# TFIDF TRANSFORMER

### TRANSFORM A COUNT MATRIX TO A NORMALIZED TF OR TF-IDF REPRESENTATION

### TF MEANS TERM FREQUENCY, WHILE TF-IDF MEANS TERM FREQUENCY TIMES INVERSE DOCUMENT FREQUENCY

In [11]:
# Re-weight the raw term counts in X into TF-IDF values.
# The transformation changes the values, not the matrix shape.
tf = TfidfTransformer()
XT = tf.fit_transform(X)
XT.shape

(4, 9)

In [12]:
# Dense view of the TF-IDF weights (same row/column layout as the count matrix).
XT.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

# TfidfVectorizer
### CONVERT A COLLECTION OF RAW DOCUMENTS TO A MATRIX OF TF-IDF FEATURES

In [13]:
# Go straight from raw text to TF-IDF features in a single estimator.
# NOTE: this rebinds X, so the count matrix built earlier with
# CountVectorizer is no longer reachable under that name.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [14]:
# Show the vocabulary learned by the TF-IDF vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns an ndarray, so
# `.tolist()` keeps the printed output a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [15]:
# X is now the TfidfVectorizer output: (number of documents, vocabulary size).
print(X.shape)

(4, 9)


In [16]:
# Dense view of the TfidfVectorizer output, for comparison with the
# CountVectorizer + TfidfTransformer result stored in XT.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [21]:
# Refit with unigrams, bigrams and trigrams as features.
# This rebinds both `vectorizer` and `X` from the unigram-only run.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [22]:
# Same number of documents; the column count grows because every distinct
# 1-, 2- and 3-gram is now its own feature.
X.shape

(4, 34)

In [23]:
# List every 1- to 3-gram feature; position i names column i of X.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns an ndarray, so
# `.tolist()` keeps the printed output a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

['and', 'and this', 'and this is', 'document', 'document is', 'document is the', 'first', 'first document', 'is', 'is the', 'is the first', 'is the second', 'is the third', 'is this', 'is this the', 'one', 'second', 'second document', 'the', 'the first', 'the first document', 'the second', 'the second document', 'the third', 'the third one', 'third', 'third one', 'this', 'this document', 'this document is', 'this is', 'this is the', 'this the', 'this the first']


In [24]:
# Dense view of the n-gram TF-IDF matrix (one row per document,
# one column per n-gram feature).
X.toarray()

array([[0.        , 0.        , 0.        , 0.25307077, 0.        ,
        0.        , 0.31259275, 0.31259275, 0.20690194, 0.25307077,
        0.39648427, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.20690194, 0.31259275,
        0.31259275, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.20690194, 0.        , 0.        ,
        0.31259275, 0.31259275, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.37077755, 0.29044734,
        0.29044734, 0.        , 0.        , 0.15156747, 0.18538877,
        0.        , 0.29044734, 0.        , 0.        , 0.        ,
        0.        , 0.29044734, 0.29044734, 0.15156747, 0.        ,
        0.        , 0.29044734, 0.29044734, 0.        , 0.        ,
        0.        , 0.        , 0.15156747, 0.29044734, 0.29044734,
        0.        , 0.        , 0.        , 0.        ],
       [0.29530082, 0.29530082, 0.29530082, 0.        , 0.        ,
  