In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [2]:
# Toy corpus for the vectorizer demos: four short sentences in `text`, each with a
# 0/1 label in `output`. Rows 0 and 2 contain the same sentence.
# Built column-wise; trailing spaces in the sentences are kept as-is.
df = pd.DataFrame(
    {
        'text': [
            'Hello my name is Shivan ',
            'I am from Bangalore ',
            'Hello my name is Shivan ',
            'i dont like alcohol',
        ],
        'output': [1, 1, 1, 0],
    }
)
df

Unnamed: 0,text,output
0,Hello my name is Shivan,1
1,I am from Bangalore,1
2,Hello my name is Shivan,1
3,i dont like alcohol,0


In [3]:
# Learn the vocabulary from the `text` column and encode every document as a row of token counts.
cv = CountVectorizer()
bow = cv.fit_transform(df['text'])
# Displaying the sparse matrix shows only its summary (shape, dtype, stored elements), not the values.
bow

<4x11 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [4]:
# Show the learned vocabulary: each term mapped to its column index in the count matrix.
cv.vocabulary_

{'hello': 5,
 'my': 8,
 'name': 9,
 'is': 6,
 'shivan': 10,
 'am': 1,
 'from': 4,
 'bangalore': 2,
 'dont': 3,
 'like': 7,
 'alcohol': 0}

**Explanation**

- `vocabulary_` is the mapping from each word to its ID (its column index).

- When `fit` is called on the documents, each distinct word is assigned an index, and that index is stored as the word's value in this dictionary.

- vocabulary_ is a dict where keys are terms and values are indices in the feature matrix.

CountVectorizer converts a collection of text documents to a matrix of token counts. It produces a sparse matrix holding the count of each vocabulary word in each document. The matrix shape is N x M, where N is the number of documents (rows) and M is the size of the vocabulary (columns). The numbers in `vocabulary_` are simply the column indices of each vocabulary word in this matrix.

In [21]:
# Print the dense count vector of every document, one document per line.
# Each column corresponds to one vocabulary term and holds the number of times that
# term occurs in the document (a count, not just a 0/1 presence flag).
# Example, row 0 = 'Hello my name is Shivan':
#   0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1  -> hello, is, my, name, shivan each occur once.
# Looping over bow.shape[0] avoids hard-coding the number of documents.
for row_idx in range(bow.shape[0]):
    print(bow[row_idx].toarray())


[[0 0 0 0 0 1 1 0 1 1 1]]
[[0 1 1 0 1 0 0 0 0 0 0]]
[[0 0 0 0 0 1 1 0 1 1 1]]
[[1 0 0 1 0 0 0 1 0 0 0]]


In [5]:
# Re-display the vocabulary so the column indices can be matched against the printed rows.
cv.vocabulary_

{'hello': 5,
 'my': 8,
 'name': 9,
 'is': 6,
 'shivan': 10,
 'am': 1,
 'from': 4,
 'bangalore': 2,
 'dont': 3,
 'like': 7,
 'alcohol': 0}

In [6]:
# Encode an unseen sentence with the already-fitted vocabulary: the unknown token
# 'bcibuubdsiu' is ignored, and 'is' (column 6) is counted twice.
cv.transform(["hello my name is bcibuubdsiu is Shivan"]).toarray()


array([[0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1]], dtype=int64)

In [7]:
# Out-of-vocabulary tokens are simply dropped by transform(); here 'bcibuubdsiu' and 'ketan' are not in the vocabulary.
# binary=True -> every non-zero count is set to 1 (presence/absence instead of frequency).

cv = CountVectorizer(binary=True)
bow = cv.fit_transform(df['text'])
cv.transform(["hello my name is bcibuubdsiu is ketan"]).toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0]], dtype=int64)

In [None]:
# Setting binary=True tells the vectorizer to record only whether a word is present (1) or absent (0)
# in a document, instead of how many times it occurs.

# max_features = build a vocabulary that only considers the top max_features terms, ordered by term frequency across the corpus.


## N - Grams

In [8]:
# ngram_range=(min_n, max_n): word n-grams of every size from min_n to max_n become features.
# (1, 1) -> unigrams only
# (2, 2) -> bigrams only
# (1, 2) -> unigrams + bigrams
# (1, 3) -> unigrams + bigrams + trigrams
# (1, 5) is used here, i.e. every n-gram from 1 up to 5 words long.
cv = CountVectorizer(ngram_range=(1,5))
bow = cv.fit_transform(df['text'])

In [9]:
# The vocabulary now contains multi-word n-grams (up to the full 5-word sentence) alongside single words.
cv.vocabulary_

{'hello': 10,
 'my': 19,
 'name': 23,
 'is': 15,
 'shivan': 26,
 'hello my': 11,
 'my name': 20,
 'name is': 24,
 'is shivan': 16,
 'hello my name': 12,
 'my name is': 21,
 'name is shivan': 25,
 'hello my name is': 13,
 'my name is shivan': 22,
 'hello my name is shivan': 14,
 'am': 1,
 'from': 8,
 'bangalore': 4,
 'am from': 2,
 'from bangalore': 9,
 'am from bangalore': 3,
 'dont': 5,
 'like': 17,
 'alcohol': 0,
 'dont like': 6,
 'like alcohol': 18,
 'dont like alcohol': 7}

In [10]:
# Dense view of the n-gram count matrix: one row per document, one column per vocabulary entry (27 here).
bow.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1],
       [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0]], dtype=int64)

## Tf - IDF


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# TfidfVectorizer accepts the same ngram_range option as CountVectorizer; the defaults (unigrams only) are used here.
tf = TfidfVectorizer()
# NOTE: `bow` is reused for the TF-IDF matrix, so from here on it no longer holds raw counts.
bow = tf.fit_transform(df['text'])

In [13]:
# Dense TF-IDF matrix; each row is scaled to unit L2 norm (TfidfVectorizer's default norm='l2').
bow.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.4472136 , 0.4472136 , 0.        , 0.4472136 , 0.4472136 ,
        0.4472136 ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.4472136 , 0.4472136 , 0.        , 0.4472136 , 0.4472136 ,
        0.4472136 ],
       [0.57735027, 0.        , 0.        , 0.57735027, 0.        ,
        0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        ]])

In [14]:
# Term -> column-index mapping learned by the TF-IDF vectorizer (identical to the plain CountVectorizer one).
tf.vocabulary_

{'hello': 5,
 'my': 8,
 'name': 9,
 'is': 6,
 'shivan': 10,
 'am': 1,
 'from': 4,
 'bangalore': 2,
 'dont': 3,
 'like': 7,
 'alcohol': 0}

In [15]:
# Learned inverse-document-frequency weight per column; terms that occur in two documents
# (hello, is, my, name, shivan) get the lower weight.
tf.idf_

array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.51082562, 1.51082562, 1.91629073, 1.51082562, 1.51082562,
       1.51082562])

In [16]:
# TF-IDF vector of the first document only ('Hello my name is Shivan').
bow[0].toarray()

array([[0.       , 0.       , 0.       , 0.       , 0.       , 0.4472136,
        0.4472136, 0.       , 0.4472136, 0.4472136, 0.4472136]])

# Happy Learning!