In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
s1 = 'It is a good practice for us.'
s2 = 'It was a also good to know about it'

In [3]:
s1_sep = word_tokenize(s1.lower())
s2_sep = word_tokenize(s2.lower())
s1_sep, s2_sep

(['it', 'is', 'a', 'good', 'practice', 'for', 'us', '.'],
 ['it', 'was', 'a', 'also', 'good', 'to', 'know', 'about', 'it'])

In [4]:
tk = s1_sep + s2_sep
tk = set(tk)
df = pd.DataFrame({}, index=[1,2], columns=list(tk))
df

Unnamed: 0,.,also,is,it,practice,us,about,to,a,was,know,good,for
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,


In [5]:
cnt1 = [s1_sep.count(x) for x in df.columns]
cnt2 = [s2_sep.count(x) for x in df.columns]
cnt1 , cnt2

([1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1],
 [0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1, 0])

In [6]:
df.iloc[0,:] = cnt1
df.iloc[1,:] = cnt2
df

Unnamed: 0,.,also,is,it,practice,us,about,to,a,was,know,good,for
1,1,0,1,1,1,1,0,0,1,0,0,1,1
2,0,1,0,2,0,0,1,1,1,1,1,1,0


### 1. Count Vectorizer

In [7]:
cvt = CountVectorizer()
new_data = cvt.fit_transform([s1, s2])
new_data

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [8]:
arr = new_data.toarray()
arr

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [9]:
fea = cvt.get_feature_names_out()
fea

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [10]:
df1 = pd.DataFrame(columns = fea, data = arr)
df1

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


### 2. N-grams

In [11]:
cvt = CountVectorizer(ngram_range=(1,2)) #min 1 word, max 2 words

In [12]:
new_data = cvt.fit_transform([s1, s2])
new_data.toarray()

array([[0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
        0],
       [1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
        1]], dtype=int64)

In [13]:
cvt.get_feature_names_out()

array(['about', 'about it', 'also', 'also good', 'for', 'for us', 'good',
       'good practice', 'good to', 'is', 'is good', 'it', 'it is',
       'it was', 'know', 'know about', 'practice', 'practice for', 'to',
       'to know', 'us', 'was', 'was also'], dtype=object)

In [14]:
df = pd.DataFrame(data=new_data.toarray(), columns=cvt.get_feature_names_out())
df

Unnamed: 0,about,about it,also,also good,for,for us,good,good practice,good to,is,...,it was,know,know about,practice,practice for,to,to know,us,was,was also
0,0,0,0,0,1,1,1,1,0,1,...,0,0,0,1,1,0,0,1,0,0
1,1,1,1,1,0,0,1,0,1,0,...,1,1,1,0,0,1,1,0,1,1


In [15]:
cvt = CountVectorizer(ngram_range=(2,2)) #min 2 words, max 2 words

new_data = cvt.fit_transform([s1, s2])
cvt.get_feature_names_out()

array(['about it', 'also good', 'for us', 'good practice', 'good to',
       'is good', 'it is', 'it was', 'know about', 'practice for',
       'to know', 'was also'], dtype=object)