In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
sentence_1 = "It is a good practice for us."
sentence_2 = "It was also good to know about it"

# Using word_tokenize

In [5]:
# Concat the sents and applly word tokenize:

token = word_tokenize(sentence_1.lower()) + word_tokenize(sentence_2.lower())
token

['it',
 'is',
 'a',
 'good',
 'practice',
 'for',
 'us',
 '.',
 'it',
 'was',
 'also',
 'good',
 'to',
 'know',
 'about',
 'it']

In [8]:
# remove duplication
token = set(token)
token

{'.',
 'a',
 'about',
 'also',
 'for',
 'good',
 'is',
 'it',
 'know',
 'practice',
 'to',
 'us',
 'was'}

In [10]:
# create df:
# take col as unique words

df = pd.DataFrame({}, index = [1,2], columns=list(token))
df

Unnamed: 0,know,practice,it,us,.,was,for,is,a,also,about,to,good
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,


In [11]:
token1 = word_tokenize(sentence_1.lower())
token2 = word_tokenize(sentence_1.lower())

In [12]:
# calculate the count:
count1 = [token1.count(x) for x in df.columns]
count2 = [token2.count(x) for x in df.columns]

In [13]:
df.iloc[0,:] = count1
df.iloc[1,:] = count2

In [14]:
df

Unnamed: 0,know,practice,it,us,.,was,for,is,a,also,about,to,good
1,0,1,1,1,1,0,1,1,1,0,0,0,1
2,0,1,1,1,1,0,1,1,1,0,0,0,1


# using countvector

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# create obj
cvt = CountVectorizer()

new_data = cvt.fit_transform([sentence_1,sentence_2])

new_data.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [25]:
# print features name:

col_name = cvt.get_feature_names_out()
col_name

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [26]:
# create df:

new_df = pd.DataFrame(data = new_data.toarray(), columns=list(col_name))
new_df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


# N grams:

In [28]:
# create obj:
cvt = CountVectorizer()

new_data = cvt.fit_transform([sentence_1, sentence_2])
new_data.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [29]:
df_cvt = pd.DataFrame(data = new_data.toarray(), columns=cvt.get_feature_names_out())
df_cvt

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


# For Bi-gram:

In [30]:
cvt = CountVectorizer(ngram_range= (2,2))
new_data = cvt.fit_transform([sentence_1, sentence_2])

df_cvt = pd.DataFrame(data = new_data.toarray(), columns=cvt.get_feature_names_out())
df_cvt

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1


# For tri- gram

In [31]:
cvt = CountVectorizer(ngram_range= (3,3))
new_data = cvt.fit_transform([sentence_1, sentence_2])

df_cvt = pd.DataFrame(data = new_data.toarray(), columns=cvt.get_feature_names_out())
df_cvt

Unnamed: 0,also good to,good practice for,good to know,is good practice,it is good,it was also,know about it,practice for us,to know about,was also good
0,0,1,0,1,1,0,0,1,0,0
1,1,0,1,0,0,1,1,0,1,1
