In [1]:
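# # Some common terms to remember
# 1. Corpus: the full collection of documents being analysed
# 2. Vocabulary: the set of unique words found in the corpus
# 3. Document: a single text item in the corpus (e.g. one row of a dataset)
# 4. Word: a single token within a document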


# **Text Vectorization Techniques**
# This notebook covers three fundamental text vectorization techniques used in natural language processing:




# 1. Bag of Words (BoW)
# The Bag of Words model represents text as a collection of words, ignoring grammar and word order but keeping track of word frequencies.

In [2]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: four short documents in `text`, each paired with a 1/0 value
# in `output`.
df = pd.DataFrame(
    data=dict(
        text=[
            "people watch dswithbappy",
            "dswithbappy watch dswithbappy",
            "people write comment",
            "dswithbappy write comment",
        ],
        output=[1, 1, 0, 0],
    )
)
# Bag of Words vectorizer with all-default settings; it is fitted on
# df['text'] in a later cell.
cv = CountVectorizer()

In [21]:
# Display the frame to confirm the `text` and `output` columns.
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [22]:
# Learn the vocabulary from the text column and encode every document as a
# row of term counts. The result is converted with .toarray() wherever it is
# displayed below.
bow = cv.fit_transform(df['text'])

In [24]:
# vocabulary_ maps each learned term to its column index in the count matrix
# (e.g. 'comment' -> column 0, 'dswithbappy' -> column 1).
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'dswithbappy': 1, 'write': 4, 'comment': 0}


In [29]:
# Dense view of the count matrix: one row per document, one column per
# vocabulary term, each cell holding how often the term occurs in the document.
print(bow.toarray())

[[0 1 1 1 0]
 [0 2 0 1 0]
 [1 0 1 0 1]
 [1 1 0 0 1]]


In [30]:
# Print each document's count vector separately. Looping over the row
# indices (instead of hard-coding 0..3) keeps this cell correct if documents
# are added to or removed from the corpus.
for doc_idx in range(bow.shape[0]):
    print(bow[doc_idx].toarray())

[[0 1 1 1 0]]
[[0 2 0 1 0]]
[[1 0 1 0 1]]
[[1 1 0 0 1]]


In [32]:
# Encode an unseen sentence with the already-fitted vocabulary. "Bappy" is
# not one of the five learned terms, so it contributes nothing to the vector:
# only the 'dswithbappy' (column 1) and 'watch' (column 3) counts are non-zero.
cv.transform(['Bappy watch dswithbappy']).toarray()

array([[0, 1, 0, 1, 0]])

In [4]:
# Single-sentence corpus used by the bigram and trigram examples below.
texts = ["Data Science is fascinating and rewarding"]


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2,2) restricts the features to bigrams: every pair of adjacent
# words becomes one feature and single words are not kept.
vectorizer = CountVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(texts)
# List the learned bigram features (lower-cased, in sorted order).
print(vectorizer.get_feature_names_out())


['and rewarding' 'data science' 'fascinating and' 'is fascinating'
 'science is']


In [13]:
# trigram
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(3,3) restricts the features to trigrams: every run of three
# adjacent words becomes one feature. `vectorizer` and `X` are rebound here,
# replacing the bigram versions.
vectorizer = CountVectorizer(ngram_range = (3,3))
X = vectorizer.fit_transform(texts)
# List the learned trigram features.
print(vectorizer.get_feature_names_out())

['data science is' 'fascinating and rewarding' 'is fascinating and'
 'science is fascinating']


In [33]:
# Feature matrix: dense copy of the Bag of Words counts fitted on df['text'].
# NOTE: `X` previously held the n-gram matrices; it is rebound here.
X = bow.toarray()
# Target values: the `output` column of df.
y = df['output']

In [34]:
# Display the target column.
y

Unnamed: 0,output
0,1
1,1
2,0
3,0


# **TF-IDF (Term Frequency-Inverse Document Frequency)**

In [37]:
# Same four documents and `output` values as the Bag of Words section,
# rebuilt here so the TF-IDF cells start from a known frame.
df = pd.DataFrame.from_dict(
    {
        "text": [
            "people watch dswithbappy",
            "dswithbappy watch dswithbappy",
            "people write comment",
            "dswithbappy write comment",
        ],
        "output": [1, 1, 0, 0],
    }
)

df


Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with all-default settings; fitted on df['text'] in the
# next cell.
tfidf = TfidfVectorizer()

In [55]:
# Fit on the text column and convert the result to a dense array: one row per
# document, one column per term. With the default settings each row comes out
# L2-normalised (unit length), which is why the values are fractions rather
# than raw counts.
arr = tfidf.fit_transform(df['text']).toarray()

In [56]:
# Display the TF-IDF weights for the four documents.
arr

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

In [57]:
# idf_ holds one inverse-document-frequency weight per term, in column order.
# With the default smoothing, idf = ln((1 + n_docs) / (1 + doc_freq)) + 1, so
# a term found in 3 of the 4 documents gets ln(5/4) + 1 ~= 1.223, while terms
# found in 2 documents get ln(5/3) + 1 ~= 1.511.
print(tfidf.idf_)

[1.51082562 1.22314355 1.51082562 1.51082562 1.51082562]
