In [1]:
import pandas as pd
import numpy as np

## BoW (bag of words)

In [2]:
# Toy review corpus as (text, label) pairs: the positive-sounding reviews
# carry label 1, the negative-sounding ones label 0.
reviews = [
    ("I love the new design of your website", 1),
    ("The product is terrible and disappointing", 0),
    ("Amazing experience, I will come again", 1),
    ("Very bad service and rude staff", 0),
    ("I am happy with the quick delivery", 1),
]

# Split the pairs back into the column-oriented mapping pandas expects.
data = {
    "text": [sentence for sentence, _ in reviews],
    "label": [sentiment for _, sentiment in reviews],
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,I love the new design of your website,1
1,The product is terrible and disappointing,0
2,"Amazing experience, I will come again",1
3,Very bad service and rude staff,0
4,I am happy with the quick delivery,1


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; its vocabulary is learned
# when it is fitted on the review texts in the next cell.
cv = CountVectorizer()

In [4]:
# Learn the vocabulary from the review texts and encode every review as a row
# of per-word counts. The result is a sparse matrix, which is why the cells
# below call .toarray() before printing.
bow = cv.fit_transform(df['text'])

In [5]:
# Print the dense count vector of each of the first three reviews,
# one vector per line.
for row_idx in range(3):
    print(bow[row_idx].toarray())

[[0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1]]
[[0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0]]
[[1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]]


In [6]:
# Bag-of-words copes with unseen words: transform() only counts tokens that are
# already in the vocabulary learned during fit and silently ignores the rest.
# Here only the known words (design, happy, is, love) produce non-zero counts;
# the words that never appeared in the training texts contribute nothing.
ex = cv.transform(['raj is happy,because he love this design'])
ex[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]], dtype=int64)

## N-Grams

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(min_n, max_n) selects which n-gram sizes become features.
# Example for the sentence "we love data":
#   (1, 2) -> ['we', 'love', 'data', 'we love', 'love data']  (unigrams + bigrams)
#   (2, 2) -> ['we love', 'love data']                        (bigrams only)
# Note: with the default tokenizer, one-character tokens such as "I" are dropped
# before n-grams are built, which is why the vocabulary printed below contains
# 'experience will' and 'am happy' but no bigram starting with "i".
# A text with fewer than n tokens contributes no n-grams of size n (e.g. a
# three-word sentence with range (4, 4)); if that leaves the whole corpus
# without a single feature, fitting fails with an empty-vocabulary error.
ngrams = CountVectorizer(ngram_range=(2,2))
bigrams = ngrams.fit_transform(df['text'])
print(ngrams.vocabulary_)

{'love the': 10, 'the new': 18, 'new design': 11, 'design of': 6, 'of your': 12, 'your website': 24, 'the product': 19, 'product is': 13, 'is terrible': 9, 'terrible and': 17, 'and disappointing': 2, 'amazing experience': 1, 'experience will': 7, 'will come': 22, 'come again': 5, 'very bad': 21, 'bad service': 4, 'service and': 16, 'and rude': 3, 'rude staff': 15, 'am happy': 0, 'happy with': 8, 'with the': 23, 'the quick': 20, 'quick delivery': 14}


In [8]:
# Dense bigram count vector of the first review; each position corresponds to
# the index assigned to that bigram in ngrams.vocabulary_.
bigrams[0].toarray()

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1]], dtype=int64)

#### Trigrams and 4-grams


In [9]:
# NOTE: ngram_range=(3, 4) keeps every 3-word AND every 4-word sequence, so
# despite its name `trigrams` also holds 4-gram counts (e.g. the entry
# 'love the new design' in the printed vocabulary). Use ngram_range=(3, 3)
# if only trigrams are wanted.
grams = CountVectorizer(ngram_range=(3,4))
trigrams = grams.fit_transform(df['text'])
print(grams.vocabulary_)

{'love the new': 15, 'the new design': 25, 'new design of': 17, 'design of your': 7, 'of your website': 19, 'love the new design': 16, 'the new design of': 26, 'new design of your': 18, 'design of your website': 8, 'the product is': 27, 'product is terrible': 20, 'is terrible and': 13, 'terrible and disappointing': 24, 'the product is terrible': 28, 'product is terrible and': 21, 'is terrible and disappointing': 14, 'amazing experience will': 2, 'experience will come': 9, 'will come again': 32, 'amazing experience will come': 3, 'experience will come again': 10, 'very bad service': 30, 'bad service and': 5, 'service and rude': 22, 'and rude staff': 4, 'very bad service and': 31, 'bad service and rude': 6, 'service and rude staff': 23, 'am happy with': 0, 'happy with the': 11, 'with the quick': 33, 'the quick delivery': 29, 'am happy with the': 1, 'happy with the quick': 12, 'with the quick delivery': 34}


In [10]:
# Dense count vector of the first review over the combined 3-gram/4-gram
# vocabulary learned above.
trigrams[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

## TF-IDF

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# TF-IDF vectorizer with default settings; it is fitted on the review texts
# in the next cell.
tfidf = TfidfVectorizer()

In [39]:
# Fit on the review texts and show the dense TF-IDF matrix: one row per review,
# one column per vocabulary word. This call is also what fits `tfidf`, so the
# later cell that reads tfidf.idf_ depends on it having been run.
tfidf.fit_transform(df['text']).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.39379499, 0.        , 0.        ,
        0.        , 0.        , 0.39379499, 0.39379499, 0.39379499,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26372909, 0.        , 0.39379499, 0.        ,
        0.        , 0.39379499],
       [0.        , 0.        , 0.        , 0.35727423, 0.        ,
        0.        , 0.        , 0.        , 0.4428322 , 0.        ,
        0.        , 0.4428322 , 0.        , 0.        , 0.        ,
        0.4428322 , 0.        , 0.        , 0.        , 0.        ,
        0.4428322 , 0.29656989, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.4472136 , 0.        , 0.4472136 , 0.        , 0.        ,
        0.4472136 , 0.        , 0.        , 0.        , 0.4472136 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.    

In [45]:
# One row per vocabulary word together with its learned inverse-document-
# frequency weight. `idf_` holds the corpus-level IDF values only (there is no
# term-frequency component in it), so the column is labelled 'idf' rather than
# 'tf-idf'. Words that occur in fewer reviews get a larger weight.
tf_df = pd.DataFrame({
    'words':tfidf.get_feature_names_out(),
    'idf':tfidf.idf_
})

In [46]:
# Display the per-word weight table built in the previous cell.
tf_df

Unnamed: 0,words,tf-idf
0,again,2.098612
1,am,2.098612
2,amazing,2.098612
3,and,1.693147
4,bad,2.098612
5,come,2.098612
6,delivery,2.098612
7,design,2.098612
8,disappointing,2.098612
9,experience,2.098612
