## Feature Extraction using CountVectorizer & TfidfVectorizer

In [1]:
# Seven tiny, deliberately noisy reviews (stray digits and symbols inside words)
# used to demonstrate text cleaning and vectorisation.
(doc1, doc2, doc3, doc4, doc5, doc6, doc7) = (
    'f5o@od is # good & good!',
    'food is tasty',
    'Quality is Good',
    'food is not good',
    'Servi89ce is poor',
    'it is to_o costly',
    'Che^ap quality',
)

# Documents in order, with one sentiment label per document (same order).
corpus = [doc1, doc2, doc3, doc4, doc5, doc6, doc7]
target = ['pos'] * 3 + ['neg'] * 4

In [2]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [3]:
# Copy scikit-learn's built-in English stop words into a mutable list.
sw = [*ENGLISH_STOP_WORDS]
print(len(sw))  # size of the list before any customisation
# Take "not" out of the stop list so that negations stay in the cleaned text.
sw.remove("not")

318


In [4]:
def text_cleaning(doc, stop_words=None):
    """Normalise one raw document into a space-separated string of kept tokens.

    The document is lower-cased, every character that is neither an ASCII
    letter nor whitespace is deleted (so noise inside a word collapses, e.g.
    'f5o@od' -> 'food'), the text is split on whitespace, stop words are
    discarded and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    doc : str
        Raw document text.
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, the notebook-level ``sw`` list is used.

    Returns
    -------
    str
        Cleaned text; an empty string when no token survives.
    """
    if stop_words is None:
        stop_words = sw
    # Hash-based membership test instead of scanning a list for every token.
    blocked = frozenset(stop_words)
    # Keep all whitespace (not only ' ') so tab/newline-separated words are
    # not fused together when the noise characters are deleted.
    letters_only = re.sub(r'[^a-z\s]', '', doc.lower())
    return " ".join(tok for tok in letters_only.split() if tok not in blocked)

In [5]:
# Run the cleaning function over every document, keeping the original order.
final_corpus = [text_cleaning(document) for document in corpus]

In [6]:
# Display the cleaned documents.
final_corpus

['food good good',
 'food tasty',
 'quality good',
 'food not good',
 'service poor',
 'costly',
 'cheap quality']

### Feature Extraction
- process of extracting features from corpus
- sklearn provides 2 approaches
  - CountVectorizer
  - TfidfVectorizer
### Vectorization
- process of converting each document into an array of numeric values

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Learn the vocabulary from the corpus and encode every document as a vector
# of token counts, both in one fit_transform call.
cv = CountVectorizer()
X = cv.fit_transform(final_corpus)
print(X)  # sparse representation: only the non-zero (row, column) counts

  (0, 2)	1
  (0, 3)	2
  (1, 2)	1
  (1, 8)	1
  (2, 3)	1
  (2, 6)	1
  (3, 2)	1
  (3, 3)	1
  (3, 4)	1
  (4, 7)	1
  (4, 5)	1
  (5, 1)	1
  (6, 6)	1
  (6, 0)	1


In [9]:
# Same default vectoriser, this time showing the vocabulary next to the
# dense and the sparse form of the count matrix.
cv = CountVectorizer()
X = cv.fit_transform(final_corpus)
print(cv.get_feature_names_out())  # learned vocabulary, one name per column
print(X.toarray())                 # dense matrix representation of the vectors
print(X)                           # sparse matrix representation of the vectors

['cheap' 'costly' 'food' 'good' 'not' 'poor' 'quality' 'service' 'tasty']
[[0 0 1 2 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 1]
 [0 0 0 1 0 0 1 0 0]
 [0 0 1 1 1 0 0 0 0]
 [0 0 0 0 0 1 0 1 0]
 [0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0]]
  (0, 2)	1
  (0, 3)	2
  (1, 2)	1
  (1, 8)	1
  (2, 3)	1
  (2, 6)	1
  (3, 2)	1
  (3, 3)	1
  (3, 4)	1
  (4, 7)	1
  (4, 5)	1
  (5, 1)	1
  (6, 6)	1
  (6, 0)	1


In [10]:
# Show the cleaned corpus again for reference.
final_corpus

['food good good',
 'food tasty',
 'quality good',
 'food not good',
 'service poor',
 'costly',
 'cheap quality']

In [11]:
# Unigrams only: with so few features a model trained on them may underfit.
cv = CountVectorizer(ngram_range=(1, 1))
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())

['cheap' 'costly' 'food' 'good' 'not' 'poor' 'quality' 'service' 'tasty']


In [12]:
# Unigrams and bigrams together.
cv = CountVectorizer(ngram_range=(1, 2))
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food not'
 'food tasty' 'good' 'good good' 'not' 'not good' 'poor' 'quality'
 'quality good' 'service' 'service poor' 'tasty']


In [13]:
# Bigrams only: single words are no longer part of the vocabulary.
cv = CountVectorizer(ngram_range=(2, 2))
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())

['cheap quality' 'food good' 'food not' 'food tasty' 'good good'
 'not good' 'quality good' 'service poor']


In [16]:
# Unigrams up to trigrams: the feature count grows quickly, so a model
# trained on all of them may overfit.
cv = CountVectorizer(ngram_range=(1, 3))
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food good good'
 'food not' 'food not good' 'food tasty' 'good' 'good good' 'not'
 'not good' 'poor' 'quality' 'quality good' 'service' 'service poor'
 'tasty']


In [17]:
# Cap the vocabulary at 15 features; max_features keeps the terms with the
# highest frequency across the corpus.
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_features=15,
)
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())

['cheap' 'food' 'food good' 'food good good' 'food not' 'food not good'
 'food tasty' 'good' 'good good' 'not' 'not good' 'poor' 'quality'
 'quality good' 'service']


In [18]:
# Show the cleaned corpus again for reference.
final_corpus

['food good good',
 'food tasty',
 'quality good',
 'food not good',
 'service poor',
 'costly',
 'cheap quality']

In [19]:
# min_df=3: keep only terms that occur in at least three documents.
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_features=None,
    min_df=3,
)
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())

['food' 'good']


In [20]:
# min_df=1: a term only has to occur in one document to be kept, so nothing
# is filtered out by document frequency.
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_features=None,
    min_df=1,
)
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())
print(X.toarray())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food good good'
 'food not' 'food not good' 'food tasty' 'good' 'good good' 'not'
 'not good' 'poor' 'quality' 'quality good' 'service' 'service poor'
 'tasty']
[[0 0 0 1 1 1 0 0 0 2 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0]
 [0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]


In [21]:
# binary=True records presence/absence only: every non-zero count becomes 1
# instead of the number of occurrences.
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_features=None,
    min_df=1,
    binary=True,
)
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())
print(X.toarray())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food good good'
 'food not' 'food not good' 'food tasty' 'good' 'good good' 'not'
 'not good' 'poor' 'quality' 'quality good' 'service' 'service poor'
 'tasty']
[[0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0]
 [0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]


In [22]:
# Let the vectoriser itself lower-case the text and drop the custom stop
# words held in `sw`.
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_features=None,
    min_df=1,
    binary=True,
    lowercase=True,
    stop_words=sw,
)
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())
print(X.toarray())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food good good'
 'food not' 'food not good' 'food tasty' 'good' 'good good' 'not'
 'not good' 'poor' 'quality' 'quality good' 'service' 'service poor'
 'tasty']
[[0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0]
 [0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]


In [23]:
def text_cleaning(doc):
    """Delete every character that is neither an ASCII letter nor whitespace.

    This lighter definition replaces the earlier ``text_cleaning`` in the
    notebook namespace: it does no lower-casing and no stop-word removal,
    which the cells below delegate to the vectoriser
    (``lowercase=True, stop_words=sw``).

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The text with digits, punctuation and other symbols removed; letter
        case and whitespace are left as they were.
    """
    # Keep all whitespace (not only ' ') so that words separated by a tab or
    # newline are not fused into one token.
    return re.sub(r'[^a-zA-Z\s]', '', doc)

In [24]:
# Re-clean the raw corpus with the current text_cleaning definition.
final_corpus = [text_cleaning(document) for document in corpus]

In [25]:
# Display the re-cleaned documents.
final_corpus

['food is  good  good',
 'food is tasty',
 'Quality is Good',
 'food is not good',
 'Service is poor',
 'it is too costly',
 'Cheap quality']

In [26]:
# Same configuration as before; lower-casing and stop-word removal are done
# by the vectoriser (lowercase=True, stop_words=sw).
cv = CountVectorizer(
    ngram_range=(1, 3),
    max_features=None,
    min_df=1,
    binary=True,
    lowercase=True,
    stop_words=sw,
)
X = cv.fit_transform(final_corpus)  # learn vocabulary and vectorise
print(cv.get_feature_names_out())
print(X.toarray())

['cheap' 'cheap quality' 'costly' 'food' 'food good' 'food good good'
 'food not' 'food not good' 'food tasty' 'good' 'good good' 'not'
 'not good' 'poor' 'quality' 'quality good' 'service' 'service poor'
 'tasty']
[[0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0]
 [0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
#it is normalized version of countvectorizer
tv=TfidfVectorizer(lowercase=True,stop_words=sw)
X=tv.fit_transform(final_corpus) #extract features as well as perform vectorization
print(tv.get_feature_names_out())
print(X.toarray())

['cheap' 'costly' 'food' 'good' 'not' 'poor' 'quality' 'service' 'tasty']
[[0.         0.         0.4472136  0.89442719 0.         0.
  0.         0.         0.        ]
 [0.         0.         0.57866699 0.         0.         0.
  0.         0.         0.81556393]
 [0.         0.         0.         0.64974959 0.         0.
  0.76014832 0.         0.        ]
 [0.         0.         0.5008545  0.5008545  0.70589627 0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.70710678
  0.         0.70710678 0.        ]
 [0.         1.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.76944876 0.         0.         0.         0.         0.
  0.63870855 0.         0.        ]]


In [29]:
import pandas as pd

In [30]:
# Load the tab-separated restaurant reviews file into a DataFrame and show it.
df = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
)
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [31]:
# Review text becomes the corpus; the Liked column becomes the label.
corpus = df["Review"]
target = df["Liked"]

In [32]:
# Clean every review with the current text_cleaning definition.
final_corpus = [text_cleaning(document) for document in corpus]

In [34]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [35]:
# Raw token counts + MultinomialNB (a natural pairing: the model works on
# count features).
cv = CountVectorizer(
    lowercase=True,
    stop_words=sw,
)
X = cv.fit_transform(final_corpus).toarray()
y = target
model = MultinomialNB().fit(X, y)
# NOTE: scored on the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen reviews.
model.score(X, y)

0.956

In [36]:
# TF-IDF weights + GaussianNB (a natural pairing: the features are
# real-valued rather than counts).
cv = TfidfVectorizer(
    lowercase=True,
    stop_words=sw,
)
X = cv.fit_transform(final_corpus).toarray()
y = target
model = GaussianNB().fit(X, y)
# NOTE: scored on the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen reviews.
model.score(X, y)

0.945

In [37]:
# Binary presence/absence features + BernoulliNB (a natural pairing: the
# model works on 0/1 features).
cv = CountVectorizer(
    lowercase=True,
    stop_words=sw,
    binary=True,
)
X = cv.fit_transform(final_corpus).toarray()
y = target
model = BernoulliNB().fit(X, y)
# NOTE: scored on the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen reviews.
model.score(X, y)

0.953