In [None]:
import pandas as pd
import numpy as np

In [None]:
path = '/content/Restaurant_Reviews.tsv'
df = pd.read_csv(path, delimiter = '\t')

#Exploratory Data Analyis

In [None]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [None]:
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

#Data Cleanup

In [None]:
df.isna().sum()

Review    0
Liked     0
dtype: int64

#*Lower case every review for normalization and reduction of vocabulary size*

In [None]:
df['Review'] = df['Review'].str.lower()

#*Removing Stop Words*

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Tokenization
df['Review'] = df['Review'].apply(nltk.word_tokenize)

#Removing Stopwords
stop_words = set(stopwords.words('english'))
df['Review'] = df['Review'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

In [None]:
len(stop_words)

179

In [None]:
#Join back the words

df['Review'] = df['Review'].apply(lambda x: ' '.join(x))

In [None]:
df.head(5)

Unnamed: 0,Review,Liked
0,wow ... loved place .,1
1,crust good .,0
2,tasty texture nasty .,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great prices .,1


#Remove Punctutations

In [None]:
print(df.iloc[3,0])

stopped late may bank holiday rick steve recommendation loved .


In [None]:
# Function to remove punctuation
import string
def remove_punctuation(text):
  return text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Apply the function on the column 'Review'
df['Review'] = df['Review'].apply(remove_punctuation)

In [None]:
print(df.iloc[3,0])

stopped late may bank holiday rick steve recommendation loved 


#Apply Lemmatization & Stemming


In [None]:
from nltk.stem import WordNetLemmatizer   # For Lemmatization
from nltk.stem import PorterStemmer       # For Stemming

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Creating a function for lemmatization

lemmatizer = WordNetLemmatizer()

def func_lemmatizer(sentence):
  return ' '.join([lemmatizer.lemmatize(word) for word in sentence.split()])

In [None]:
# Apply lemmatization on 'Review' and make a new column for Lemmatized Text

df['Review_Lemma'] = df['Review'].apply(func_lemmatizer)

In [None]:
# Creating a function for stemming

stemmer = PorterStemmer()
def func_stemming(sentence):
  return ' '.join([stemmer.stem(word) for word in sentence.split()])

In [None]:
# Apply Stemming on 'Review' and make a new column for Stemmed Text

df['Review_Stem'] = df['Review'].apply(func_stemming)

#Apply TF/IDF & Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

originalText = df['Review'].tolist()
stemText = df['Review_Stem'].tolist()
lemmaText = df['Review_Lemma'].tolist()

print(type(originalText))
print(type(stemText))
print(type(lemmaText))

<class 'list'>
<class 'list'>
<class 'list'>


In [None]:
df.columns

Index(['Review', 'Liked', 'Review_Lemma', 'Review_Stem'], dtype='object')

In [None]:
import re

In [None]:
corpusO = []
corpusS = []
corpusL = []

In [None]:
for i in range(0,1000):
  review = re.sub('[^a-zA-Z]', ' ',  df.Review[i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpusO.append(review)

In [None]:
for i in range(0,1000):
  review = re.sub('[^a-zA-Z]', ' ',  df.Review_Stem[i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpusS.append(review)

In [None]:
for i in range(0,1000):
  review = re.sub('[^a-zA-Z]', ' ',  df.Review_Lemma[i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpusL.append(review)

In [None]:
# tfidf_vectorizer = TfidfVectorizer()

# tfidf_matrix = tfidf_vectorizer.fit_transform(originalText)

# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf_vectorizer.get_features)

#Count Vectorizer

In [None]:
df

Unnamed: 0,Review,Liked,Review_Lemma,Review_Stem
0,wow loved place,1,wow loved place,wow love place
1,crust good,0,crust good,crust good
2,tasty texture nasty,0,tasty texture nasty,tasti textur nasti
3,stopped late may bank holiday rick steve recom...,1,stopped late may bank holiday rick steve recom...,stop late may bank holiday rick steve recommen...
4,selection menu great prices,1,selection menu great price,select menu great price
...,...,...,...,...
995,think food flavor texture lacking,0,think food flavor texture lacking,think food flavor textur lack
996,appetite instantly gone,0,appetite instantly gone,appetit instantli gone
997,overall impressed would go back,0,overall impressed would go back,overal impress would go back
998,whole experience underwhelming think ll go ni...,0,whole experience underwhelming think ll go nin...,whole experi underwhelm think ll go ninja sush...


#Original Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpusO).toarray()
y = df.iloc[:, 1:2].values

In [None]:
X

In [None]:
y = y.flatten()

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[55 42]
 [12 91]]


0.73

#Stemmed Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpusS).toarray()
y = df.iloc[:, 1:2].values

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y = y.flatten()

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[55 42]
 [12 91]]


0.73

#Lemmatized Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpusL).toarray()
y = df.iloc[:, 1:2].values

In [None]:
X

In [None]:
y = y.flatten()

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[55 42]
 [12 91]]


0.73