# Importing The Dependencies

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import bigrams,trigrams,ngrams
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer,LancasterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay

# Data Collection & Processing

In [51]:
## loading tha data from tsv file and converted to csv file to a pandas DataFrame

Review_Dataset=pd.read_csv(r"Documents\Restaurant_Reviews.tsv",delimiter="\t")


In [52]:
Review_Dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [53]:
## finding the Rows & Columns

Review_Dataset.shape

(1000, 2)

# Data Cleaning & Split The Data 

In [54]:
corpus=[]

In [55]:
for i in range(0,1000):
    review=re.sub("[a-zA-Z]"," ",Review_Dataset["Review"][i])                  ## special characters removed
    
    review=review.lower()                                       ## all words converted to lowercase
                  
    review=review.split()                                       ## split the seperate word by space
    
    ps=PorterStemmer()
    
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]          ## not steming words removed using stopwords
    
    review=" ".join(review)                                     ## list converted to string
    
    corpus.append(review)
    
                  

# All Words Converted As a Number

In [56]:
cv=CountVectorizer(max_features=1500)
x=cv.fit_transform(corpus)
y=Review_Dataset.iloc[:,1].values


In [57]:
## splitting the data into training and test for model building and prediction 


x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [58]:
y_train.shape

(750,)

# import the random classifier model

In [70]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=1000,criterion="entropy")
model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=1000)

In [71]:
(394)/750

0.5253333333333333

In [74]:
prediction=model.predict(x_train)
print(confusion_matrix(y_train,prediction))
print('\n')
print(classification_report(y_train,prediction))

[[390   0]
 [356   4]]


              precision    recall  f1-score   support

           0       0.52      1.00      0.69       390
           1       1.00      0.01      0.02       360

    accuracy                           0.53       750
   macro avg       0.76      0.51      0.35       750
weighted avg       0.75      0.53      0.37       750

