In [23]:
import nltk
import pandas as pd
import re
import string
from nltk.stem import WordNetLemmatizer 
  


In [24]:
# Load the restaurant reviews from a tab-separated file. quoting=3 disables
# quote handling, so quote characters inside a review are kept as plain text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)


In [25]:
# Preview the first five rows to confirm the columns loaded as expected.
dataset.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Stopwords-
In computing, stop words are very common words that are filtered out before or after processing natural language data (text), because they carry little meaning on their own.
E.g. "the", "this", "are", "is", etc.

## Lemmatization- 

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. 
Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

Read more- 

https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

https://www.datacamp.com/community/tutorials/stemming-lemmatization-python


In [26]:
# The stopword corpus has to be on disk before it can be read, so fetch it here.
# This keeps the notebook runnable top-to-bottom on a fresh kernel; when the
# package is already present NLTK just reports it as up-to-date.
nltk.download('stopwords')

# English stopword list used to filter tokens during preprocessing.
stopwords = nltk.corpus.stopwords.words('english')
# Lemmatizer used to reduce each remaining token to its base form.
lemmatizer = WordNetLemmatizer()

In [27]:
# Preprocessing
# Fetch the NLTK data this cell relies on. Re-running is harmless: NLTK reports
# an already-installed package as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')  # corpus backing WordNetLemmatizer.lemmatize()

# Build the stopword lookup once instead of once per token.
stopword_set = set(stopwords)


def clean_review(text):
    """Return a cleaned, space-joined version of one review.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, tokens found in the English
    stopword list are dropped, and each remaining token is lemmatized.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopword_set
    )


# Clean every review in the dataset, in row order, rather than a hard-coded
# first 1000 rows, so the corpus always lines up with the label column.
corpus = [clean_review(text) for text in dataset['Review']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Count Vectorization

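Count vectorization converts each text into a vector of token counts: every column corresponds to a word (token) in the vocabulary, and each value is the number of times that word occurs in the text.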

Read more- https://www.educative.io/edpresso/countvectorizer-in-python

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
# Creating the Bag of Words model.
# max_features caps the vocabulary at the 2000 most frequent tokens.
cv = CountVectorizer(max_features=2000)

# X: dense document-term count matrix, one row per cleaned review.
# NOTE(review): the vocabulary is fitted on all reviews before the train/test
# split, so words that occur only in test reviews still shape the feature
# space. Consider fitting on the training reviews only.
X = cv.fit_transform(corpus).toarray()

# y: the 0/1 "Liked" labels, selected by column name rather than by position so
# the selection does not silently break if the column order changes.
y = dataset['Liked'].to_numpy()

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the reviews for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=7,
)

In [31]:
# Show the dimensions of each split in the order:
# train features, train labels, test features, test labels.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(750, 1766)
(750,)
(250, 1766)
(250,)


In [32]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Ten trees split on the entropy criterion, trained on the training split.
# The fixed random_state keeps the fitted forest repeatable between runs.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=7,
).fit(X_train, y_train)

# Predicted labels for the held-out reviews.
y_pred = classifier.predict(X_test)

In [33]:
#accuracy score and classification report

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [34]:
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.712

In [35]:
# Per-class precision, recall and F1 on the held-out reviews.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.70      0.80      0.75       134
           1       0.72      0.61      0.66       116

    accuracy                           0.71       250
   macro avg       0.71      0.71      0.71       250
weighted avg       0.71      0.71      0.71       250



In [36]:
# A single hand-written review used as a sanity check on the trained model.
# It is kept as a one-element list because the vectorizer's transform() takes
# an iterable of documents.

test = ["the food was not very good, it was very rotten and tasted bad"]

In [37]:
# Clean the raw text the same way as the training corpus (letters only,
# lowercase, stopwords removed, lemmatized) before vectorizing, so its tokens
# line up with the vocabulary the count vectorizer was fitted on.
test_stopwords = set(stopwords)
test_clean = [
    ' '.join(
        lemmatizer.lemmatize(word)
        for word in re.sub('[^a-zA-Z]', ' ', text).lower().split()
        if word not in test_stopwords
    )
    for text in test
]

# Reuse the already-fitted vectorizer: transform() only, never fit_transform().
test_vec = cv.transform(test_clean)

In [38]:
#0= not liked
#1= liked the food 

classifier.predict(test_vec)[0]

0

In [39]:
#saving the model
import pickle

In [40]:
# Output path (relative to the working directory) for the pickled classifier.
filename = 'reviews_classifier.sav'

In [41]:
# Serialize the trained classifier to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
# NOTE: only load this file back from a trusted location, because unpickling
# can execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [42]:
# Check the corpus container type before saving it (a plain Python list).

type(corpus)

list

In [43]:
# Persist the cleaned reviews as a binary pickle stream; the file is closed
# automatically when the block exits.
with open('corpus.data', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)