## NLP - Sentiment Analysis on Restaurant Reviews

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the reviews from the tab-separated file.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a review are kept as plain text.
dataset = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", quoting=3)

In [6]:
# Clean the text: keep letters only, lowercase, drop stop words, then stem.
import re
import nltk
nltk.download("stopwords")  # make the NLTK stop-word list available locally
from nltk.corpus import stopwords
# Stemming keeps only the root of each word, which reduces the number of
# distinct tokens and therefore the width of the sparse matrix built later.
from nltk.stem.porter import PorterStemmer

# These helpers do not change between reviews, so build them once up front
# instead of once per review (or, for the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words("english")
all_stopwords.remove("not")  # keep "not" in the text rather than filtering it out
stopword_set = set(all_stopwords)  # set gives constant-time membership checks

corpus = []  # cleaned reviews, one string per row of the dataset

# Iterate over every row rather than a hard-coded 1000, so the corpus always
# has the same number of entries as the dataset it was built from.
for raw_review in dataset["Review"]:
    review = re.sub("[^a-zA-Z]", " ", raw_review)  # every non-letter character becomes a space
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = " ".join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Build the bag-of-words model from the cleaned reviews

from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is capped at 1566 terms; the next cell reports the resulting column count.
cv = CountVectorizer(max_features=1566)
bag_of_words = cv.fit_transform(corpus)  # sparse matrix of term counts
x = bag_of_words.toarray()  # dense 2-D array for the steps that follow
y = dataset.iloc[:, -1].values  # last column of the dataset is used as the target

In [14]:
# Check how many words are in our "bag": the number of columns of the
# feature matrix. Reading it from the shape avoids indexing a row, so it
# also works when the matrix has no rows.
x.shape[1]

1566

In [16]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Several classification models could be used here.
# Per the original author's note, without any further data preprocessing
# logistic regression gets the highest accuracy, about 77%.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)
classifier

LogisticRegression(random_state=0)

In [None]:
# Predict the test set and print predictions next to the true labels
# (first column: predicted, second column: actual).
y_pred = classifier.predict(x_test)
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

In [47]:
# Evaluate on the held-out test set: confusion matrix, then overall accuracy.
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class
print(cm)

test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

[[82 14]
 [31 73]]


0.775