#### Data Science Capstone Project,  Springboard Bootcamp <br> Title: "Improving Restaurant Reputation Using Yelp User Reviews" <br> Reza Taeb <br> San Francisco, Spring 2018 

## Part 4 - Machine Learning

In [1]:
# Importing Necessary Packages and Libraries

import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Display configuration: console width and maximum column width used when
# frames are printed, plus seaborn's default plotting theme.

pd.options.display.width = 115
pd.set_option('display.max_colwidth', 30)
sns.set()

First, let's load the restaurant and review datasets that were modified in the previous parts.

***("review_restaurant_eng_processed.csv", "review_restaurant_eng_small_processed.csv" & "restaurant_eng.csv")***

In [None]:
# Load the two processed review files and the restaurant file produced by the
# previous parts of the project (paths are relative to the notebook's parent folder).

(df_review_restaurant_eng_processed,
 df_review_restaurant_eng_small_processed,
 df_restaurant_eng) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../review_restaurant_eng_processed.csv',
        '../review_restaurant_eng_small_processed.csv',
        '../restaurant_eng.csv',
    )
)

In [None]:
# General information (columns, dtypes, non-null counts) of the three datasets.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() only appends a stray "None" line.

df_restaurant_eng.info()
df_review_restaurant_eng_processed.info()
df_review_restaurant_eng_small_processed.info()

In [None]:
# Preview the first five rows of the small processed review dataset.
# The frame is the cell's last expression, so the notebook renders it as a table.

df_review_restaurant_eng_small_processed.head(5)


## 3 & 4 stars

From here, I am trying to figure out whether we can distinguish 3-star reviews from 4-star reviews just by looking at the “text” of the reviews. Therefore, I am going to focus only on the 3 and 4 star reviews.

In [None]:
# Filtering the 3 and 4 star reviews.
# The ratings are matched as integers, consistent with the `== 3` / `== 4`
# comparisons applied to this frame when it is balanced. String values
# ('3', '4') would not match an integer `stars` column on current pandas.
# NOTE(review): assumes read_csv parsed `stars` as integers - confirm with .info().

three_four_star_restaurants = df_review_restaurant_eng_small_processed[
    df_review_restaurant_eng_small_processed['stars'].isin([3, 4])
]

In [None]:
# Sanity check of the 3 & 4 star subset: schema, first rows and rating counts.

three_four_star_restaurants.info()
print(
    three_four_star_restaurants.head(5),
    three_four_star_restaurants['stars'].value_counts(),
    sep='\n',
)

Since there are almost twice as many 4 star entries as 3 star entries, it's better to make the two classes the same size (**under-sampling**) before running the ML algorithm:

In [None]:
# Make an equal sample of 3 and 4 star entries by randomly under-sampling
# the 4 star reviews down to the number of 3 star reviews.

three_star_indices = three_four_star_restaurants[three_four_star_restaurants['stars'] == 3].index
four_star_indices = three_four_star_restaurants[three_four_star_restaurants['stars'] == 4].index
no_three_star = len(three_star_indices)

# Sampling without replacement needs at least as many 4 star rows as 3 star rows.

if no_three_star > len(four_star_indices):
    raise ValueError('Cannot under-sample: fewer 4 star reviews than 3 star reviews')

# Random sample of "4 star" ratings.
# A dedicated, seeded generator keeps the balanced set identical on every
# Restart & Run All (same seed as the train/test split's random_state).

random_indices = np.random.RandomState(101).choice(four_star_indices, no_three_star, replace=False)

# Concat 3 star indices with the sampled 4 star ones

under_sample_indices = np.concatenate([three_star_indices, random_indices])

# Get balanced DataFrame

three_four_star_restaurants_balanced = three_four_star_restaurants.loc[under_sample_indices]

# check it out

print (three_four_star_restaurants_balanced['stars'].value_counts())

In [None]:
# Define text processing function 

def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Whitespace-separated words of `text` with punctuation characters
        stripped and English stop words dropped. The stop-word comparison is
        case-insensitive; the returned words keep their original casing.
    '''
    # Read the stop-word list once per call and keep it in a set: the original
    # re-read the corpus list for every single word and scanned it linearly.
    stop_words = set(stopwords.words('english'))

    # Delete every character listed in string.punctuation.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
# Define X and y for further steps:
# X (independent variable) is the 'word_list' column, y (dependent variable) is 'stars'.

X, y = (three_four_star_restaurants_balanced[column] for column in ('word_list', 'stars'))

In [None]:
# Vectorisation: learn the bag-of-words vocabulary and turn X into a
# document-term count matrix in a single pass.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split - confirm this is intended.

bow_transformer = CountVectorizer()
X = bow_transformer.fit_transform(X)

In [None]:
# Split the dataset into training and test sets:
# 30% of the rows are held out for testing; the fixed random_state makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

**Multinomial Naive Bayes** is a specialised version of Naive Bayes designed more for text documents. Let’s build a Multinomial Naive Bayes model and fit it to our training set (X_train and y_train).

In [None]:
# Training our model: fit a Multinomial Naive Bayes classifier on the training split.

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
nb  # last expression, so the notebook still displays the fitted estimator

Our model has now been trained! It’s time to see how well it predicts the ratings of previously unseen reviews (reviews from the test set). First, let’s store the predictions in a separate array called **predicts**.

In [None]:
# Testing our model: predict the star rating of every review in the held-out test set.

predicts = nb.predict(X=X_test)

Next, let’s evaluate our predictions against the actual ratings (stored in y_test) using confusion_matrix and classification_report from Scikit-learn.

In [None]:
# Evaluate the predictions against actual ratings:
# confusion matrix first, then the classification report under its title.

from sklearn.metrics import confusion_matrix, classification_report

print(
    confusion_matrix(y_test, predicts),
    '\n \n',
    '        Classification Report (3 and 4 stars reviews)',
    '\n ',
    classification_report(y_test, predicts),
    sep='\n',
)

## 4 & 5 stars

In [None]:
# Filtering the 4 and 5 star reviews.
# The ratings are matched as integers, consistent with the `== 4` / `== 5`
# comparisons applied to this frame when it is balanced. String values
# ('4', '5') would not match an integer `stars` column on current pandas.
# NOTE(review): assumes read_csv parsed `stars` as integers - confirm with .info().

four_five_star_restaurants = df_review_restaurant_eng_small_processed[
    df_review_restaurant_eng_small_processed['stars'].isin([4, 5])
]

In [None]:
# Check the 4 & 5 star ratings: number of reviews per rating.

print(four_five_star_restaurants.stars.value_counts())

Balancing (**undersampling**) the 4 and 5 star rating entries:

In [None]:
# Make an equal sample of 4 and 5 star entries by randomly under-sampling
# the 5 star reviews down to the number of 4 star reviews.

four_star_indices = four_five_star_restaurants[four_five_star_restaurants['stars'] == 4].index
five_star_indices = four_five_star_restaurants[four_five_star_restaurants['stars'] == 5].index
no_four_star = len(four_star_indices)

# Sampling without replacement needs at least as many 5 star rows as 4 star rows.

if no_four_star > len(five_star_indices):
    raise ValueError('Cannot under-sample: fewer 5 star reviews than 4 star reviews')

# Random sample of "5 star" ratings.
# A dedicated, seeded generator keeps the balanced set identical on every
# Restart & Run All (same seed as the train/test split's random_state).

random_indices = np.random.RandomState(101).choice(five_star_indices, no_four_star, replace=False)

# Concat 4 star indices with the sampled 5 star ones

under_sample_indices = np.concatenate([four_star_indices, random_indices])

# Get balanced DataFrame

four_five_star_restaurants_balanced = four_five_star_restaurants.loc[under_sample_indices]

# check it out

print (four_five_star_restaurants_balanced['stars'].value_counts())

In [None]:
# Define X and y for further steps:
# X is the 'word_list' column, y is the 'stars' column of the balanced 4 & 5 star set.

X, y = (four_five_star_restaurants_balanced[column] for column in ('word_list', 'stars'))

In [None]:
# Vectorisation: learn the bag-of-words vocabulary and turn X into a
# document-term count matrix in a single pass.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split - confirm this is intended.

bow_transformer = CountVectorizer()
X = bow_transformer.fit_transform(X)

In [None]:
# Split the dataset into training and test sets:
# 30% of the rows are held out for testing; the fixed random_state makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Training our model: fit a fresh Multinomial Naive Bayes classifier on the 4 & 5 star training split.
# NOTE(review): this import repeats the one in the 3 & 4 star section; it is kept so the cell runs on its own.

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
nb  # last expression, so the notebook still displays the fitted estimator

In [None]:
# Testing our model: predict the star rating of every review in the held-out test set.

predicts = nb.predict(X=X_test)

In [None]:
# Evaluate the predictions against actual ratings:
# confusion matrix first, then the classification report under its title.

from sklearn.metrics import confusion_matrix, classification_report

print(
    confusion_matrix(y_test, predicts),
    '\n \n',
    '        Classification Report (4 and 5 stars reviews)',
    '\n ',
    classification_report(y_test, predicts),
    sep='\n',
)

## 3 & 5 stars

In [None]:
# Filtering the 3 and 5 star reviews.
# The ratings are matched as integers, consistent with the `== 3` / `== 5`
# comparisons applied to this frame when it is balanced. String values
# ('3', '5') would not match an integer `stars` column on current pandas.
# NOTE(review): assumes read_csv parsed `stars` as integers - confirm with .info().

three_five_star_restaurants = df_review_restaurant_eng_small_processed[
    df_review_restaurant_eng_small_processed['stars'].isin([3, 5])
]

In [None]:
# Check the 3 & 5 star ratings: number of reviews per rating.

print(three_five_star_restaurants.stars.value_counts())

Balancing (**undersampling**) the 3 and 5 star rating entries:

In [None]:
# Make an equal sample of 3 and 5 star entries by randomly under-sampling
# the 5 star reviews down to the number of 3 star reviews.

three_star_indices = three_five_star_restaurants[three_five_star_restaurants['stars'] == 3].index
five_star_indices = three_five_star_restaurants[three_five_star_restaurants['stars'] == 5].index
no_three_star = len(three_star_indices)

# Sampling without replacement needs at least as many 5 star rows as 3 star rows.

if no_three_star > len(five_star_indices):
    raise ValueError('Cannot under-sample: fewer 5 star reviews than 3 star reviews')

# Random sample of "5 star" ratings.
# A dedicated, seeded generator keeps the balanced set identical on every
# Restart & Run All (same seed as the train/test split's random_state).

random_indices = np.random.RandomState(101).choice(five_star_indices, no_three_star, replace=False)

# Concat 3 star indices with the sampled 5 star ones

under_sample_indices = np.concatenate([three_star_indices, random_indices])

# Get balanced DataFrame

three_five_star_restaurants_balanced = three_five_star_restaurants.loc[under_sample_indices]

# check it out

print (three_five_star_restaurants_balanced['stars'].value_counts())

In [None]:
# Define X and y for further steps:
# X is the 'word_list' column, y is the 'stars' column of the balanced 3 & 5 star set.

X, y = (three_five_star_restaurants_balanced[column] for column in ('word_list', 'stars'))

In [None]:
# Vectorisation: learn the bag-of-words vocabulary and turn X into a
# document-term count matrix in a single pass.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split - confirm this is intended.

bow_transformer = CountVectorizer()
X = bow_transformer.fit_transform(X)

In [None]:
# Split the dataset into training and test sets:
# 30% of the rows are held out for testing; the fixed random_state makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Training our model: fit a fresh Multinomial Naive Bayes classifier on the 3 & 5 star training split.
# NOTE(review): this import repeats the one in the earlier sections; it is kept so the cell runs on its own.

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
nb  # last expression, so the notebook still displays the fitted estimator

In [None]:
# Testing our model: predict the star rating of every review in the held-out test set.

predicts = nb.predict(X=X_test)

In [None]:
# Evaluate the predictions against actual ratings:
# confusion matrix first, then the classification report under its title.

from sklearn.metrics import confusion_matrix, classification_report

print(
    confusion_matrix(y_test, predicts),
    '\n \n',
    '        Classification Report (3 and 5 stars reviews)',
    '\n ',
    classification_report(y_test, predicts),
    sep='\n',
)