# Predicting Ratings from Review Text

In [1]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
# Seaborn defaults for every figure in this notebook: 'whitegrid' style, 'poster' context.
sns.set_style(style='whitegrid')
sns.set_context(context='poster')

In [2]:
def time_marker(text=''):
    """Print ``text`` prefixed with the current wall-clock time in square brackets.

    ``text`` may be any object; it is rendered through ``str.format``.
    """
    now = datetime.datetime.now().time()
    message = '[{}] {}'.format(now, text)
    print(message)

In [3]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated since IPython 7.14.
from IPython.display import display, HTML

# Widen the notebook container so wide outputs use most of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

import matplotlib

# Global default font size for matplotlib text.
font = {'size': 50}
matplotlib.rc('font', **font)

# Shared font sizes for plot titles, axis labels and tick labels.
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE = 15

# Figure sizes as (width, height).
FIG_SIZE = (15, 6)
FIG_SIZE_SHORT = (15, 3)

# Load Reviews

## All Reviews

In [4]:
time_marker('loading all reviews...')
# The first CSV column becomes the index; the 'date' column is parsed as datetimes.
reviews = pd.read_csv(
    './clean_data/_analysis/restaurant_reviews_top_countries.csv',
    index_col=0,
    parse_dates=['date'],
)
time_marker('done')

[17:28:25.968409] loading all reviews...


  mask |= (ar1 == a)


[17:28:50.371656] done


## Reviews by Country

In [5]:
# One independent, freshly 0..n-1 indexed frame per country.
# Boolean selection followed by reset_index(drop=True) yields a new frame,
# so the full `reviews` frame is never modified.
time_marker('subsetting United States Reviews.. ')
us_reviews = reviews.loc[reviews['country'] == 'United States'].reset_index(drop=True)

time_marker('subsetting Canada Reviews.. ')
can_reviews = reviews.loc[reviews['country'] == 'Canada'].reset_index(drop=True)

time_marker('subsetting Germany Reviews.. ')
ger_reviews = reviews.loc[reviews['country'] == 'Germany'].reset_index(drop=True)

time_marker('subsetting United Kingdom Reviews.. ')
uk_reviews = reviews.loc[reviews['country'] == 'United Kingdom'].reset_index(drop=True)

time_marker('done!')

[17:28:50.402741] subsetting United States Reviews.. 
[17:28:51.066582] subsetting Canada Reviews.. 
[17:28:51.331676] subsetting Germany Reviews.. 
[17:28:51.478187] subsetting United Kingdom Reviews.. 
[17:28:51.629948] done!


In [6]:
# Review count and percentage share of the full dataset for each subset.
# Width specifiers keep the two numeric columns aligned whatever the number of
# digits; previously the padding was typed by hand per row and only lined up
# for this particular dataset.
total_reviews = reviews.shape[0]
for region_label, region_reviews in (
        ('All Countries', reviews),
        ('United States', us_reviews),
        ('Canada', can_reviews),
        ('Germany', ger_reviews),
        ('United Kingdom', uk_reviews)):
    region_count = region_reviews.shape[0]
    region_share = region_count / total_reviews * 100.
    print('{:>9d} {:>8.4f} reviews from {}'.format(region_count, region_share, region_label))

  1676714 100.0000 reviews from All Countries
  1323672  78.9444 reviews from United States
   327956  19.5594 reviews from Canada
    13261   0.7909 reviews from Germany
    11825   0.7052 reviews from United Kingdom


# Labels and Features

In [7]:
# For every dataset: features are the review texts, labels are the matching
# star ratings. Both are materialised as plain Python lists.
texts_all, stars_all = list(reviews['text']), list(reviews['stars'])
texts_us, stars_us = list(us_reviews['text']), list(us_reviews['stars'])
texts_can, stars_can = list(can_reviews['text']), list(can_reviews['stars'])
texts_ger, stars_ger = list(ger_reviews['text']), list(ger_reviews['stars'])
texts_uk, stars_uk = list(uk_reviews['text']), list(uk_reviews['stars'])


# Balance Data
<p>Take an equal number of reviews from each star rating; this helps avoid bias towards ratings with more reviews.</p>

In [8]:
from collections import Counter
 
def balance_classes(xs, ys):
    """Undersample paired samples so that every class keeps the same count.

    Each class is capped at the size of the rarest class. Samples are kept in
    their original order and the *first* occurrences of each class are the
    ones retained (no shuffling or random sampling is performed).

    Parameters
    ----------
    xs : sequence
        Samples (e.g. review texts), positionally paired with ``ys``.
    ys : sequence
        Hashable class label for each sample.

    Returns
    -------
    (list, list)
        The retained samples and their labels. Two empty lists are returned
        for empty input.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` differ in length.
    """
    if len(xs) != len(ys):
        raise ValueError(
            'xs and ys must have the same length, got {} and {}'.format(len(xs), len(ys)))

    freqs = Counter(ys)
    if not freqs:
        # Nothing to balance; previously this raised IndexError.
        return [], []

    # the least common class is the maximum number we want for all classes
    max_allowable = min(freqs.values())
    num_added = dict.fromkeys(freqs, 0)
    new_xs = []
    new_ys = []
    # zip pairs items by position, independent of how xs implements indexing
    for x, y in zip(xs, ys):
        if num_added[y] < max_allowable:
            new_xs.append(x)
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

In [9]:
def _balance_and_report(texts, stars):
    """Balance one dataset, logging its class counts before and after."""
    time_marker(Counter(stars))
    balanced_texts, balanced_stars = balance_classes(texts, stars)
    time_marker(Counter(balanced_stars))
    return balanced_texts, balanced_stars


time_marker('All Reviews')
bal_texts_all, bal_stars_all = _balance_and_report(texts_all, stars_all)

print('')
time_marker('United States Reviews')
bal_texts_us, bal_stars_us = _balance_and_report(texts_us, stars_us)

print('')
time_marker('Canada Reviews')
bal_texts_can, bal_stars_can = _balance_and_report(texts_can, stars_can)

print('')
time_marker('Germany Reviews')
bal_texts_ger, bal_stars_ger = _balance_and_report(texts_ger, stars_ger)

print('')
time_marker('United Kingdom Reviews')
bal_texts_uk, bal_stars_uk = _balance_and_report(texts_uk, stars_uk)

[17:28:52.376209] All Reviews
[17:28:52.512527] Counter({5: 603049, 4: 469453, 3: 241338, 1: 195444, 2: 167430})
[17:28:53.287151] Counter({3: 167430, 1: 167430, 2: 167430, 4: 167430, 5: 167430})

[17:28:53.287443] United States Reviews
[17:28:53.385059] Counter({5: 503328, 4: 353557, 3: 177083, 1: 159700, 2: 130004})
[17:28:54.006007] Counter({1: 130004, 4: 130004, 2: 130004, 3: 130004, 5: 130004})

[17:28:54.006300] Canada Reviews
[17:28:54.031086] Counter({4: 107626, 5: 91133, 3: 60086, 2: 35184, 1: 33927})
[17:28:54.192971] Counter({3: 33927, 1: 33927, 2: 33927, 4: 33927, 5: 33927})

[17:28:54.193317] Germany Reviews
[17:28:54.194313] Counter({5: 4499, 4: 4061, 3: 2138, 2: 1409, 1: 1154})
[17:28:54.199817] Counter({5: 1154, 4: 1154, 2: 1154, 3: 1154, 1: 1154})

[17:28:54.200093] United Kingdom Reviews
[17:28:54.201069] Counter({4: 4209, 5: 4089, 3: 2031, 2: 833, 1: 663})
[17:28:54.204840] Counter({4: 663, 5: 663, 1: 663, 2: 663, 3: 663})


# Vectorize Text

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# TF-IDF vectorizer over single words and bi-grams (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

In [12]:
# NOTE(review): the vectorizer is fitted on the whole balanced corpus, i.e. before
# the train/test split, so its vocabulary and IDF weights also see the test rows.
# The per-country cells call fit_transform on this same `vectorizer` again, which
# re-fits it; confirm nothing downstream needs this fitted state for clf_all.
time_marker('vectorizing {:d} review texts...'.format(len(bal_texts_all)))
vectors_all = vectorizer.fit_transform(raw_documents=bal_texts_all)
time_marker(text='done')

[17:28:54.488505] vectorizing 837150 review texts...
[17:37:18.148617] done


In [13]:
# NOTE(review): fit_transform re-fits the shared `vectorizer` on the US texts only,
# replacing whatever vocabulary it held before this cell.
time_marker('vectorizing {:d} united states review texts...'.format(len(bal_texts_us)))
vectors_us = vectorizer.fit_transform(raw_documents=bal_texts_us)
time_marker(text='done')

[17:37:18.288498] vectorizing 650020 united states review texts...
[17:43:51.024587] done


In [14]:
# NOTE(review): fit_transform re-fits the shared `vectorizer` on the Canada texts only,
# replacing whatever vocabulary it held before this cell.
time_marker('vectorizing {:d} canada review texts...'.format(len(bal_texts_can)))
vectors_can = vectorizer.fit_transform(raw_documents=bal_texts_can)
time_marker(text='done')

[17:43:51.088224] vectorizing 169635 canada review texts...
[17:45:39.199268] done


In [15]:
# NOTE(review): fit_transform re-fits the shared `vectorizer` on the Germany texts only,
# replacing whatever vocabulary it held before this cell.
time_marker('vectorizing {:d} germany review texts...'.format(len(bal_texts_ger)))
vectors_ger = vectorizer.fit_transform(raw_documents=bal_texts_ger)
time_marker(text='done')

[17:45:39.225239] vectorizing 5770 germany review texts...
[17:45:43.444540] done


In [16]:
# NOTE(review): fit_transform re-fits the shared `vectorizer` on the UK texts only,
# so from here on `vectorizer` holds the UK vocabulary.
time_marker('vectorizing {:d} united kingdom review texts...'.format(len(bal_texts_uk)))
vectors_uk = vectorizer.fit_transform(raw_documents=bal_texts_uk)
time_marker(text='done')

[17:45:43.469205] vectorizing 3315 united kingdom review texts...
[17:45:45.548397] done


# Train and Test Splits

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
time_marker('splitting all reviews into train and test sets...')
# 70/30 hold-out with a fixed seed. No stratify argument is passed, so the
# split is purely random with respect to the star labels.
(balanced_X_train_all,
 balanced_X_test_all,
 balanced_y_train_all,
 balanced_y_test_all) = train_test_split(
    vectors_all, bal_stars_all, test_size=0.3, random_state=42)
time_marker('done!')

[17:45:45.754249] splitting all reviews into train and test sets...
[17:46:19.384215] done!


In [19]:
time_marker('splitting united states reviews into train and test sets...')
# 70/30 hold-out with a fixed seed. No stratify argument is passed, so the
# split is purely random with respect to the star labels.
(balanced_X_train_us,
 balanced_X_test_us,
 balanced_y_train_us,
 balanced_y_test_us) = train_test_split(
    vectors_us, bal_stars_us, test_size=0.3, random_state=42)
time_marker('done!')

[17:46:19.425526] splitting united states reviews into train and test sets...
[17:46:45.491609] done!


In [20]:
time_marker('splitting canada reviews into train and test sets...')
# 70/30 hold-out with a fixed seed. No stratify argument is passed, so the
# split is purely random with respect to the star labels.
(balanced_X_train_can,
 balanced_X_test_can,
 balanced_y_train_can,
 balanced_y_test_can) = train_test_split(
    vectors_can, bal_stars_can, test_size=0.3, random_state=42)
time_marker('done!')

[17:46:45.537308] splitting canada reviews into train and test sets...
[17:46:51.526957] done!


In [21]:
time_marker('splitting germany reviews into train and test sets...')
# 70/30 hold-out with a fixed seed. No stratify argument is passed, so the
# split is purely random with respect to the star labels.
(balanced_X_train_ger,
 balanced_X_test_ger,
 balanced_y_train_ger,
 balanced_y_test_ger) = train_test_split(
    vectors_ger, bal_stars_ger, test_size=0.3, random_state=42)
time_marker('done!')

[17:46:51.547341] splitting germany reviews into train and test sets...
[17:46:51.714236] done!


In [22]:
time_marker('splitting united kingdom reviews into train and test sets...')
# 70/30 hold-out with a fixed seed. No stratify argument is passed, so the
# split is purely random with respect to the star labels.
(balanced_X_train_uk,
 balanced_X_test_uk,
 balanced_y_train_uk,
 balanced_y_test_uk) = train_test_split(
    vectors_uk, bal_stars_uk, test_size=0.3, random_state=42)
time_marker('done!')

[17:46:51.735937] splitting united kingdom reviews into train and test sets...
[17:46:51.839769] done!


# SVM Classifier

In [23]:
from sklearn.svm import LinearSVC

In [24]:
# train classifier
clf_all = LinearSVC()
time_marker('training all reviews classifier...')
clf_all.fit(balanced_X_train_all, balanced_y_train_all)
time_marker('done!')

[17:46:52.360639] training all reviews classifier...
[17:51:33.768474] done!


In [25]:
# Linear SVM for the United States subset.
# random_state pins the pseudo-random shuffling LinearSVC's solver may use, so a
# re-run yields the same model; 42 is the seed already used for train_test_split.
clf_us = LinearSVC(random_state=42)
time_marker('training united states reviews classifier...')
clf_us.fit(balanced_X_train_us, balanced_y_train_us)
time_marker('done!')

[17:51:33.821869] training united states reviews classifier...
[17:54:44.629736] done!


In [26]:
# Linear SVM for the Canada subset.
# random_state pins the pseudo-random shuffling LinearSVC's solver may use, so a
# re-run yields the same model; 42 is the seed already used for train_test_split.
clf_can = LinearSVC(random_state=42)
time_marker('training canada reviews classifier...')
clf_can.fit(balanced_X_train_can, balanced_y_train_can)
time_marker('done!')

[17:54:44.647175] training canada reviews classifier...
[17:55:28.239121] done!


In [27]:
# Linear SVM for the Germany subset.
# random_state pins the pseudo-random shuffling LinearSVC's solver may use, so a
# re-run yields the same model; 42 is the seed already used for train_test_split.
clf_ger = LinearSVC(random_state=42)
time_marker('training germany reviews classifier...')
clf_ger.fit(balanced_X_train_ger, balanced_y_train_ger)
time_marker('done!')

[17:55:28.252262] training germany reviews classifier...
[17:55:29.235877] done!


In [28]:
# Linear SVM for the United Kingdom subset.
# random_state pins the pseudo-random shuffling LinearSVC's solver may use, so a
# re-run yields the same model; 42 is the seed already used for train_test_split.
clf_uk = LinearSVC(random_state=42)
time_marker('training united kingdom reviews classifier...')
clf_uk.fit(balanced_X_train_uk, balanced_y_train_uk)
time_marker('done!')

[17:55:29.249164] training united kingdom reviews classifier...
[17:55:29.749845] done!


# Make Predictions

In [29]:
# make predictions
balanced_pred_y_all = clf_all.predict(balanced_X_test_all)
print('predictions {}'.format(list(balanced_pred_y_all)[:10]))
print('actual      {}'.format(balanced_y_test_all[:10]))

predictions [4, 1, 5, 5, 1, 2, 2, 4, 1, 3]
actual      [4, 1, 5, 5, 1, 2, 2, 4, 1, 3]


In [30]:
# United States: predict the held-out split and compare the first ten rows.
balanced_pred_y_us = clf_us.predict(balanced_X_test_us)
print('predictions {}'.format(list(balanced_pred_y_us[:10])))
print('actual      {}'.format(balanced_y_test_us[:10]))

predictions [3, 1, 1, 1, 5, 3, 5, 5, 5, 1]
actual      [3, 1, 1, 1, 5, 3, 5, 5, 5, 1]


In [31]:
# Canada: predict the held-out split and compare the first ten rows.
balanced_pred_y_can = clf_can.predict(balanced_X_test_can)
print('predictions {}'.format(list(balanced_pred_y_can[:10])))
print('actual      {}'.format(balanced_y_test_can[:10]))

predictions [1, 4, 4, 2, 5, 1, 1, 5, 1, 4]
actual      [2, 4, 5, 2, 4, 1, 3, 5, 1, 4]


In [32]:
# Germany: predict the held-out split and compare the first ten rows.
balanced_pred_y_ger = clf_ger.predict(balanced_X_test_ger)
print('predictions {}'.format(list(balanced_pred_y_ger[:10])))
print('actual      {}'.format(balanced_y_test_ger[:10]))

predictions [3, 2, 2, 2, 5, 2, 1, 1, 3, 5]
actual      [4, 2, 3, 3, 3, 3, 4, 1, 3, 4]


In [33]:
# United Kingdom: predict the held-out split and compare the first ten rows.
balanced_pred_y_uk = clf_uk.predict(balanced_X_test_uk)
print('predictions {}'.format(list(balanced_pred_y_uk[:10])))
print('actual      {}'.format(balanced_y_test_uk[:10]))

predictions [3, 5, 4, 2, 4, 5, 1, 3, 1, 1]
actual      [3, 5, 5, 2, 5, 3, 1, 4, 1, 1]


# Evaluation of the Model

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
# Accuracy on each dataset's held-out split: (output template, true labels, predictions).
for score_template, y_actual, y_predicted in (
        ('All Reviews Accuracy Score:    {:2.4f}', balanced_y_test_all, balanced_pred_y_all),
        ('United States Accuracy Score:  {:2.4f}', balanced_y_test_us, balanced_pred_y_us),
        ('Canada Accuracy Score:         {:2.4f}', balanced_y_test_can, balanced_pred_y_can),
        ('Germany Accuracy Score:        {:2.4f}', balanced_y_test_ger, balanced_pred_y_ger),
        ('United Kingdom Accuracy Score: {:2.4f}', balanced_y_test_uk, balanced_pred_y_uk)):
    print(score_template.format(accuracy_score(y_actual, y_predicted)))

All Reviews Accuracy Score:    0.6193
United States Accuracy Score:  0.6245
Canada Accuracy Score:         0.6000
Germany Accuracy Score:        0.4864
United Kingdom Accuracy Score: 0.5387


# Evaluation Report

In [36]:
from sklearn.metrics import classification_report

In [37]:
# Per-class precision / recall / F1 on the held-out split of all reviews.
print(classification_report(y_true=balanced_y_test_all, y_pred=balanced_pred_y_all))

             precision    recall  f1-score   support

          1       0.71      0.77      0.74     50066
          2       0.57      0.52      0.54     50489
          3       0.57      0.54      0.56     50450
          4       0.54      0.52      0.53     49862
          5       0.68      0.74      0.71     50278

avg / total       0.61      0.62      0.62    251145



In [38]:
# Per-class precision / recall / F1 on the held-out United States split.
print(classification_report(y_true=balanced_y_test_us, y_pred=balanced_pred_y_us))

             precision    recall  f1-score   support

          1       0.70      0.77      0.74     39001
          2       0.56      0.51      0.53     39212
          3       0.57      0.55      0.56     38727
          4       0.57      0.54      0.55     39194
          5       0.70      0.75      0.72     38872

avg / total       0.62      0.62      0.62    195006



In [39]:
# Per-class precision / recall / F1 on the held-out Canada split.
print(classification_report(y_true=balanced_y_test_can, y_pred=balanced_pred_y_can))

             precision    recall  f1-score   support

          1       0.71      0.78      0.75     10348
          2       0.55      0.52      0.54     10258
          3       0.54      0.51      0.53     10188
          4       0.51      0.47      0.49     10142
          5       0.65      0.72      0.68      9955

avg / total       0.59      0.60      0.60     50891



In [40]:
# Per-class precision / recall / F1 on the held-out Germany split.
print(classification_report(y_true=balanced_y_test_ger, y_pred=balanced_pred_y_ger))

             precision    recall  f1-score   support

          1       0.50      0.69      0.58       318
          2       0.42      0.35      0.38       360
          3       0.46      0.37      0.41       356
          4       0.46      0.37      0.41       359
          5       0.55      0.68      0.61       338

avg / total       0.48      0.49      0.48      1731



In [41]:
# Per-class precision / recall / F1 on the held-out United Kingdom split.
print(classification_report(y_true=balanced_y_test_uk, y_pred=balanced_pred_y_uk))

             precision    recall  f1-score   support

          1       0.60      0.80      0.69       193
          2       0.45      0.39      0.42       194
          3       0.51      0.40      0.44       202
          4       0.46      0.45      0.46       207
          5       0.63      0.66      0.64       199

avg / total       0.53      0.54      0.53       995

