# Model Testing

## Libraries

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

## Base Data Frame

In [2]:
# Load only the first 150,000 records of the line-delimited review file (nrows caps the read).
reviews = pd.read_json('../Sentimedia/data/yelp_academic_dataset_review.json', lines=True, nrows=150000)

In [3]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


In [4]:
business = pd.read_json('../Sentimedia/data/yelp_academic_dataset_business.json', lines=True)

# Keep businesses that have a categories string mentioning "Restaurants".
df_restaurants = business.dropna(subset=['categories'])
df_restaurants = df_restaurants[df_restaurants['categories'].str.contains("Restaurants")]

# Narrow down to restaurants located in Austin that are still open.
in_austin = df_restaurants['city'] == 'Austin'
still_open = df_restaurants['is_open'] == 1
austin = df_restaurants[in_austin & still_open]

In [5]:
austin.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
41,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,"7101 W Hwy 71, Ste C-13",Austin,TX,78735,30.234533,-97.877262,4.5,395,1,"{'Ambience': '{'touristy': False, 'hipster': F...","Bars, Nightlife, Cocktail Bars, Seafood, Resta...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ..."
61,bRsDZ44CD3uhGnRY3NeQhQ,Wendy's,6247 Mcneil Drive,Austin,TX,78729,30.441875,-97.746581,2.0,46,1,"{'RestaurantsPriceRange2': '1', 'OutdoorSeatin...","Fast Food, Restaurants, Burgers","{'Monday': '6:30-1:0', 'Tuesday': '6:30-1:0', ..."
199,Pk4ZwXwUU50BDn5gqw_rKg,Johnny Carino's,9500 S IH-35 Service Rd,Austin,TX,78748,30.162081,-97.789132,3.0,136,1,"{'RestaurantsGoodForGroups': 'True', 'Business...","Italian, Salad, Pizza, Nightlife, Restaurants,...","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3..."
253,Ieelu69Y23nbjKG3OGfwnw,McDonald's,5017 Hwy 290 W,Austin,TX,78735,30.232133,-97.823183,1.5,9,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsRes...","Restaurants, Coffee & Tea, Food, Burgers, Fast...","{'Monday': '7:0-22:30', 'Tuesday': '7:0-22:30'..."
258,IFB2K3BEZ2L_Mv5AbUD26Q,Chispas,214 W 4th St,Austin,TX,78701,30.266996,-97.745362,3.5,119,1,"{'RestaurantsTakeOut': 'True', 'OutdoorSeating...","Tex-Mex, Mexican, Tacos, Restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."


In [6]:
df = reviews.merge(austin, on='business_id')

In [7]:
# Features: the raw review text.
X = df.text
# Target: both merged frames have a `stars` column, so pandas suffixed them;
# `stars_x` comes from the left frame (`reviews`), i.e. the rating of the individual review,
# while `stars_y` would be the business-level value from `austin`.
y = df.stars_x

## Data Cleaning

In [8]:
import string
def clean_text(s):
    """Return ``s`` with ASCII punctuation removed and all letters lowercased.

    Punctuation characters (``string.punctuation``) are deleted outright, not
    replaced by a space; whitespace, digits and newlines are left untouched.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return s.translate(drop_punctuation).lower()

In [9]:
X = X.map(clean_text)

## Train / Test Split

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for testing. The seed is fixed so that the split,
# and every score computed from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
X_train

5920     i think ive been too stingy with the 5 star ra...
3274     amazing it was our first time to have eaten at...
5032     this is technically a review for the new burne...
10465    great company to use for catering i have worke...
1064     this place is usually busy but they manage the...
                               ...                        
6511     i love this place\n\ni am sad when mr chen run...
9966     holy shit this place was amazing i shouldve re...
1059     love everything about this place the burgers s...
7377     can you see in the dark  yes  well you should ...
8554     i came here for the first time the weekend pri...
Name: text, Length: 9490, dtype: object

## Bag of Words Modelling

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training reviews only and encode them as token counts.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_train)
# NOTE(review): .toarray() materializes the full dense (documents x vocabulary) matrix
# just to display it; X_bow.shape would be far cheaper if memory becomes a problem.
X_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Simplifying the target into binary reviews

In [13]:
def binary_review(x):
    """Collapse a star rating into a sentiment label.

    Ratings of 4 or more map to 'positive'; anything lower maps to 'negative'.
    """
    return 'positive' if x >= 4 else 'negative'

In [14]:
# Replace the star ratings with 'positive' / 'negative' labels.
# NOTE(review): this overwrites y_test / y_train in place, so the cell is not re-runnable:
# a second run would pass the string labels to binary_review, whose `x >= 4`
# comparison raises TypeError for str. Re-run the split cell first if needed.
y_test = y_test.map(binary_review)
y_train = y_train.map(binary_review)

## MultinomialNB Model

In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

nb_model = MultinomialNB()
# 5-fold cross-validation on the bag-of-words training matrix.
# NOTE(review): X_bow was produced by fitting the vectorizer on all of X_train, so each
# fold's vocabulary already includes its validation reviews (mild leakage); wrapping
# the vectorizer and model in a Pipeline, as the grid search does, avoids this.
cv_results = cross_validate(nb_model, X_bow, y_train, cv=5)

In [16]:
cv_results['test_score'].mean()

0.8644889357218124

In [17]:
nb_model.fit(X_bow, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# Encode the test reviews with the vocabulary learned on the training set
# (transform, not fit_transform, so the columns line up with X_bow).
X_bow_test = vectorizer.transform(X_test)
# NOTE(review): .toarray() densifies the whole matrix only for display.
X_bow_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
nb_model.predict(X_bow_test)

array(['positive', 'negative', 'negative', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [20]:
y_test

3656     positive
12415    negative
7975     negative
3320     positive
2600     negative
           ...   
9337     positive
5558     positive
3529     positive
10783    positive
1484     positive
Name: stars_x, Length: 4068, dtype: object

In [31]:
improv = pd.Series(['Terrible place! Disgusting food and our family had to wait a long time to get minimum service'])
improv = improv.map(clean_text)

In [32]:
improv = vectorizer.transform(improv)

In [33]:
nb_model.predict(improv)

array(['negative'], dtype='<U8')

## Saving Model Locally

In [35]:
import joblib
# The classifier only understands the feature columns produced by the fitted
# CountVectorizer, so persist the vectorizer next to it; without it the saved
# model cannot score raw text in another process.
joblib.dump(vectorizer, '../vectorizer.joblib')
# Kept as the last expression so the cell still displays the model path.
joblib.dump(nb_model, '../nb_model.joblib')

['../nb_model.joblib']

## N-gram modelling

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(ngram_range = (3,3))
X_gram_train = tf_idf_vectorizer.fit_transform(X_train)

In [39]:
cv_results = cross_validate(nb_model, X_gram_train, y_train, cv=5)
cv_results['test_score'].mean()

0.7354056902002106

## Hyperparameter Tuning

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Vectorizer and classifier are chained so that the TF-IDF vocabulary is
# re-fitted on the training part of every CV fold (raw text goes in).
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Search space: unigrams only, bigrams only or trigrams only for the vectorizer,
# and three values of the smoothing parameter alpha for the classifier (3 x 3 = 9 candidates).
parameters = {
    'tfidf__ngram_range': ((1,1), (2,2), (3,3)),
    'nb__alpha': (0.01,0.1,1),}

# Exhaustive grid search scored by accuracy with 5-fold CV, using all available cores.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, 
                           verbose=1, scoring = "accuracy", 
                           refit=True, cv=5)

grid_search.fit(X_train,y_train)
# With refit=True the best pipeline is re-fitted on all of X_train and exposed as grid_search.best_estimator_.

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.9s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [45]:
grid_search.best_params_

{'nb__alpha': 0.01, 'tfidf__ngram_range': (2, 2)}

In [46]:
grid_search.best_score_

0.8557428872497365