# A Sentiment Analysis of Yelp Reviews
## Author: Robert Surridge

In [2]:
# IMPORT NECESSARY PACKAGES

import pandas as pd
import json
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
%matplotlib inline
import time
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from itertools import product

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rsurridge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# LOAD AND MODEL YELP DATA TO PRINT OUT MACHINE LEARNING CLASSIFICATION REPORTS

# Stream the review dump in 100,000-row chunks and consume only the first
# chunk; that chunk becomes the working (class-unbalanced) sample.
# NOTE(review): the path begins with '..Downloads' (no separator after '..');
# confirm this is the intended location rather than '../Downloads'.
yelp_data = pd.read_json('..Downloads/yelp_data/yelp_academic_dataset_review.json',
                         lines=True, chunksize=100_000)
for chunk in yelp_data:
    yelp_sample_unequal = chunk
    # to_json() already returns serialized JSON text, so it is written
    # verbatim. Routing it through json.dump() would encode it a second time
    # and store one escaped string literal instead of an array of records.
    result = chunk.to_json(orient="records")
    with open("yelp_sample.json", "w") as f:
        f.write(result)
    break

# Work with the star rating as a float column.
yelp_sample_unequal['stars'] = yelp_sample_unequal['stars'].astype(float)

# Fit one English-stop-word-filtered vocabulary per n-gram size (1, 2, 3) on
# the text of the unbalanced sample.
review_text = yelp_sample_unequal.loc[:, 'text']
unigram_vocab = CountVectorizer(ngram_range=(1, 1), stop_words='english').fit(review_text)
bigram_vocab = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(review_text)
trigram_vocab = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(review_text)

# Unbalanced data: just the label and text columns of the sample.
yelp_classify = yelp_sample_unequal.loc[:, ['stars', 'text']]
x_unequal, y_unequal = yelp_classify['text'], yelp_classify['stars']

# Balanced data: keep the first `min_count` rows of every star class, where
# `min_count` is the size of the least frequent class.
unequal_count = y_unequal.value_counts()
min_count = unequal_count.min()
yelp_sample_equal = yelp_sample_unequal.groupby('stars').apply(
    lambda star_group: star_group[:min_count]
)
equal_count = yelp_sample_equal['stars'].value_counts()
x_equal, y_equal = yelp_sample_equal['text'], yelp_sample_equal['stars']

# Each entry is a (star subset, n-gram size, class balance) configuration.

# Full grid: every star subset x every n-gram size x both balance settings.
combo = [
    (stars, gram, balance)
    for stars in ('all_stars', '1_5_stars', '1_3_5_stars')
    for gram in ('unigram', 'bigram', 'trigram')
    for balance in ('equal', 'unequal')
]

# Full grid for the two reduced star subsets, plus three hand-picked
# 'all_stars' configurations.
new_combo = [
    *product(('1_5_stars', '1_3_5_stars'),
             ('unigram', 'bigram', 'trigram'),
             ('equal', 'unequal')),
    ('all_stars', 'unigram', 'equal'),
    ('all_stars', 'bigram', 'equal'),
    ('all_stars', 'unigram', 'unequal'),
]

# Unigram/bigram grid for all three star subsets, plus balanced trigram runs
# for the two reduced star subsets.
brand_new_combo = [
    *product(('1_5_stars', '1_3_5_stars', 'all_stars'),
             ('unigram', 'bigram'),
             ('equal', 'unequal')),
    ('1_5_stars', 'trigram', 'equal'),
    ('1_3_5_stars', 'trigram', 'equal'),
]

def star_df(star, df):
    """
    Return the rows of `df` that belong to the requested star grouping.

    'all_stars' returns `df` itself, unfiltered. '1_5_stars' keeps only rows
    whose 'stars' value is 1 or 5. Any other label is treated as
    '1_3_5_stars' and keeps rows whose 'stars' value is 1, 3 or 5.
    """

    if star == 'all_stars':
        return df

    wanted_stars = [1, 5] if star == '1_5_stars' else [1, 3, 5]
    return df[df['stars'].isin(wanted_stars)]
    
def n_gram_df(str, df):
    """
    Vectorize the 'text' column of `df` into n-gram counts.

    `str` names the n-gram size: 'unigram' -> 1, 'bigram' -> 2, and any other
    value is treated as 'trigram' -> 3. A CountVectorizer with English stop
    words removed is fitted on df['text'] and the resulting document-term
    count matrix (a SciPy sparse matrix, one row per review) is returned.

    NOTE: the parameter name shadows the builtin `str`; it is kept unchanged
    so existing callers keep working.
    """

    # One lookup replaces three copy-pasted branches that differed only in
    # the n-gram size; unknown labels still fall back to trigrams.
    gram_size = {'unigram': 1, 'bigram': 2}.get(str, 3)

    vectorizer = CountVectorizer(ngram_range=(gram_size, gram_size),
                                 stop_words='english')
    # fit_transform learns the vocabulary and builds the counts together,
    # instead of tokenizing the same corpus once in fit() and again in
    # transform().
    return vectorizer.fit_transform(df['text'])
    
def model_to_acuracy(model,x_train, x_test, y_train, y_test, combo):
    """
    Fit `model` on the training split, predict the test split, print the
    headline metric and the classification report, and return the metric.

    The metric depends on class balance over train + test labels combined:
    accuracy when every class has the same number of rows, weighted F1
    otherwise. It is returned as a percentage rounded to two decimals.
    `combo` is only used to label the printed output.
    """

    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # The data is balanced when all per-class counts collapse to one value.
    class_sizes = pd.concat([y_train, y_test]).value_counts()
    is_balanced = class_sizes.nunique() == 1

    print()
    if is_balanced:
        metric_label = "Accuracy Score:"
        metric_value = accuracy_score(y_test, predictions)
    else:
        metric_label = "f1_score:"
        metric_value = f1_score(y_test, predictions, average='weighted')
    score = round(metric_value * 100, 2)
    print(combo, metric_label, score)

    print()
    print(combo, "Classification Report:")
    print(classification_report(y_test, predictions))
    return score

def hyper_tuning(yelp_sample_equal, yelp_sample_unequal, model, hyper_combo, model_name):
    """
    Evaluate `model` on every configuration in `hyper_combo`.

    Each configuration is a (star, gram, equal) tuple: `equal` picks the
    balanced ('equal') or unbalanced (anything else) sample, `star` picks the
    star subset, and `gram` picks the n-gram size used to vectorize the text.
    Every configuration gets an 80/20 train/test split with a fixed
    random_state of 101, and the same `model` object is passed to
    model_to_acuracy each time.

    Prints `model_name` once, then the per-configuration runtime.

    Returns a pair of lists aligned with `hyper_combo`: the scores returned
    by model_to_acuracy, and the fit-and-evaluate runtimes in minutes.
    """

    score_lst = []
    time_lst = []
    print()
    print(model_name)
    for tup in hyper_combo:

        star, gram, equal = tup

        # Both balance settings go through the same steps; only the source
        # data frame differs.
        source_df = yelp_sample_equal if equal == 'equal' else yelp_sample_unequal
        star_class_df = star_df(star, source_df.loc[:, ['stars', 'text']])
        x_df = n_gram_df(gram, star_class_df)
        y_df = star_class_df['stars']

        x_train, x_test, y_train, y_test = train_test_split(x_df, y_df,
                                                            test_size=0.2,
                                                            random_state=101)

        # perf_counter() is a monotonic clock meant for measuring durations;
        # time.time() is wall-clock time and can jump if the system clock is
        # adjusted during a long fit.
        start_time = time.perf_counter()
        score = model_to_acuracy(model, x_train, x_test, y_train, y_test, tup)
        elapsed_minutes = (time.perf_counter() - start_time) / 60

        score_lst.append(score)
        time_lst.append(elapsed_minutes)
        print('Runtime in minutes: ', elapsed_minutes)

    return score_lst, time_lst

# Multinomial Naive Bayes is evaluated on the full configuration grid.
score, time_lst_nb = hyper_tuning(
    yelp_sample_equal=yelp_sample_equal,
    yelp_sample_unequal=yelp_sample_unequal,
    model=MultinomialNB(),
    hyper_combo=combo,
    model_name="Multinomial Naive Bayes",
)

# The tree-based models share the `brand_new_combo` configuration list.
dt_score, time_lst_dt = hyper_tuning(
    yelp_sample_equal=yelp_sample_equal,
    yelp_sample_unequal=yelp_sample_unequal,
    model=DecisionTreeClassifier(),
    hyper_combo=brand_new_combo,
    model_name="Decision Tree Classifier",
)

rf_score, time_lst_rf = hyper_tuning(
    yelp_sample_equal=yelp_sample_equal,
    yelp_sample_unequal=yelp_sample_unequal,
    model=RandomForestClassifier(),
    hyper_combo=brand_new_combo,
    model_name="Random Forest Classifier",
)

# Logistic regression is run once per iteration cap. Results are stored in
# `var_holder` under 'lr_score_<cap>' (scores) and 'time_lst_<cap>' (runtimes
# in minutes), e.g. 'lr_score_100' and 'time_lst_100'.
var_holder = {}
max_iter_list = [100,1000,10000]

# The loop variable is deliberately not called `iter`: at module level that
# would rebind the builtin iter() for every later cell.
for max_iter in max_iter_list:
    lr_scores, lr_times = hyper_tuning(yelp_sample_equal,
                                       yelp_sample_unequal,
                                       LogisticRegression(max_iter=max_iter),
                                       new_combo,
                                       f"Logistic Regression: {max_iter} Iterations")
    var_holder[f'lr_score_{max_iter}'] = lr_scores
    var_holder[f'time_lst_{max_iter}'] = lr_times


Multinomial Naive Bayes

('all_stars', 'unigram', 'equal') Accuracy Score: 51.38

('all_stars', 'unigram', 'equal') Classification Report:
              precision    recall  f1-score   support

         1.0       0.62      0.65      0.64      1626
         2.0       0.44      0.43      0.44      1584
         3.0       0.43      0.45      0.44      1650
         4.0       0.45      0.49      0.47      1596
         5.0       0.66      0.54      0.60      1532

    accuracy                           0.51      7988
   macro avg       0.52      0.51      0.52      7988
weighted avg       0.52      0.51      0.52      7988


('all_stars', 'unigram', 'unequal') f1_score: 58.44

('all_stars', 'unigram', 'unequal') Classification Report:
              precision    recall  f1-score   support

         1.0       0.58      0.69      0.63      2168
         2.0       0.36      0.22      0.27      1566
         3.0       0.38      0.26      0.31      2339
         4.0       0.45      0.56      0.