## Modeling a Multi-Class Prediction with Stemmed Yelp Reviews

In [1]:
# imports libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## week 3 imports
import missingno as msno     
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Linear and general modeling imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Feature Engineering
from sklearn.preprocessing import StandardScaler, PolynomialFeatures   # Scale/transform/feature engineering
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
# imports mice
# from impyute.imputation.cs import mice

import patsy
# y, X = patsy.dmatrices(formula, data=diamonds, return_type='dataframe')

# GridSearch and Hyperparameter Tuning
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

# Logistic and Classification metrics
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, plot_roc_curve, roc_auc_score, recall_score, precision_score, f1_score, classification_report

# K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler

# naive bayes imports
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# SVMs
from sklearn.svm import LinearSVC, SVC

# Decision Trees
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

# Import Bagging, Boosting, and Random Forests, and ExtraTrees (Extremely Randomized Trees)
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor

# NLP imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
# nltk.download() #  --> Download all, and then restart jupyter lab
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, pos_tag
import re

import json


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Loading the Data and Splitting Data

In [2]:
# Reads the stemmed-review CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='../../Data/reviews_stemmed.csv')

In [3]:
# Previews the first rows to confirm the expected columns loaded
df.head()

Unnamed: 0,business_id,name,review_id,review_stars,text,amb_casual,amb_classy,amb_target,text_length,clean_text,clean_text_length,clean_text_stem,clean_text_stem_length
0,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,wve8w6gIuPpCfo5J--AHjg,3,"The menu sounded promising, with over fifty di...",0.0,0.0,0,121,menu sounded promising fifty different dishes ...,68,menu sound promis fifti differ dish differ sty...,66
1,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,5rFuHGGbimVxPHxgM0sNSA,3,This wasn't the worst Chinese food but it wasn...,0.0,0.0,0,78,wasn' worst chinese food wasn' best egg foo yo...,41,worst chines food best egg foo young dri overc...,39
2,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,2iD3Rdbw0DUzjZSqBq3hXQ,1,I have been coming to this restaurant for over...,0.0,0.0,0,52,coming restaurant 20 years purchased shrimp fr...,27,come restaur 20 year purchas shrimp fri rice g...,26
3,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,e61y5ZlNwg04mAGtcD3vbQ,5,My husband and I love this place.\nGreat price...,0.0,0.0,0,23,husband love place great price lot food make s...,13,husband love place great price lot food make s...,12
4,kZFTi8FKjs30EuzurZ3v3g,Donerick's Pub,38lN2ONaypsfBDLwhGxcSg,5,Great place for beverages with your friends wh...,0.0,0.0,0,61,great place beverages friends watch game lots ...,43,great place beverag friend watch game lot tv g...,43


In [76]:
# Checks the baseline: the share of each amb_target class in the full data.
# Always guessing the most frequent class would score that class's share.
df['amb_target'].value_counts(normalize=True)

1    0.494773
3    0.210665
2    0.156087
0    0.138475
Name: amb_target, dtype: float64

In [77]:
# Draws a reproducible 150-row subset so the model comparison below runs quickly
df_sample = df.sample(n=150, random_state=42)

In [78]:
# Picks the stemmed review text as the feature and amb_target as the label
X, y = df_sample['clean_text_stem'], df_sample['amb_target']

In [79]:
# Stratified train/test split of the sample, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [80]:
# Instantiates TfidfVectorizer with its default settings
tvec = TfidfVectorizer()

In [81]:
# Fits the vectorizer on the training text only, then reuses that vocabulary
# for the test text. The matrices get their own names so X_train / X_test keep
# the raw text and this cell gives the same result when it is re-run.
X_train_tfidf = tvec.fit_transform(X_train)
X_test_tfidf = tvec.transform(X_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement whenever the installed version provides it
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# Turns it into a dataframe
X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=feature_names)

### Modeling - with all classifiers

In [82]:
# Baseline accuracy: class shares in the sample's test split; the largest share
# is what always guessing the most frequent class would score
y_test.value_counts(normalize=True)

1    0.447368
3    0.210526
2    0.184211
0    0.157895
Name: amb_target, dtype: float64

In [83]:
# Builds one instance of each candidate classifier, all seeded for repeatability
tree = DecisionTreeClassifier(random_state=42)
tree_pruned = DecisionTreeClassifier(random_state=42, max_depth=5)
svc = SVC(random_state=42)
bagging = BaggingClassifier(n_estimators=100, random_state=42)
rf = RandomForestClassifier(random_state=42)
rf_pruned = RandomForestClassifier(random_state=42, max_depth=5)
boost = AdaBoostClassifier(random_state=42, n_estimators=100)

In [84]:
# Collects the classifiers in the order they should appear in the results table
class_list = [
    tree,
    tree_pruned,
    svc,
    bagging,
    rf,
    rf_pruned,
    boost,
]

In [85]:
# Final full classification function
def modeling_class(class_list, X=None, y=None, use_split_data=False, standard_scale=False, X_train=None, X_test=None, y_train=None, y_test=None, f1_average='micro'):
    """Fit every classifier in ``class_list`` and tabulate train/test accuracy and F1.

    Relies on the notebook's top-level imports (``pd``, ``train_test_split``,
    ``StandardScaler``, ``f1_score``) rather than importing inside the function.

    Parameters
    ----------
    class_list : list
        Estimators exposing ``fit``, ``score`` and ``predict``. Each one is
        fitted in place on the training data.
    X, y : optional
        Full features and target. Required when ``use_split_data`` is False;
        they are split with a stratified ``train_test_split(random_state=42)``.
    use_split_data : bool, default False
        When True, skip the internal split and use ``X_train``, ``X_test``,
        ``y_train`` and ``y_test`` exactly as passed.
    standard_scale : bool, default False
        When True, fit a ``StandardScaler`` on the training features and apply
        it to both sets. This now also applies to pre-split data, which was
        previously left unscaled without warning.
    X_train, X_test, y_train, y_test : optional
        Pre-split data, required when ``use_split_data`` is True.
    f1_average : str, default 'micro'
        Passed to ``f1_score(average=...)``. With single-label multi-class
        targets the micro average equals accuracy, so pass e.g. ``'macro'`` or
        ``'weighted'`` to get a column that differs from the accuracy columns.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, indexed by ``str(classifier)``, with columns
        ``train_acc``, ``test_acc``, ``f1_train`` and ``f1_test``.

    Raises
    ------
    ValueError
        If the data required by the chosen mode was not supplied. Any error
        raised while splitting, scaling or fitting propagates unchanged instead
        of being replaced by a generic message.
    """
    metric_columns = ['train_acc', 'test_acc', 'f1_train', 'f1_test']

    if use_split_data:
        supplied = {'X_train': X_train, 'X_test': X_test,
                    'y_train': y_train, 'y_test': y_test}
        missing = [name for name, value in supplied.items() if value is None]
        if missing:
            raise ValueError('use_split_data=True requires ' + ', '.join(missing))
    else:
        if X is None or y is None:
            raise ValueError('You need to set "use_split_data"=True if you want to pass already train-test split X and y')
        # train-test-split
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            stratify=y,
                                                            random_state=42)

    if standard_scale:
        # Keep column labels when the features are a DataFrame; plain arrays
        # fall back to default integer column labels.
        columns = getattr(X_train, 'columns', None)
        sc = StandardScaler()
        X_train_sc = sc.fit_transform(X_train)
        X_test_sc = sc.transform(X_test)
        X_train = pd.DataFrame(X_train_sc, columns=columns)
        X_test = pd.DataFrame(X_test_sc, columns=columns)
        print('Used standard scale')

    # for each classifier fit and score
    rows = []
    for classifier in class_list:
        classifier.fit(X_train, y_train)
        rows.append({
            'train_acc': classifier.score(X_train, y_train),
            'test_acc': classifier.score(X_test, y_test),
            'f1_train': f1_score(y_train, classifier.predict(X_train), average=f1_average),
            'f1_test': f1_score(y_test, classifier.predict(X_test), average=f1_average),
        })

    # combine into dataframe
    return pd.DataFrame(rows,
                        columns=metric_columns,
                        index=[str(cl) for cl in class_list])

In [86]:
# Evaluates every classifier in the list on the pre-split TF-IDF sample
modeling_class(
    class_list=class_list,
    use_split_data=True,
    standard_scale=False,
    X_train=X_train_df,
    X_test=X_test_df,
    y_train=y_train,
    y_test=y_test,
)

Unnamed: 0,train_acc,test_acc,f1_train,f1_test
DecisionTreeClassifier(random_state=42),1.0,0.394737,1.0,0.394737
"DecisionTreeClassifier(max_depth=5, random_state=42)",0.642857,0.447368,0.642857,0.447368
SVC(random_state=42),1.0,0.447368,1.0,0.447368
"BaggingClassifier(n_estimators=100, random_state=42)",1.0,0.473684,1.0,0.473684
RandomForestClassifier(random_state=42),1.0,0.5,1.0,0.5
"RandomForestClassifier(max_depth=5, random_state=42)",0.535714,0.447368,0.535714,0.447368
"AdaBoostClassifier(n_estimators=100, random_state=42)",0.633929,0.473684,0.633929,0.473684


### Modeling with Random Forest Classifier with stemmed words

In [87]:
# Re-displays the first rows of the full data before modeling on all of it
df.head()

Unnamed: 0,business_id,name,review_id,review_stars,text,amb_casual,amb_classy,amb_target,text_length,clean_text,clean_text_length,clean_text_stem,clean_text_stem_length
0,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,wve8w6gIuPpCfo5J--AHjg,3,"The menu sounded promising, with over fifty di...",0.0,0.0,0,121,menu sounded promising fifty different dishes ...,68,menu sound promis fifti differ dish differ sty...,66
1,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,5rFuHGGbimVxPHxgM0sNSA,3,This wasn't the worst Chinese food but it wasn...,0.0,0.0,0,78,wasn' worst chinese food wasn' best egg foo yo...,41,worst chines food best egg foo young dri overc...,39
2,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,2iD3Rdbw0DUzjZSqBq3hXQ,1,I have been coming to this restaurant for over...,0.0,0.0,0,52,coming restaurant 20 years purchased shrimp fr...,27,come restaur 20 year purchas shrimp fri rice g...,26
3,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,e61y5ZlNwg04mAGtcD3vbQ,5,My husband and I love this place.\nGreat price...,0.0,0.0,0,23,husband love place great price lot food make s...,13,husband love place great price lot food make s...,12
4,kZFTi8FKjs30EuzurZ3v3g,Donerick's Pub,38lN2ONaypsfBDLwhGxcSg,5,Great place for beverages with your friends wh...,0.0,0.0,0,61,great place beverages friends watch game lots ...,43,great place beverag friend watch game lot tv g...,43


In [88]:
# (rows, columns) of the full data
df.shape

(86189, 13)

In [89]:
# Uses the stemmed review text of the full data as the feature, amb_target as the label
X, y = df['clean_text_stem'], df['amb_target']

In [90]:
# Stratified train/test split of the full data, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [91]:
# Instantiates TfidfVectorizer
tvec = TfidfVectorizer()

# Fits the vectorizer on the training text only, then reuses that vocabulary
# for the test text. The matrices get their own names so X_train / X_test keep
# the raw text and this cell gives the same result when it is re-run.
X_train_tfidf = tvec.fit_transform(X_train)
X_test_tfidf = tvec.transform(X_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement whenever the installed version provides it
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# Turns it into a dataframe
# NOTE(review): toarray() builds a dense (documents x vocabulary) array for the
# full data set, which can need a lot of memory — consider passing the sparse
# matrices to the model directly.
X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=feature_names)


In [92]:
# Depth-limited random forest, seeded for repeatability
rf = RandomForestClassifier(
    random_state=42,
    max_depth=5,
)

In [93]:
# Fits the random forest on the dense TF-IDF training features (stemmed text)
rf.fit(X_train_df, y_train)

RandomForestClassifier(max_depth=5, random_state=42)

In [94]:
# Scores the random forest fit as (train accuracy, test accuracy).
# NOTE(review): the recorded test score (0.49476) equals the share of class 1
# in y_test shown in the baseline cell below, so this model does no better than
# always guessing the majority class — inspect a confusion matrix to confirm.
rf.score(X_train_df, y_train), rf.score(X_test_df, y_test)

(0.49480979564053773, 0.4947558938184518)

In [95]:
# Finds the cross-validation score 
# cross_val_score(rf, X_train_df, y_train).mean()

In [96]:
# Baseline: class shares in y_test; the largest share is the accuracy of
# always guessing that class
y_test.value_counts(normalize=True)

1    0.494756
3    0.210692
2    0.156070
0    0.138482
Name: amb_target, dtype: float64

In [97]:
# Absolute number of test reviews in each class
y_test.value_counts()

1    10661
3     4540
2     3363
0     2984
Name: amb_target, dtype: int64

### Modeling with Random Forest Classifier with original text with stopwords removed

In [98]:
# Re-displays the first rows; this section models on the clean_text column
df.head()

Unnamed: 0,business_id,name,review_id,review_stars,text,amb_casual,amb_classy,amb_target,text_length,clean_text,clean_text_length,clean_text_stem,clean_text_stem_length
0,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,wve8w6gIuPpCfo5J--AHjg,3,"The menu sounded promising, with over fifty di...",0.0,0.0,0,121,menu sounded promising fifty different dishes ...,68,menu sound promis fifti differ dish differ sty...,66
1,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,5rFuHGGbimVxPHxgM0sNSA,3,This wasn't the worst Chinese food but it wasn...,0.0,0.0,0,78,wasn' worst chinese food wasn' best egg foo yo...,41,worst chines food best egg foo young dri overc...,39
2,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,2iD3Rdbw0DUzjZSqBq3hXQ,1,I have been coming to this restaurant for over...,0.0,0.0,0,52,coming restaurant 20 years purchased shrimp fr...,27,come restaur 20 year purchas shrimp fri rice g...,26
3,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,e61y5ZlNwg04mAGtcD3vbQ,5,My husband and I love this place.\nGreat price...,0.0,0.0,0,23,husband love place great price lot food make s...,13,husband love place great price lot food make s...,12
4,kZFTi8FKjs30EuzurZ3v3g,Donerick's Pub,38lN2ONaypsfBDLwhGxcSg,5,Great place for beverages with your friends wh...,0.0,0.0,0,61,great place beverages friends watch game lots ...,43,great place beverag friend watch game lot tv g...,43


In [99]:
# Uses the unstemmed clean_text column of the full data as the feature, amb_target as the label
X, y = df['clean_text'], df['amb_target']

In [100]:
# Stratified train/test split of the full data, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Instantiates TfidfVectorizer
tvec = TfidfVectorizer()

# Fits the vectorizer on the training text only, then reuses that vocabulary
# for the test text. The matrices get their own names so X_train / X_test keep
# the raw text and this cell gives the same result when it is re-run.
X_train_tfidf = tvec.fit_transform(X_train)
X_test_tfidf = tvec.transform(X_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement whenever the installed version provides it
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# Turns it into a dataframe
# NOTE(review): toarray() builds a dense (documents x vocabulary) array for the
# full data set, which can need a lot of memory — consider passing the sparse
# matrices to the model directly.
X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=feature_names)


In [None]:
# Depth-limited random forest, seeded for repeatability
rf = RandomForestClassifier(
    random_state=42,
    max_depth=5,
)

In [None]:
# Fits the random forest on the dense TF-IDF training features (clean_text)
rf.fit(X_train_df, y_train)

In [None]:
# Scores the random forest fit as (train accuracy, test accuracy)
rf.score(X_train_df, y_train), rf.score(X_test_df, y_test)

In [None]:
# Finds the cross-validation score 
# cross_val_score(rf, X_train_df, y_train).mean()

### Modeling with Random Forest Classifier with original text with stopwords removed AND undersampling the majority class

#### Modeling: Random Forest

In [66]:
# Sets up X and y from new_df
# NOTE(review): new_df is not created anywhere in the visible notebook, and this
# cell's execution count (66) is lower than those of the cells above it.
# Presumably the undersampling cell that built new_df was deleted or lives
# elsewhere — restore it so the notebook survives Restart & Run All.
X = new_df['clean_text']
y = new_df['amb_target']

In [67]:
# Stratified train/test split of the new_df data, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [68]:
# Instantiates TfidfVectorizer
tvec = TfidfVectorizer()

# Fits the vectorizer on the training text only, then reuses that vocabulary
# for the test text. The matrices get their own names so X_train / X_test keep
# the raw text and this cell gives the same result when it is re-run.
X_train_tfidf = tvec.fit_transform(X_train)
X_test_tfidf = tvec.transform(X_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement whenever the installed version provides it
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# Turns it into a dataframe
# NOTE(review): toarray() builds a dense (documents x vocabulary) array, which
# can need a lot of memory on large inputs — consider passing the sparse
# matrices to the model directly.
X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)
X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=feature_names)


In [69]:
# Depth-limited random forest, seeded for repeatability
rf = RandomForestClassifier(
    random_state=42,
    max_depth=5,
)

In [70]:
# Fits the random forest on the dense TF-IDF training features built from new_df
rf.fit(X_train_df, y_train)

RandomForestClassifier(max_depth=5, random_state=42)

In [71]:
# Scores the random forest fit as (train accuracy, test accuracy)
rf.score(X_train_df, y_train), rf.score(X_test_df, y_test)

(0.3435765673175745, 0.3418018129123759)

In [72]:
# Finds the cross-validation score 
# cross_val_score(rf, X_train_df, y_train).mean()