In [1]:
import pandas as pd
import numpy as ny
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import os 
import sklearn.model_selection as ms
from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as lm
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score,brier_score_loss, precision_score, recall_score,f1_score)
import time

In [2]:
# Load the business DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
business_eda_path = "~/Documents/yelp_datasets/df_business_eda_proto4.pickle"
df_business_eda = pd.read_pickle(business_eda_path)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score

In [74]:
# Number of trees grown by each ensemble model below.
num_estimators = 50
# Fixed seed so every run of the notebook builds the same trees/ensembles;
# without it the reported scores change from run to run.
model_random_state = 123

# Tree-based classifiers to compare, keyed by the name shown in the results table.
tree_based_models = {
    # Single-split decision tree (depth 1).
    'Stump':             DecisionTreeClassifier(max_depth=1, min_samples_leaf=1,
                                                random_state=model_random_state),
    # Decision tree with library defaults otherwise.
    'Tree':              DecisionTreeClassifier(random_state=model_random_state),
    'Random Trees':      RandomForestClassifier(max_depth=50, n_estimators=num_estimators,
                                                random_state=model_random_state),
    'Extra Random Trees': ExtraTreesClassifier(n_estimators=num_estimators, min_samples_split=2,
                                               random_state=model_random_state),
    'Boosted Tree':       GradientBoostingClassifier(n_estimators=num_estimators,
                                                     random_state=model_random_state), #takes a long time
}

In [71]:
#ETL
# Columns stored with object dtype are treated as categorical and one-hot encoded.
categorical_cols = df_business_eda.select_dtypes(include=['object']).columns

# Encode all categorical columns in a single call. Each encoded column is
# replaced by indicator columns prefixed with its name (plus a "<col>_nan"
# indicator because dummy_na=True). Doing this in one call avoids the
# per-column index merge, which multiplies rows when the index is not unique
# and rebuilds the whole frame once per column.
df_business_hot = pd.get_dummies(
    df_business_eda,
    columns=list(categorical_cols),
    dummy_na=True,
)

# Fill any remaining missing values with False.
df_business_hot = df_business_hot.fillna(False)

# Binary target: 1 when the business has more than 3.5 stars, otherwise 0.
# (The flag was previously inverted: 0 marked the above-average businesses.)
df_business_hot["above_average"] = np.where(df_business_hot["stars"] > 3.5, 1, 0)

# Targets: the existing rating category and the binary above-average flag.
y = df_business_hot.rating_category
y2 = df_business_hot["above_average"]

# Features: everything except the targets and the raw star rating they derive from.
X = df_business_hot.drop(["rating_category", "stars", "above_average"], axis=1)

#Scale your data
# NOTE(review): the scaler is fitted on every row before cross-validation, so
# test-fold statistics leak into training; consider scaling inside a Pipeline.
scaler = StandardScaler()
# Keep X's index so X_scaled stays label-aligned with y / y2.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)


In [29]:

# Cross-validation configuration.
num_cv_iterations = 3   # number of shuffle-split repetitions
num_instances = len(y)  # total number of labelled rows

# Seeded stratified shuffle splitter: each repetition holds out 20% of the rows for testing.
cv_object = ms.StratifiedShuffleSplit(
    test_size=0.2,
    n_splits=num_cv_iterations,
    random_state=123,
)

In [41]:
def cv_train(name,model,x,y,cv=None):
    """Cross-validate ``model`` and return its mean scores as a one-row DataFrame.

    For every train/test split the model is refitted on the training rows and
    scored on the test rows; per-split scores are then averaged.

    Parameters
    ----------
    name : str
        Label stored in the ``name`` column of the result and printed in the
        progress messages.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every split.
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, positionally aligned with ``x``.
    cv : splitter, optional
        Object exposing ``split(x, y)`` that yields ``(train_idx, test_idx)``
        pairs. Defaults to the module-level ``cv_object``.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``name``, ``accuracy``, ``precision``,
        ``recall``, ``f1`` (precision/recall/f1 are macro-averaged) and
        ``average_seconds`` (mean of the per-split wall time, each rounded to
        whole seconds).
    """
    # Resolve the splitter at call time so the notebook-level default is still honoured.
    splitter = cv_object if cv is None else cv

    print(f"====Performing Cross Validation for {name}")
    print(" Iteration ", end = '')

    # One dict of scores per split; the frame is built once after the loop
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
    fold_scores = []
    for iter_num, (train_indices, test_indices) in enumerate(splitter.split(x, y)):
        started = time.perf_counter()
        print(f" {iter_num},", end = '')

        X_train, y_train = x.iloc[train_indices], y.iloc[train_indices]
        X_test, y_test = x.iloc[test_indices], y.iloc[test_indices]

        model.fit(X_train, y_train)    # train on this split
        y_hat = model.predict(X_test)  # test-set predictions

        fold_scores.append({
            "accuracy": accuracy_score(y_test, y_hat),
            "precision": precision_score(y_test, y_hat, average="macro"),
            "recall": recall_score(y_test, y_hat, average="macro"),
            "f1": f1_score(y_test, y_hat, average="macro"),
            "average_seconds": np.round(time.perf_counter() - started),
        })

    #Summarize CV Results
    summary = pd.DataFrame(fold_scores).mean().to_frame().T
    summary.insert(0, "name", name)
    print("  Cross-validation complete")

    return summary

In [14]:
# L2-regularised logistic regression fitted with the newton-cg solver, using
# the "balanced" class-weight option.
log_newton_model = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    C=1.0,
    class_weight="balanced",
)

In [73]:
# Start the results table with the logistic-regression run on the binary above-average target.
cv_results = cv_train(
    name="log_newton",
    model=log_newton_model,
    x=X_scaled,
    y=y2,
)

====Performing Cross Validation for log_newton
 Iteration  0, 1, 2,  Cross-validation complete


In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 

# Gaussian Naive Bayes on the first 30 principal components.
# The randomized SVD solver is stochastic, so it is seeded to make the
# cross-validation scores repeatable between runs.
pca_nb = Pipeline(
    [('PCA',PCA(n_components=30,svd_solver='randomized',random_state=123)),
     ('CLF',GaussianNB())]
)

In [46]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier on the first 100 principal components.
# The randomized SVD solver is stochastic, so it is seeded to make the
# cross-validation scores repeatable between runs.
pca_knn1 = Pipeline(
    [('PCA',PCA(n_components=100,svd_solver='randomized',random_state=123)),
     ('CLF',KNeighborsClassifier(n_neighbors=1))]
)

In [49]:
# k-nearest-neighbours on the raw (un-projected) features.
# The variable name and its "knn5" results label both say 5 neighbours, but the
# model was built with n_neighbors=3; the parameter now matches the name.
# NOTE(review): if 3 neighbours was the intended setting, rename instead.
knn5 = KNeighborsClassifier(n_neighbors=5)

In [80]:
# Non-tree models to cross-validate, keyed by the name shown in the results table.
# Only the PCA + Naive Bayes pipeline is enabled; "knn5" and "pca_knn1" are
# defined in this notebook but left out of this run.
other_random_models = dict(pca_nb=pca_nb)

In [81]:
# Cross-validate each extra model and add its summary row to the results table.
# DataFrame.append was removed in pandas 2.0, so the rows are collected first
# and concatenated once; ignore_index gives every row a unique label.
# NOTE: re-running this cell adds the same models to cv_results again.
other_model_rows = []
for name,model in other_random_models.items():
    other_model_rows.append(cv_train(name,model,X_scaled,y2))
cv_results = pd.concat([cv_results, *other_model_rows], ignore_index=True)

====Performing Cross Validation for pca_nb
 Iteration  0, 1, 2,  Cross-validation complete


In [82]:
# Preview the first five rows of the results collected so far.
cv_results.head(n=5)

Unnamed: 0,name,accuracy,precision,recall,f1,average_seconds
0,log_newton,0.686672,0.686698,0.686023,0.686067,8.333333
0,pca_nb,0.490857,0.569801,0.502416,0.338806,1.0


In [83]:
# Cross-validate each tree-based model and add its summary row to the results table.
# DataFrame.append was removed in pandas 2.0, so the rows are collected first
# and concatenated once; ignore_index gives every row a unique label.
# NOTE: re-running this cell adds the same models to cv_results again.
tree_model_rows = []
for name,model in tree_based_models.items():
    tree_model_rows.append(cv_train(name,model,X_scaled,y2))
cv_results = pd.concat([cv_results, *tree_model_rows], ignore_index=True)

====Performing Cross Validation for Stump
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Tree
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Random Trees
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Extra Random Trees
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Boosted Tree
 Iteration  0, 1, 2,  Cross-validation complete


In [86]:
# Rank the models from lowest to highest mean accuracy.
cv_results.sort_values(by="accuracy", ascending=True)

Unnamed: 0,name,accuracy,precision,recall,f1,average_seconds
0,pca_nb,0.490857,0.569801,0.502416,0.338806,1.0
0,Stump,0.597424,0.607836,0.59318,0.581279,0.0
0,Tree,0.631912,0.631768,0.631805,0.63177,1.0
0,Extra Random Trees,0.67848,0.678364,0.677962,0.67801,12.0
0,log_newton,0.686672,0.686698,0.686023,0.686067,8.333333
0,Boosted Tree,0.690643,0.6906,0.690089,0.690147,12.333333
0,Random Trees,0.695899,0.695809,0.695456,0.695511,9.0
