In [17]:
import pickle5 as pickle
import socket
import pandas as pd
import numpy as ny
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import os 
import time


import sklearn.model_selection as ms
from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as lm
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score,brier_score_loss, precision_score, recall_score,f1_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


is_rohit=socket.gethostname()=='Rohits-MacBook-Pro.local'
is_blake=socket.gethostname()=='BJH-ML-machine'
is_neddy=(is_rohit+is_blake==0)

In [3]:
if(is_neddy):
    df_business_eda = pd.read_pickle("~/Documents/yelp_datasets/df_business_eda_proto4.pickle")

if(is_rohit):
    df_business_eda = pd.read_pickle("~/Documents/yelp_datasets/df_business_eda_proto4.pickle")

if(is_blake):
    df_business_eda = pd.read_pickle("df_business_eda.pickle")

In [4]:
#ETL
categorical_cols=df_business_eda.select_dtypes(include=['object']).columns

df_business_hot=df_business_eda

for col in categorical_cols:
    dummies=pd.get_dummies(df_business_hot[col], dummy_na=True, prefix=col)
    df_business_hot=df_business_hot.\
        drop(col,axis=1).\
    merge(
        dummies,
        how='left',
        left_index=True,
        right_index=True
        )
    
df_business_hot=df_business_hot.fillna(False)

df_business_hot["above_average"] = ny.where(df_business_hot["stars"] > 3.5, 0, 1)
y=df_business_hot.rating_category
y2 = df_business_hot["above_average"]
X=df_business_hot.drop(["rating_category", "stars","above_average"], axis=1)
#Scale your data
scaler = StandardScaler()
scaler.fit(X) 
X_scaled = pd.DataFrame(scaler.transform(X),columns = X.columns)

In [5]:
#Cross Validation 
num_cv_iterations = 3
num_instances = len(y)
cv_object = ms.StratifiedShuffleSplit(n_splits=num_cv_iterations,
                         random_state=123,
                         test_size  = 0.2)

def cv_train(name,model,x,y):
    print(f"====Performing Cross Validation for {name}")
    print(f" Iteration ", end = '')
    iter_num=0
    res=pd.DataFrame()

    for train_indices, test_indices in cv_object.split(x,y):
        t = time.time()
        print(f" {iter_num},", end = '')

        X_train = x.iloc[train_indices]
        y_train = y.iloc[train_indices]

        X_test = x.iloc[test_indices]
        y_test = y.iloc[test_indices]

        model.fit(X_train,y_train)  # train object
        y_hat = model.predict(X_test) # get test set precitions

        conf = mt.confusion_matrix(y_test,y_hat)
        
        row=pd.DataFrame([iter_num],columns=['Iteration'])
        row["accuracy"]=accuracy_score(y_test, y_hat)
        row["precision"]=precision_score(y_test, y_hat, average="macro")
        row["recall"]=recall_score(y_test, y_hat, average="macro")
        row["f1"]=f1_score(y_test, y_hat, average="macro")
        row["average_seconds"]= ny.round(time.time() - t)

        res=res.append(row)
        iter_num+=1
    
    #Summarize CV Results 
    summary=res.drop("Iteration",axis=1).agg("mean").to_frame().T
    #summary.insert(0,"model",model)
    summary.insert(0,"name",name)
    print("  Cross-validation complete")

    return(summary)

In [6]:
# Trying out Balanced classweight option
log_newton_model = LogisticRegression(penalty='l2', C=1.0, class_weight="balanced", solver='newton-cg' )

In [7]:
cv_results = cv_train("log_newton",log_newton_model, X_scaled, y2)

====Performing Cross Validation for log_newton
 Iteration  0, 1, 2,  Cross-validation complete


In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 

pca_nb = Pipeline(
    [('PCA',PCA(n_components=30,svd_solver='randomized')),
     ('CLF',GaussianNB())]
)

In [9]:
from sklearn.neighbors import KNeighborsClassifier

pca_knn1 = Pipeline(
    [('PCA',PCA(n_components=100,svd_solver='randomized')),
     ('CLF',KNeighborsClassifier(n_neighbors=1))]
)

In [10]:
knn5 = KNeighborsClassifier(n_neighbors=3)

In [11]:
other_random_models = {"knn5":knn5,"pca_nb":pca_nb,"pca_knn1":pca_knn1}

In [12]:
for name,model in other_random_models.items():
    cv_results=cv_results.append(cv_train(name,model,X_scaled,y2))

====Performing Cross Validation for knn5
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for pca_nb
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for pca_knn1
 Iteration  0, 1, 2,  Cross-validation complete


In [13]:
num_estimators = 50

tree_based_models = {
    'Stump':             DecisionTreeClassifier(max_depth=1, min_samples_leaf=1),
    'Tree':              DecisionTreeClassifier(),
    'Random Trees':      RandomForestClassifier(max_depth=50, n_estimators=num_estimators),
    'Extra Random Trees': ExtraTreesClassifier(n_estimators=num_estimators,min_samples_split=2),
    'Boosted Tree':       GradientBoostingClassifier(n_estimators=num_estimators), #takes a long time
}

In [14]:
for name,model in tree_based_models.items():
    cv_results=cv_results.append(cv_train(name,model,X_scaled,y2))

====Performing Cross Validation for Stump
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Tree
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Random Trees
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Extra Random Trees
 Iteration  0, 1, 2,  Cross-validation complete
====Performing Cross Validation for Boosted Tree
 Iteration  0, 1, 2,  Cross-validation complete


In [15]:
#Neural Network
from sklearn.neural_network import MLPClassifier #no parallel processing option so slow 

nn_based_models = {
        'mlp_adam': MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
}
for name,model in nn_based_models.items():
    cv_results=cv_results.append(cv_train(name,model,X_scaled,y2))

====Performing Cross Validation for mlp_adam
 Iteration  0, 1, 2,  Cross-validation complete


In [16]:
cv_results.sort_values('accuracy',ascending=False)

Unnamed: 0,name,accuracy,precision,recall,f1,average_seconds
0,mlp_adam,0.713101,0.712967,0.712922,0.71292,84.666667
0,Random Trees,0.695231,0.695108,0.694836,0.694887,8.0
0,Boosted Tree,0.690643,0.6906,0.690089,0.690147,11.0
0,log_newton,0.686672,0.686698,0.686023,0.686067,6.333333
0,Extra Random Trees,0.679548,0.679418,0.679064,0.679113,11.0
0,knn5,0.634581,0.63441,0.634424,0.634405,17.0
0,Tree,0.628959,0.628792,0.628818,0.628796,1.0
0,Stump,0.597424,0.607836,0.59318,0.581279,0.0
0,pca_knn1,0.578186,0.578088,0.578124,0.578073,14.0
0,pca_nb,0.491107,0.569256,0.502645,0.339884,1.0


In [None]:
#GRID Search for top 3 Classification Candidates 

#Candidate 1: MLP GRID SEARCH 
mlp_parameters = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

mlp_classifier=MLPClassifier(max_iter=500)
mlp_classifier_cv=GridSearchCV(mlp_classifier,mlp_parameters,cv=2, n_jobs=-1)
mlp_classifier_cv.fit(X_scaled,y2)
print(type)
print(mlp_classifier_cv.best_params_)
print("-----")

In [None]:
#Candidate 2: 

In [None]:
#Candidate 3: 

In [25]:
mlp_classifier_model = MLPClassifier(max_iter=500,
                                     activation= 'tanh', 
                                     alpha= 0.0001, 
                                     hidden_layer_sizes= (10, 30, 10), 
                                     learning_rate= 'adaptive', 
                                     solver= 'adam')

print(cv_train("mlp_best",mlp_classifier_model, X_scaled, y2))

====Performing Cross Validation for mlp_best
 Iteration  0, 1, 2,  Cross-validation complete
       name  accuracy  precision    recall        f1  average_seconds
0  mlp_best  0.714686   0.715041  0.714436  0.714299       122.666667


# Lab 3 Overview

> You are to build upon the predictive analysis (classification) that you already completed in the previous mini-project, adding additional modeling from new classification algorithms as well as more explanations that are inline with the CRISP-DM framework. You should use appropriate cross validation for all of your analysis (explain your chosen method of performance validation in detail). Try to use as much testing data as possible in a realistic manner (you should define what you think is realistic and why).

> You must identify two tasks from the dataset to regress or classify. That is:
• Two classification tasks OR
• Two regression tasks OR
• One classification task and one regression task
For example, if your dataset was from the diabetes data you might try to predict two tasks: (1) classifying if a patient will be readmitted within a 30 day period or not, and (2) regressing what the total number of days a patient will spend in the hospital, given their history and specifics of the encounter like tests administered and previous admittance.

# Data Preparation (15 points total)

> • [10 points] Define and prepare your class variables. Use proper variable representations (int, float, one-hot, etc.). Use pre-processing methods (as needed) for dimensionality reduction, scaling, etc. Remove variables that are not needed/useful for the analysis.

> • [5 points] Describe the final dataset that is used for classification/regression (include a description of any newly formed variables you created).

# Modeling and Evaluation (70 points total)


> • [10 points] Choose and explain your evaluation metrics that you will use (i.e., accuracy,
precision, recall, F-measure, or any metric we have discussed). Why are the measure(s) appropriate for analyzing the results of your modeling? Give a detailed explanation backing up any assertions.

> • [10 points] Choose the method you will use for dividing your data into training and
testing splits (i.e., are you using Stratified 10-fold cross validation? Why?). Explain why
your chosen method is appropriate or use more than one method as appropriate. For example, if you are using time series data then you should be using continuous training and testing sets across time.

> • [20 points] Create three different classification/regression models for each task (e.g., random forest, KNN, and SVM for task one and the same or different algorithms for task two). Two modeling techniques must be new (but the third could be SVM or logistic regression). Adjust parameters as appropriate to increase generalization performance using your chosen metric. You must investigate different parameters of the algorithms!

> • [10 points] Analyze the results using your chosen method of evaluation. Use visualizations of the results to bolster the analysis. Explain any visuals and analyze why they are interesting to someone that might use this model.

> • [10 points] Discuss the advantages of each model for each classification task, if any. If there are not advantages, explain why. Is any model better than another? Is the difference significant with 95% confidence? Use proper statistical comparison methods. You must use statistical comparison techniques—be sure they are appropriate for your chosen method of validation as discussed in unit 7 of the course.

> • [10 points] Which attributes from your analysis are most important? Use proper methods discussed in class to evaluate the importance of different attributes. Discuss the results and hypothesize about why certain attributes are more important than others for a given classification task.

# Deployment (5 points total)

> How useful is your model for interested parties (i.e., the companies or organizations that might want to use it for prediction)? How would you measure the model's value if it was used by these parties? How would your deploy your model for interested parties? What other data should be collected? How often would the model need to be updated, etc.? 

# Exceptional Work (10 points total)

> You have free reign to provide additional analyses. One idea: grid search parameters in a parallelized fashion and visualize the performances across attributes. Which parameters are most significant for making a good model for each classification algorithm?