# Create Models	50	

Create a logistic regression model and a support vector machine model for the classification task involved with your dataset. Assess how well each model performs (use 80/20 training/testing split for your data). Adjust parameters of the models to make them more accurate. If your dataset size requires the use of stochastic gradient descent, then linear kernel only is fine to use. That is, the SGDClassifier is fine to use for optimizing logistic regression and linear support vector machines. For many problems, SGD will be required in order to train the SVM model in a reasonable timeframe. 

In [1]:
import pandas as pd
import numpy as ny
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import os 
import sklearn.model_selection as ms
from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as lm
from sklearn import metrics as mt

from sklearn.metrics import (accuracy_score,brier_score_loss, precision_score, recall_score,
                             f1_score)

import time

In [2]:
!pip3 install pickle5
import socket
import pickle5 as pickle

is_rohit=socket.gethostname()=='Rohits-MacBook-Pro.local'
is_blake=socket.gethostname()=='BJH-ML-machine'

if(is_rohit):
    with open('~/Documents/yelp_datasets/df_business_eda.pickle', "rb") as f:
      pick_data = pickle.load(f)
      pick_data.to_pickle('~/Documents/yelp_datasets/df_business_eda_proto4.pickle')

    df_business_eda = pd.read_pickle("~/Documents/yelp_datasets/df_business_eda_proto4.pickle")
    
if(is_blake):
    df_business_eda = pd.read_pickle("df_business_eda.pickle")

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
df_business_eda

Unnamed: 0,rating_category,stars,Beauty & Spas,BusinessAcceptsCreditCards,Restaurants,is_open,parking_lot,city_state,zip3,RestaurantsPriceRange2,text,checkin_count,useful_count,cool_count,funny_count,review_count,review_word_count
1,2,5.0,False,False,False,True,False,Scottsdale_AZ,852,,,9.0,4,1,2,4,121.000000
3,0,2.5,False,True,False,False,True,North Las Vegas_NV,890,4.0,,3.0,1,0,0,3,26.666667
4,2,4.5,False,True,False,True,False,Mesa_AZ,852,,,1.0,11,3,6,26,86.962963
5,2,4.5,False,True,False,True,False,Gilbert_AZ,852,,4.0,39.0,25,2,6,38,81.690476
6,1,3.5,False,True,False,True,True,Las Vegas_NV,891,1.0,22.0,328.0,50,22,18,81,111.097561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209380,0,2.0,False,True,True,True,True,Phoenix_AZ,850,2.0,32.0,253.0,43,27,26,106,103.944954
209382,2,4.5,False,False,False,True,False,Las Vegas_NV,891,,9.0,168.0,73,30,10,124,143.593985
209384,2,5.0,False,True,False,True,False,Tempe _AZ,852,,,9.0,1,0,0,5,65.200000
209386,2,5.0,False,True,False,True,False,Las Vegas_NV,891,,11.0,8.0,89,49,52,217,100.977679


In [4]:
categorical_cols=df_business_eda.select_dtypes(include=['object']).columns

df_business_hot=df_business_eda

for col in categorical_cols:
    dummies=pd.get_dummies(df_business_hot[col], dummy_na=True, prefix=col)
    df_business_hot=df_business_hot.\
        drop(col,axis=1).\
    merge(
        dummies,
        how='left',
        left_index=True,
        right_index=True
        )
    
df_business_hot=df_business_hot.fillna(False)
df_business_hot.head()

Unnamed: 0,rating_category,stars,Beauty & Spas,BusinessAcceptsCreditCards,Restaurants,is_open,parking_lot,RestaurantsPriceRange2,text,checkin_count,...,zip3_928,zip3_930,zip3_940,zip3_952,zip3_953,zip3_959,zip3_967,zip3_981,zip3_nan,zip3_nan.1
1,2,5.0,False,False,False,True,False,False,False,9,...,0,0,0,0,0,0,0,0,0,0
3,0,2.5,False,True,False,False,True,4,False,3,...,0,0,0,0,0,0,0,0,0,0
4,2,4.5,False,True,False,True,False,False,False,1,...,0,0,0,0,0,0,0,0,0,0
5,2,4.5,False,True,False,True,False,False,4,39,...,0,0,0,0,0,0,0,0,0,0
6,1,3.5,False,True,False,True,True,1,22,328,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_business_hot["above_average"] = ny.where(df_business_hot["stars"] > 3.5, 0, 1)
y=df_business_hot.rating_category
y2 = df_business_hot["above_average"]
X=df_business_hot.drop(["rating_category", "stars","above_average"], axis=1)

In [6]:
num_cv_iterations = 5
num_instances = len(y)
cv_object = ms.ShuffleSplit(n_splits=num_cv_iterations,
                         random_state=123,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=5, random_state=123, test_size=0.2, train_size=None)


In [7]:
def cv_train(name,model,x,y):
    print(f"====Performing Cross Validation for {name}")
    print(f" Iteration ", end = '')


    iter_num=0
    
    res=pd.DataFrame()

    for train_indices, test_indices in cv_object.split(x,y):
        t = time.time()
        print(f" {iter_num},", end = '')

        X_train = x.iloc[train_indices]
        y_train = y.iloc[train_indices]

        X_test = x.iloc[test_indices]
        y_test = y.iloc[test_indices]

        model.fit(X_train,y_train)  # train object
        y_hat = model.predict(X_test) # get test set precitions

        conf = mt.confusion_matrix(y_test,y_hat)
        
        row=pd.DataFrame([iter_num],columns=['Iteration'])
        row["accuracy"]=accuracy_score(y_test, y_hat)
        row["precision"]=precision_score(y_test, y_hat, average="macro")
        row["recall"]=recall_score(y_test, y_hat, average="macro")
        row["f1"]=f1_score(y_test, y_hat, average="macro")
        row["elapsed_time"]= ny.round(time.time() - t)

        res=res.append(row)
        iter_num+=1
    
    #Summarize CV Results 
    summary=res.drop("Iteration",axis=1).agg("mean").to_frame().T
    summary.insert(0,"model",model)
    summary.insert(0,"name",name)
    print("  Cross-validation complete")

    return(summary)

# Logistic Regression

## 3 Classes

In [8]:
linear_model = LogisticRegression(penalty='l2', C=1.0, class_weight=None, solver='liblinear' )

Tried all solvers (‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’), liblinear --> gives the best accuracy

In [9]:
cv_train("linear_model",linear_model, X, y)

====Performing Cross Validation for linear_model
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


Unnamed: 0,name,model,accuracy,precision,recall,f1,elapsed_time
0,linear_model,LogisticRegression(solver='liblinear'),0.582411,0.541641,0.473398,0.472385,5.6


## 2 Class

In [10]:
cv_train("linear_model_binary",linear_model, X, y2)

====Performing Cross Validation for linear_model_binary
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


Unnamed: 0,name,model,accuracy,precision,recall,f1,elapsed_time
0,linear_model_binary,LogisticRegression(solver='liblinear'),0.690039,0.691012,0.688876,0.688678,3.2


## Logistic Regression - Stoichastic Gradient Descent 

## 3 Class

In [11]:
log_model = lm.SGDClassifier(n_jobs=-1, loss="log")

## 3 class

In [12]:
cv_train("sgd_logistic",log_model, X, y)

====Performing Cross Validation for sgd_logistic
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


Unnamed: 0,name,model,accuracy,precision,recall,f1,elapsed_time
0,sgd_logistic,"SGDClassifier(loss='log', n_jobs=-1)",0.515737,0.415606,0.437295,0.392106,3.0


## 2 class

In [13]:
cv_train("sgd_logistic_binary",log_model, X, y2)

====Performing Cross Validation for sgd_logistic_binary
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


Unnamed: 0,name,model,accuracy,precision,recall,f1,elapsed_time
0,sgd_logistic_binary,"SGDClassifier(loss='log', n_jobs=-1)",0.620362,0.625082,0.619219,0.614774,2.0


# Support Vector Machine

In [14]:
svm_model = lm.SGDClassifier(n_jobs=-1, loss="hinge")

## 3 Class

In [15]:
cv_train("sgd_svm",svm_model, X, y)

====Performing Cross Validation for sgd_svm
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


Unnamed: 0,name,model,accuracy,precision,recall,f1,elapsed_time
0,sgd_svm,SGDClassifier(n_jobs=-1),0.507118,0.459914,0.455165,0.422441,2.4


## 2 Class

In [16]:
cv_train("sgd_svm_binary",svm_model, X, y2)

====Performing Cross Validation for sgd_svm_binary
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete


Unnamed: 0,name,model,accuracy,precision,recall,f1,elapsed_time
0,sgd_svm_binary,SGDClassifier(n_jobs=-1),0.63623,0.662909,0.634812,0.615764,2.0


# Compare Competing Models

In [None]:
cv_train("linear_logistic",linear_model, X, y).\
    append(cv_train("linear_logistic_binary",linear_model, X, y2)).\
    append(cv_train("sgd_logistic",log_model, X, y)).\
    append(cv_train("sgd_logistic_binary",log_model, X, y2)).\
    append(cv_train("sgd_svm",svm_model, X, y)).\
    append(cv_train("sgd_svm_binary",svm_model, X, y2))

====Performing Cross Validation for linear_logistic
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete
====Performing Cross Validation for linear_logistic_binary
 Iteration  0, 1, 2, 3, 4,  Cross-validation complete
====Performing Cross Validation for sgd_logistic
 Iteration  0,

# Model Advantages	10

Discuss the advantages of each model for each classification task. Does one type of model offer superior performance over another in terms of prediction accuracy? In terms of training time or efficiency? Explain in detail.

# Interpret Feature Importance	30

Use the weights from logistic regression to interpret the importance of different features for the classification task. Explain your interpretation in detail. Why do you think some variables are more important?

# Interpret Support Vectors	10

Look at the chosen support vectors for the classification task. Do these provide any insight into the data? Explain. If you used stochastic gradient descent (and therefore did not explicitly solve for support vectors), try subsampling your data to train the SVC model— then analyze the support vectors from the subsampled dataset.