### MODEL EVALUATION WITH OPTUNA

The Optuna searches were run separately because they are computationally expensive.

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import optuna

from sklearn.linear_model import Lasso, Ridge, LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, SCORERS, balanced_accuracy_score
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, SCORERS, balanced_accuracy_score, plot_confusion_matrix, classification_report



# Notebook-wide pandas display settings.
# The fully qualified option name is used because the bare 'precision' pattern is
# ambiguous on pandas >= 1.4 (it also matches 'styler.format.precision') and raises
# OptionError there; 'display.precision' works on old and new versions alike.
pd.set_option('display.precision', 4)
pd.options.display.max_seq_items = None
pd.options.display.max_columns = 50

In [3]:
# Load the feature frame and the target from pickles; the paths are relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files produced by this project.
X = pd.read_pickle('PKL/X_train.pkl')
y = pd.read_pickle('PKL/y_train.pkl')

In [4]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13, test_size = 0.2)

### Get Dummies

In [5]:
# One-hot encode the categorical columns of the training split.
X_train_ohe = pd.get_dummies(X_train)
# Encoding the test split on its own can yield a different column set (a category may
# appear in only one split). Align it to the training columns: dummies missing from the
# test split are added as all-zero columns, and test-only dummies are dropped, so the
# scaler and the models always see the same feature layout.
X_test_ohe = pd.get_dummies(X_test).reindex(columns = X_train_ohe.columns, fill_value = 0)

In [6]:
# Encoded columns present in the training split but missing from the test split.
# The comparison must use the one-hot encoded frames: X_train and X_test are two
# splits of the same frame, so their raw columns are identical by construction.
[col for col in X_train_ohe.columns if col not in X_test_ohe.columns]

[]

In [7]:
# Encoded columns present in the test split but missing from the training split.
# The comparison must use the one-hot encoded frames: X_train and X_test are two
# splits of the same frame, so their raw columns are identical by construction.
[col for col in X_test_ohe.columns if col not in X_train_ohe.columns]

[]

In [8]:
# Standardise the encoded features. The scaler is fitted on the training frame only and
# the same fitted transform is then applied to the test frame.
scale = StandardScaler()
# NOTE: X_train / X_test are rebound here to the scaler output for the encoded frames,
# so from this cell on they no longer refer to the frames produced by the split.
X_train = scale.fit_transform(X_train_ohe)
X_test = scale.transform(X_test_ohe)

In [9]:
# Oversample the training data with SMOTE; the test split is left untouched.
# random_state makes the synthetic samples repeatable across kernel restarts.
smote = SMOTE(random_state = 13)
# fit_resample is the supported method name; the fit_sample alias used previously
# is no longer available in recent imbalanced-learn releases.
X_train, y_train = smote.fit_resample(X_train, y_train)

### KNN OPTUNA

First, I start with a simple KNN classifier.
Then I use Optuna to search for the best-performing hyperparameters.

In [516]:
def knn_objective(trial):
    """Optuna objective for a K-nearest-neighbours classifier.

    Samples one hyperparameter combination from ``trial``, fits the classifier on the
    notebook-level ``X_train`` / ``y_train`` and scores its predictions for ``X_test``
    against ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``n_neighbors``, ``p``, ``leaf_size`` and ``algorithm``.

    Returns
    -------
    float
        ``1 - weighted F1`` on the test split, so a lower value is better.

    Notes
    -----
    NOTE(review): the score is computed on the same split that is later used for
    reporting, so the reported metrics are optimistic. Consider scoring on a
    validation split or with cross-validation on the training data instead.
    """
    knn_neighbors = trial.suggest_int('n_neighbors', 1, 10)
    knn_p = trial.suggest_categorical('p', [1, 2])
    knn_leaf_size = trial.suggest_int('leaf_size', 2, 50)
    knn_algorithm = trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree'])

    knn = KNeighborsClassifier(n_neighbors = knn_neighbors,
                               p = knn_p,
                               leaf_size = knn_leaf_size,
                               algorithm = knn_algorithm)

    # The KFold splitter that used to be built here was never passed to anything,
    # so it has been removed; the model is fitted once per trial.
    knn.fit(X_train, y_train)
    y_pred_optuna = knn.predict(X_test)
    return (1 - f1_score(y_test, y_pred_optuna, average = 'weighted'))

In [None]:
# Create the study for the KNN search with Optuna's default settings. The objective
# returns 1 - weighted F1, and the trial log below reports the lowest value as "Best".
study = optuna.create_study()

In [None]:
# Run 100 trials of knn_objective. The timestamps in the log below show that a single
# trial can take many minutes, so this cell is expensive to re-run.
study.optimize(knn_objective, n_trials = 100)

[I 2020-08-20 08:10:02,743] Trial 0 finished with value: 0.2525007574872148 and parameters: {'n_neighbors': 6, 'p': 1, 'leaf_size': 38, 'algorithm': 'kd_tree'}. Best is trial 0 with value: 0.2525007574872148.
[I 2020-08-20 08:29:52,961] Trial 1 finished with value: 0.26217077385548193 and parameters: {'n_neighbors': 10, 'p': 1, 'leaf_size': 18, 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.2525007574872148.
[I 2020-08-20 09:02:01,508] Trial 2 finished with value: 0.24328000217342982 and parameters: {'n_neighbors': 2, 'p': 1, 'leaf_size': 6, 'algorithm': 'ball_tree'}. Best is trial 2 with value: 0.24328000217342982.
[I 2020-08-20 09:14:35,998] Trial 3 finished with value: 0.2640357918244377 and parameters: {'n_neighbors': 9, 'p': 1, 'leaf_size': 30, 'algorithm': 'kd_tree'}. Best is trial 2 with value: 0.24328000217342982.
[I 2020-08-20 09:19:49,937] Trial 4 finished with value: 0.2736111234120412 and parameters: {'n_neighbors': 5, 'p': 2, 'leaf_size': 50, 'algorithm': 'kd_tre

### DECISION TREE OPTUNA

### Decision Tree with Optuna

In [494]:
def dtc_objective(trial):
    """Optuna objective for a decision-tree classifier.

    Samples one hyperparameter combination from ``trial``, fits the tree on the
    notebook-level ``X_train`` / ``y_train`` and scores its predictions for ``X_test``
    against ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``criterion``, ``splitter``, ``max_depth`` and
        ``max_features``.

    Returns
    -------
    float
        ``1 - weighted F1`` on the test split, so a lower value is better.

    Notes
    -----
    NOTE(review): the score is computed on the same split that is later used for
    reporting, so the reported metrics are optimistic.
    """
    dtc_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    # Register the parameter under its real name so the study's best_params can be
    # passed straight back to DecisionTreeClassifier (it was recorded as 'n_neighbors').
    dtc_splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    # Fixed the misspelled method name, which raised AttributeError on the first trial.
    dtc_maxdepth = trial.suggest_int('max_depth', 1, 10)
    # max_features is a fraction of the columns here, so it needs a float suggestion;
    # suggest_int is only meant for integer ranges.
    dtc_maxfeatures = trial.suggest_float('max_features', .01, .7)

    # random_state keeps the feature sub-sampling and the random splitter repeatable,
    # matching the seeded random forest used elsewhere in this notebook.
    dtc = DecisionTreeClassifier(criterion = dtc_criterion,
                                 splitter = dtc_splitter,
                                 max_depth = dtc_maxdepth,
                                 max_features = dtc_maxfeatures,
                                 class_weight = 'balanced',
                                 random_state = 1)

    dtc.fit(X_train, y_train)
    y_pred_dtc = dtc.predict(X_test)
    return (1 - f1_score(y_test, y_pred_dtc, average = 'weighted'))

In [None]:
# Create the study for the decision-tree search with Optuna's default settings.
# NOTE: this rebinds `study`; any study previously bound to that name is no longer reachable through it.
study = optuna.create_study()

In [None]:
# Run 100 trials of dtc_objective on the study currently bound to `study`.
study.optimize(dtc_objective, n_trials = 100)

### RANDOM FOREST CLASSIFICATION OPTUNA

In [20]:
def objective(trial):
    """Optuna objective for the random-forest classifier.

    Draws ``max_depth``, ``n_estimators``, ``max_features`` and ``criterion`` from
    ``trial``, trains a class-weighted forest on the notebook-level ``X_train`` /
    ``y_train`` and evaluates it on ``X_test`` / ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the sampled hyperparameters.

    Returns
    -------
    float
        ``1 - weighted F1`` of the test-split predictions (lower is better).
    """
    # Sampled in the same order as before; the keys double as estimator keyword names.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'n_estimators': trial.suggest_int('n_estimators', 300, 700),
        'max_features': trial.suggest_loguniform('max_features', .001, .2),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }

    forest = RandomForestClassifier(random_state = 1, class_weight = 'balanced', **sampled)
    forest.fit(X_train, y_train)

    weighted_f1 = f1_score(y_test, forest.predict(X_test), average = 'weighted')
    return 1 - weighted_f1

In [21]:
# Create the study for the random-forest search with Optuna's default settings.
# NOTE: this rebinds `study`; any study previously bound to that name is no longer reachable through it.
study = optuna.create_study()

In [None]:
# Run 100 trials of the random-forest objective on the study currently bound to `study`.
study.optimize(objective, n_trials = 100)

## After the trials: refit Random Forest with the selected hyperparameters

In [10]:
# Random forest with hard-coded hyperparameters.
# NOTE(review): these values are presumably copied from the best trial of an Optuna run
# performed outside this notebook - confirm which study they come from.
rfcOptunaLinux = RandomForestClassifier(random_state = 1, 
                                   n_estimators = 418, 
                                   max_depth = 10, 
                                   max_features = .13154680725540335, 
                                   class_weight = 'balanced', 
                                   criterion = 'gini')

In [11]:
# Fit on the current X_train / y_train and predict labels for the held-out X_test.
rfcOptunaLinux.fit(X_train, y_train)
y_pred_rfcOptunaLinux = rfcOptunaLinux.predict(X_test)

In [12]:
# Report weighted F1 and balanced accuracy for the held-out split.
# The second metric is balanced_accuracy_score, not plain accuracy, so it is labelled
# accordingly; otherwise it appears to contradict the accuracy row of the
# classification report.
print('F1: ', f1_score(y_test, y_pred_rfcOptunaLinux, average = 'weighted'))
print('Balanced accuracy: ', balanced_accuracy_score(y_test, y_pred_rfcOptunaLinux))

F1:  0.7349713159724476
Accuracy:  0.6796134396559133


In [15]:
# Per-class precision, recall and F1 of the first forest's predictions on the held-out split.
print(classification_report(y_test, y_pred_rfcOptunaLinux))

                         precision    recall  f1-score   support

             functional       0.78      0.80      0.79      4822
functional needs repair       0.28      0.61      0.38       678
         non functional       0.87      0.63      0.73      3410

               accuracy                           0.72      8910
              macro avg       0.64      0.68      0.63      8910
           weighted avg       0.77      0.72      0.73      8910



In [16]:
# Persist the fitted model. The context manager closes the file even when
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('PKL/rfcOptunaLinux.pkl', 'wb') as mod:
    pickle.dump(rfcOptunaLinux, mod)

In [17]:
# Second random forest with hard-coded hyperparameters; n_jobs = -1 lets it use all CPU cores.
# NOTE(review): these values are presumably copied from another Optuna trial run outside
# this notebook - confirm which study they come from.
rfcOptunaLinux2 = RandomForestClassifier(random_state = 1, 
                                   n_estimators = 562, 
                                   max_depth = 10, 
                                   max_features = 0.073233610990557501, 
                                   class_weight = 'balanced', 
                                   criterion = 'gini', 
                                         n_jobs = -1)

In [18]:
# Fit on the current X_train / y_train and predict labels for the held-out X_test.
rfcOptunaLinux2.fit(X_train, y_train)
y_pred_rfcOptunaLinux2 = rfcOptunaLinux2.predict(X_test)

In [19]:
# Report weighted F1 and balanced accuracy for the held-out split.
# The second metric is balanced_accuracy_score, not plain accuracy, so it is labelled
# accordingly; otherwise it appears to contradict the accuracy row of the
# classification report.
print('F1: ', f1_score(y_test, y_pred_rfcOptunaLinux2, average = 'weighted'))
print('Balanced accuracy: ', balanced_accuracy_score(y_test, y_pred_rfcOptunaLinux2))

F1:  0.7344743059151204
Accuracy:  0.684361800500234


In [20]:
# Per-class precision, recall and F1 of the second forest's predictions on the held-out split.
print(classification_report(y_test, y_pred_rfcOptunaLinux2))

                         precision    recall  f1-score   support

             functional       0.79      0.78      0.78      4822
functional needs repair       0.28      0.62      0.38       678
         non functional       0.84      0.65      0.74      3410

               accuracy                           0.72      8910
              macro avg       0.64      0.68      0.63      8910
           weighted avg       0.77      0.72      0.73      8910



In [21]:
# Persist the fitted model. The context manager closes the file even when
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('PKL/rfcOptunaLinux2.pkl', 'wb') as mod:
    pickle.dump(rfcOptunaLinux2, mod)