### Tanzinian Water Pump Classification ###

### Model Evaluation


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import optuna

from sklearn.linear_model import Lasso, Ridge, LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, SCORERS, balanced_accuracy_score


pd.set_option('precision', 4)
pd.options.display.max_seq_items = None
pd.options.display.max_columns = 50

### Read in Testing and Training Data From Cleaning and EDA


In [2]:
X_train = pd.read_pickle('PKL/X_train.pkl')
y_train = pd.read_pickle('PKL/y_train.pkl')
X_test = pd.read_pickle('PKL/X_test.pkl')
y_test = pd.read_pickle('PKL/y_test.pkl')

In [3]:
X_train. urban_lga = X_train. urban_lga.astype(int)
X_test. urban_lga = X_test. urban_lga.astype(int)

In [4]:
X_train.rural_lga = X_train.rural_lga.astype(int)
X_test.rural_lga = X_test.rural_lga.astype(int)

In [5]:
X_train.urban_wards = X_train.urban_wards.astype(int)
X_test.urban_wards = X_test.urban_wards.astype(int)

In [6]:
X_train.rural_wards = X_train.rural_wards.astype(int)
X_test.rural_wards = X_test.rural_wards.astype(int)

In [7]:
X_train.public_meeting = X_train.public_meeting.astype(int)
X_test.public_meeting = X_test.public_meeting.astype(int)

In [8]:
X_train.permit = X_train.permit.astype(int)
X_test.permit = X_test.permit.astype(int)

### Get Dummies for Catergorical Variables

In [9]:

X_train_ohe = pd.get_dummies(X_train)
X_test_ohe = pd.get_dummies(X_test)

In [10]:
[x for x in X_train.columns if x not in X_test.columns]

[]

In [11]:
[x for x in X_test.columns if x not in X_train.columns]

[]

### Scale and resample the data
#### Made X_train and X_test simple variables. 


In [12]:
scale = StandardScaler()
X_train = scale.fit_transform(X_train_ohe)
X_test = scale.transform(X_test_ohe)

In [13]:
smote = SMOTE()
X_train, y_train = smote.fit_sample(X_train, y_train) 

### LogReg with Lasso 

In [14]:
LogRegLasso = LogisticRegression(penalty = 'l1', 
                                 tol = 0.0001, 
                                 C = 1, 
                                 solver='liblinear', 
                                 class_weight = 'balanced', 
                                 max_iter = 300)

In [None]:
LogRegLasso.fit(X_train, y_train)
y_pred_lasso = LogRegLasso.predict(X_test)

In [None]:
print('F1: ', f1_score(y_test, y_pred_lasso, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_lasso))

In [None]:
mod = open('PKL/LogRegLasso.pkl', 'wb')
pickle.dump(LogRegLasso, mod)
mod.close()

### Logistic Regression

In [None]:
LogReg = LogisticRegression(solver='liblinear',
                            class_weight = 'balanced', 
                            max_iter = 200)

LogReg.fit(X_train, y_train)
y_pred_logreg = LogReg.predict(X_test)

print('F1: ', f1_score(y_test, y_pred_logreg, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_logreg))

In [None]:
LogReg.summary()

In [None]:
mod = open('PKL/LogReg.pkl', 'wb')
pickle.dump(LogReg, mod)
mod.close()

### KNN 
First I will start with a simple KNN. 
Then we will use Optuna to run and attempt to find the best model. 

### Simple KNN

In [16]:
knnSimple = KNeighborsClassifier(n_neighbors = 5, 
                                 p = 2, 
                                 n_jobs = -1)

In [None]:
knnSimple.fit(X_train, y_train)
y_pred_knnSimple = knnSimple.predict(X_test)

In [None]:
print('F1: ', f1_score(y_test, y_pred_knnSimple, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_knnSimple))

Save Knn Model

In [None]:
mod = open('PKL/knnSimple.pkl', 'wb')
pickle.dump(knnSimple, mod)
mod.close()

### Running KNN with Optuna

In [516]:
def knn_objective(trial): 
    knn_neighbors = trial.suggest_int('n_neighbors', 1,10) 
    knn_p = trial.suggest_categorical('p', [1, 2])
    knn_leaf_size = trial.suggest_int('leaf_size', 2, 50)
    knn_algorithm = trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree'])
    
    knn = KNeighborsClassifier(n_neighbors = knn_neighbors, 
                               p = knn_p, 
                               leaf_size = knn_leaf_size,
                                algorithm =  knn_algorithm)
    cv = KFold(n_splits = 5, shuffle = True, random_state = 20)
    
    knn.fit(X_train, y_train)
    y_pred_optuna = knn.predict(X_test)
    return (1 - f1_score(y_test, y_pred_optuna, average = 'weighted'))

In [None]:
study = optuna.create_study()

In [None]:
study.optimize(knn_objective, n_trials = 100)

[I 2020-08-20 08:10:02,743] Trial 0 finished with value: 0.2525007574872148 and parameters: {'n_neighbors': 6, 'p': 1, 'leaf_size': 38, 'algorithm': 'kd_tree'}. Best is trial 0 with value: 0.2525007574872148.
[I 2020-08-20 08:29:52,961] Trial 1 finished with value: 0.26217077385548193 and parameters: {'n_neighbors': 10, 'p': 1, 'leaf_size': 18, 'algorithm': 'ball_tree'}. Best is trial 0 with value: 0.2525007574872148.
[I 2020-08-20 09:02:01,508] Trial 2 finished with value: 0.24328000217342982 and parameters: {'n_neighbors': 2, 'p': 1, 'leaf_size': 6, 'algorithm': 'ball_tree'}. Best is trial 2 with value: 0.24328000217342982.
[I 2020-08-20 09:14:35,998] Trial 3 finished with value: 0.2640357918244377 and parameters: {'n_neighbors': 9, 'p': 1, 'leaf_size': 30, 'algorithm': 'kd_tree'}. Best is trial 2 with value: 0.24328000217342982.
[I 2020-08-20 09:19:49,937] Trial 4 finished with value: 0.2736111234120412 and parameters: {'n_neighbors': 5, 'p': 2, 'leaf_size': 50, 'algorithm': 'kd_tre

### Optuna Model Findings

In [None]:
#Best is trial 2 of 6 with value: 0.24328000217342982

knnOptuna = KNeighborsClassifier(n_neighbors = 2, 
                                 p = 1, 
                                 leaf_size = 6, 
                                 algorithm = 'kd_tree', 
                                 n_jobs = -1)
knnOptuna.fit(X_train, y_train)
y_pred_knnOptuna = knnOptuna.predict(X_test)
print('F1: ', f1_score(y_test, y_pred_knnOptuna, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_knnOptuna))

In [None]:
mod = open('PKL/knnOptuna.pkl', 'wb')
pickle.dump(knnOptuna, mod)
mod.close()

### Simple Decision Tree

#### Starting with a few simple decision trees, then expant to Optuna to find a better model. 

In [74]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtcSimple = DecissionTreeClassifer()
dtcSimple.fit(X_train, y_train)
y_pred_dtcSimple = dtcSimple.predict(X_test)
print('F1: ', f1_score(y_test, y_pred_dtcSimple, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_dtcSimple))

In [None]:
mod = open('PKL/dtcSimple.pkl', 'wb')
pickle.dump(dtcSimple, mod)
mod.close()

### Decision Tree with Optuna

In [494]:
def dtc_objective(trial): 
    dtc_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    dtc_splitter = trial.suggest_categorical('n_neighbors', ['best', 'random']) 
    dtc_maxdepth = trial.sugguest_int('max_depth', 1,10)
    dtc_maxfeatures = trial.suggest_int('max_features', .01, .7)
    
    dtc = DecisionTreeClassifier(criterion = dtc_criterion, 
                                 splitter = dtc_splitter, 
                                 max_depth = dtc_maxdepth, 
                                 max_features = dtc_maxfeatures, 
                                class_weight = 'balanced')
    
    dtc.fit(X_train, y_train)
    y_pred_dtc= dtc.predict(X_test)
    return (1 - f1_score(y_test, y_pred_dtc, average = 'weighted'))

In [None]:
study = optuna.create_study()
study.optimize(dtc_objective, n_trials = 100)

In [None]:
dtcOptuna = DecissionTreeClassifer()
dtcOptuna.fit(X_train,y_train)
y_pred_dtcOptuna = knnOptuna.predict(X_test)

print('F1: ', f1_score(y_test, y_pred_dtcOptuna, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_dtcOptuna))

In [None]:
mod = open('PKL/dtcOptuna.pkl', 'wb')
pickle.dump(dtcOptuna, mod)
mod.close()

### Random Forest

#### Random forest classifer, then expanded to test optuna. 

In [426]:
rfcSimple = RandomForestClassifier(random_state = 1, 
                                   n_estimators = 300, max_depth = 5, 
                                   max_features = 4, 
                                   class_weight = 'balanced', 
                                   criterion = 'gini')
rfcSimple.fit(X_train, y_train)
y_pred_rfcSimple = rfcSimple.predict(X_test)

print('F1: ', f1_score(y_test, y_pred_rfcSimple, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_rfcSimple))

In [427]:
mod = open('PKL/rfcSimple.pkl', 'wb')
pickle.dump(rfcSimple, mod)
mod.close()

RandomForestClassifier(class_weight='balanced', max_depth=5, max_features=4,
                       n_estimators=300, random_state=1)

F1:  0.653001914265447
Accuracy:  0.6015558609618851


### Random Forest with Optuna

In [455]:
def objective(trial): 
    rfc_max_depth = trial.suggest_int('max_depth', 5,10)
    rfc_n_estimators = trial.suggest_int('n_estimators', 300, 700) 
    rfc_max_features = trial.suggest_loguniform('max_features', .001, .2)
    rfc_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    
    rfc = RandomForestClassifier(random_state = 1, max_depth = rfc_max_depth, n_estimators = rfc_n_estimators,
                                 max_features = rfc_max_features, criterion = rfc_criterion, class_weight = 'balanced')
    rfc.fit(X_train_sm, y_train_sm)
    y_pred_optuna = rfc.predict(X_test)
    return (1 - f1_score(y_test, y_pred_optuna, average = 'weighted'))

In [471]:
study = optuna.create_study()
study.optimize(objective, n_trials = 100)

KeyboardInterrupt: 

#### rfc Optuna

In [452]:
rfc_Optuna = RandomForestClassifier(random_state = 1, 
                                    n_estimators = 572, 
                                    max_depth = 10, 
                                   max_features = 0.06602383170294993, 
                                    class_weight = 'balanced', 
                                    criterion = 'gini')
rfc_Optuna.fit(X_train, y_train)
y_pred_Optuna = rfc_Optuna.predict(X_test)
print('F1: ', f1_score(y_test, y_pred_Optuna, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_Optuna))

In [None]:
mod = open('PKL/rfcOptuna.pkl', 'wb')
pickle.dump(rfcOptuna, mod)
mod.close()

#### rfc Optuna 2

In [472]:
rfc_Optuna2 = RandomForestClassifier(random_state = 1, n_estimators = 572, 
                                     max_depth = 10, 
                                     max_features = 0.06602383170294993, 
                                     class_weight = 'balanced', 
                                     criterion = 'gini')

rfc_Optuna2.fit(X_train_sm, y_train_sm)
y_pred_Optuna2 = rfc_Optuna2.predict(X_test)

print('F1: ', f1_score(y_test, y_pred_Optuna2, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_Optuna2))

In [473]:
mod = open('PKL/rfcOptuna2.pkl', 'wb')
pickle.dump(rfcOptuna2, mod)
mod.close()

### Plotting Feature Importances

In [None]:
def plot_feature_importances(model):
    n_features = X_train.shape[1]
    plt.figure(figsize=(10,8))
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), X_train.columns.values)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

### GridSearchCV 

### Voting Classifer

In [None]:
vote = VotingClassifier( estimators= [('knn', knnSimple),
                                      ('knnOptuna', knnOptuna),
                                      ('dtc', dtcSimple),
                                      ('rfc_Optuna2', rfc_Optuna2), 
                                      ('rfcOptuna', rfcOptuna)], 
                                        voting='hard')

vote.fit(X_train, y_train)
y_pred_vote = vote.predict(X_test)

print('F1: ', f1_score(y_test, y_pred_vote, average = 'weighted'))
print('Accuracy: ', balanced_accuracy_score(y_test, y_pred_vote))

In [None]:
mod = open('PKL/vote.pkl', 'wb')
pickle.dump(vote, mod)
mod.close()