In [20]:
#Basics
import pandas as pd
import numpy as np

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Train Test Split
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Feature Selection
from sklearn.feature_selection import VarianceThreshold

# Classifiers
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.multiclass import OneVsRestClassifier

#Pipeline
from sklearn.pipeline import Pipeline

#Grid Search
from sklearn.model_selection import GridSearchCV

# Model evaluation
from sklearn.metrics import plot_confusion_matrix, accuracy_score

#Set Random State
random_state = 42

# Import Data

In [3]:
features = pd.read_csv('../data/training_features.csv', index_col='id')
targets = pd.read_csv('../data/training_labels.csv', index_col='id')
df = features.join(targets, how='left')
X = df.drop('status_group', axis=1)
y = df['status_group']

# Test Train Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

# Classifying Variables

In [5]:
# Variables
gps = [
    'latitude',
    'longitude',
    'gps_height'
]

region = [
    'region',
    'lga',
    'ward'
]

water_table = [
    'basin',
    'quality_group',
    'quantity',
    'source_class'
    
]

waterpoint_cat = [
    'extraction_type_class',
    'waterpoint_type_group',
]

waterpoint_num = [
    'amount_tsh',
    'population'
]

management = [
    'management_group',
    'management'
]

construction = ['construction_year']

cat_vars = water_table + waterpoint_cat + management + region

num_vars = gps + waterpoint_num + construction

drop_vars = list(set(X.columns).difference(set(num_vars + cat_vars)))

# Building Preprocessor

In [6]:
preprocessor = ColumnTransformer(transformers=[
            ('numericalPreprocessor', StandardScaler(), num_vars),
            ('categoricalPreprocessor', OneHotEncoder(handle_unknown='ignore'), cat_vars),
            ('dropProcessor', 'drop', drop_vars)
])

# Building Pipeline

In [33]:
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('featureSelector', VarianceThreshold()),
        ('classifier', OneVsRestClassifier(estimator=XGBClassifier(max_depth=14)))
    ]
)

# Building Parameter Grid

In [34]:
parameter_grid = [
    {
        'featureSelector__threshold': [0.8*0.2, 0.85*0.15, 0.9*0.1]
    }
]

# Instantiate Grid Search

In [35]:
grid_search = GridSearchCV(estimator=pipeline, 
                           param_grid=parameter_grid, 
                           scoring='accuracy', 
                           cv=5, 
                           verbose=2, 
                           n_jobs=-1
                          )

# Fit Grid Search

In [36]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  7.8min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('numericalPreprocessor',
                                                                         StandardScaler(),
                                                                         ['latitude',
                                                                          'longitude',
                                                                          'gps_height',
                                                                          'amount_tsh',
                                                                          'population',
                                                                          'construction_year']),
                                                                        ('categoricalPreprocessor',
                                                                         OneHotEncoder(handle_unknown=

# Return Best Training Accuracy Score from Grid Search

In [37]:
grid_search.best_score_

0.7992517770295547

In [38]:
# Review Grid Search Cross Validation Results

In [39]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_featureSelector__threshold,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,113.212041,1.178849,2.794629,0.288026,0.16,{'featureSelector__threshold': 0.1600000000000...,0.786663,0.792368,0.78844,0.791152,0.789375,0.7896,0.002005,3
1,118.679739,3.153017,2.521652,0.176669,0.1275,{'featureSelector__threshold': 0.1275},0.785728,0.792742,0.789001,0.7919,0.789749,0.789824,0.002461,2
2,115.555031,16.009939,1.948438,0.496872,0.09,{'featureSelector__threshold': 0.0900000000000...,0.795642,0.802282,0.798447,0.801814,0.798073,0.799252,0.002483,1


In [40]:
grid_search.score(X_test, y_test)

0.8067340067340067

# Base Model

In [41]:
base_model = grid_search.best_estimator_

# Import Validation Data

In [42]:
X_validate = pd.read_csv('../data/testing_features.csv', index_col='id')

In [43]:
y_validate = base_model.predict(X_validate)

In [44]:
df_predictions = pd.DataFrame(y_validate, index=X_validate.index, columns=['status_group'])

In [45]:
df_predictions.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [46]:
df_predictions.to_csv('../predictions/base_model2.csv')