In [1]:
#Standard library
from pathlib import Path

#Basics
import pandas as pd
import numpy as np

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Train Test Split
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Classifiers
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

#Pipeline
from sklearn.pipeline import Pipeline

#Grid Search
from sklearn.model_selection import GridSearchCV

# Model evaluation
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 and is not
# used in the visible cells; on newer versions this import fails - confirm the
# pinned scikit-learn version or switch to ConfusionMatrixDisplay.
from sklearn.metrics import plot_confusion_matrix

# Shared seed for the stochastic steps in this notebook (passed to the
# train/test split) so that a re-run reproduces the same partition.
random_state = 42

# Import Data

In [2]:
# Load the feature and label tables; both are indexed by the 'id' column.
features = pd.read_csv('../data/training_features.csv', index_col='id')
targets = pd.read_csv('../data/training_labels.csv', index_col='id')

# A left join keeps every feature row; a row without a matching label ends up
# with NaN in 'status_group'.
df = features.join(targets, how='left')

# Fail fast with a readable message rather than letting NaN labels surface
# later inside the grid search.
missing_labels = df['status_group'].isna()
assert not missing_labels.any(), (
    f"{int(missing_labels.sum())} feature rows have no matching status_group label"
)

# Split the joined frame into the model inputs and the target column.
X = df.drop('status_group', axis=1)
y = df['status_group']

# Train Test Split

In [3]:
# Hold out 10% of the rows for a final check; the split is seeded so it is
# the same on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)
X_train, X_test, y_train, y_test = split_parts

# Classifying Variables

In [11]:
# Feature groups: the columns fed to the model, bundled by theme.
# Each group ends up in either cat_vars or num_vars below.

# Numeric groups
gps = ['latitude', 'longitude', 'gps_height']
waterpoint_num = ['amount_tsh', 'population']
construction = ['construction_year']

# Categorical groups
region = ['region', 'lga', 'ward']
water_table = ['basin', 'quality_group', 'quantity', 'source_class']
waterpoint_cat = ['extraction_type_class', 'waterpoint_type_group']
management = ['management_group', 'management']

# Flattened lists consumed by the preprocessor (order matches the groups listed).
cat_vars = [*water_table, *waterpoint_cat, *management, *region]
num_vars = [*gps, *waterpoint_num, *construction]

# Columns that are neither numeric nor categorical model inputs; the
# preprocessor drops them explicitly.
# Built in X's own column order instead of via list(set(...)) so the list is
# identical from run to run rather than depending on string-hash ordering.
selected_vars = set(num_vars + cat_vars)
drop_vars = [column for column in X.columns if column not in selected_vars]

# Building Preprocessor

In [5]:
# Column-wise preprocessing:
#   - numeric columns are standardised,
#   - categorical columns are one-hot encoded (handle_unknown='ignore' so
#     categories not seen during fit do not raise at transform time),
#   - every other column is dropped.
scale_numeric = ('numericalPreprocessor', StandardScaler(), num_vars)
encode_categorical = (
    'categoricalPreprocessor',
    OneHotEncoder(handle_unknown='ignore'),
    cat_vars,
)
drop_unused = ('dropProcessor', 'drop', drop_vars)

preprocessor = ColumnTransformer(
    transformers=[scale_numeric, encode_categorical, drop_unused]
)

# Building Pipeline

In [6]:
# Two-stage pipeline: column preprocessing, then a one-vs-rest classifier.
# The wrapped estimator is only a placeholder here; the parameter grid
# supplies the actual estimator through 'classifier__estimator'.
classifier_step = ('classifier', OneVsRestClassifier(estimator='passthrough'))
pipeline = Pipeline(steps=[('preprocessor', preprocessor), classifier_step])

# Building Parameter Grid

In [8]:
# Search space for the grid search: the placeholder estimator inside the
# one-vs-rest wrapper is replaced by an XGBoost classifier, and its maximum
# tree depth is tuned over three values.
# The classifier is seeded with the notebook-wide random_state so that repeated
# runs of the search use the same seed as the rest of the notebook.
parameter_grid = [
    {
        'classifier__estimator': [XGBClassifier(random_state=random_state)],
        'classifier__estimator__max_depth': [13, 14, 15]
    }
]

# Instantiate Grid Search

In [9]:
# 5-fold cross-validated search over parameter_grid, ranked by accuracy.
# n_jobs=-1 runs the fits in parallel; verbose=2 logs progress.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit Grid Search

In [10]:
# Run the search on the training partition only; the held-out rows stay unseen.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.5s finished


ValueError: Input contains NaN

# Best Mean Cross-Validated Accuracy from Grid Search

In [10]:
# Mean accuracy across the 5 cross-validation folds for the best parameter
# combination (a validation-fold score, not accuracy on the training rows).
grid_search.best_score_

0.800748222970445

In [16]:
# Review Grid Search Cross Validation Results

In [17]:
# Tabulate the per-candidate timings and per-fold scores recorded by the search.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__estimator,param_classifier__estimator__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,159.688357,1.347292,2.290646,0.032584,XGBClassifier(max_depth=15),13,{'classifier__estimator': XGBClassifier(max_de...,0.791059,0.796857,0.801066,0.800973,0.793678,0.796727,0.003957,3
1,167.065236,5.797104,3.929056,1.297546,XGBClassifier(max_depth=15),14,{'classifier__estimator': XGBClassifier(max_de...,0.793771,0.797138,0.80116,0.802095,0.796296,0.798092,0.003106,2
2,140.285704,22.841015,1.758261,0.513197,XGBClassifier(max_depth=15),15,{'classifier__estimator': XGBClassifier(max_de...,0.794893,0.800599,0.803872,0.805556,0.798822,0.800748,0.003765,1


In [15]:
# Accuracy of the refit best pipeline on the 10% hold-out partition.
grid_search.score(X=X_test, y=y_test)

0.8035353535353535

# Base Model

In [18]:
# Keep the best pipeline found by the grid search as the baseline model.
base_model = grid_search.best_estimator_

# Import Validation Data

In [24]:
# Features to generate predictions for, indexed by 'id' like the training data.
X_validate = pd.read_csv('../data/testing_features.csv', index_col='id')

In [25]:
# Predict a status_group label for every row of the loaded feature file.
y_validate = base_model.predict(X_validate)

In [30]:
# One-column frame of predicted labels, keyed by the same 'id' index as the inputs.
df_predictions = pd.DataFrame(
    {'status_group': y_validate},
    index=X_validate.index,
)

In [31]:
# Preview the first few predictions before writing them out.
df_predictions.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [32]:
# Write the predictions to disk. The output folder is created first so the
# cell also works when '../predictions' does not exist yet.
predictions_path = Path('../predictions/base_model.csv')
predictions_path.parent.mkdir(parents=True, exist_ok=True)
df_predictions.to_csv(predictions_path)