In [1]:
#Basics
import pandas as pd
import numpy as np

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Train Test Split
from sklearn.model_selection import train_test_split

# Imputer
from sklearn.impute import SimpleImputer

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Classifiers
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

#Pipeline
from sklearn.pipeline import Pipeline

#Grid Search
from sklearn.model_selection import GridSearchCV

# Model evaluation
# NOTE(review): `plot_confusion_matrix` is not called in any visible cell, and
# it was deprecated in scikit-learn 1.0 and removed in 1.2 (replaced by
# `ConfusionMatrixDisplay.from_estimator`) -- confirm the pinned sklearn
# version, otherwise this import fails on a fresh environment.
from sklearn.metrics import plot_confusion_matrix

#Set Random State
# Seed passed to train_test_split below so the split is reproducible.
random_state = 42

# Import Data

In [2]:
# Feature and label tables are both indexed by the `id` column.
features = pd.read_csv('../data/training_features.csv', index_col='id')
targets = pd.read_csv('../data/training_labels.csv', index_col='id')

# Attach the labels to the features by index (left join keeps every feature row).
df = features.join(targets, how='left')

# Split the joined frame back into the target vector and the predictor matrix.
y = df['status_group']
X = df.drop(columns='status_group')

# Test Train Split

In [3]:
# Hold out 10% of the rows; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

# Classifying Variables

In [4]:
# Report every column whose values are not all of one Python type
# (typically strings/bools mixed with float NaN for missing entries).
for col in X_train.columns:
    value_types = {type(val) for val in X_train[col]}
    if len(value_types) > 1:
        print(col, value_types)

funder {<class 'float'>, <class 'str'>}
installer {<class 'float'>, <class 'str'>}
subvillage {<class 'float'>, <class 'str'>}
public_meeting {<class 'float'>, <class 'bool'>}
scheme_management {<class 'float'>, <class 'str'>}
scheme_name {<class 'float'>, <class 'str'>}
permit {<class 'float'>, <class 'bool'>}


In [5]:
def classify_columns(df, drop_cols):
    """Split the columns of a dataframe into categorical and numerical groups.

    A column is numerical when pandas reports a numeric dtype for it (integer,
    float, bool, complex). Every other dtype -- ``object``, pandas string
    dtypes, ``category``, datetimes -- is treated as categorical. Checking for
    "numeric" rather than ``dtype == object`` keeps string and ``category``
    columns out of the numerical group, where median imputation and scaling
    would fail on them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    drop_cols : iterable of column labels, or None
        Columns to leave out of both groups. ``None`` means drop nothing.

    Returns
    -------
    cat_cols : list
        Labels of the categorical columns, in ``df`` column order.
    num_cols : list
        Labels of the numerical columns, in ``df`` column order.
    """
    # A set makes the membership test O(1) per column.
    excluded = set(drop_cols) if drop_cols is not None else set()

    cat_cols = []
    num_cols = []
    # Iterating df.dtypes (rather than df[col]) also copes with duplicate labels.
    for col, dtype in df.dtypes.items():
        if col in excluded:
            continue
        if pd.api.types.is_numeric_dtype(dtype):
            num_cols.append(col)
        else:
            cat_cols.append(col)
    return cat_cols, num_cols

In [6]:
# `public_meeting` and `permit` are the two columns the type scan above
# reported as a mix of bool and float; they are excluded from both groups.
drop_cols = ['public_meeting', 'permit']
# Column groups are derived from the training split only.
cat_cols, num_cols = classify_columns(X_train, drop_cols)

# Building Preprocessor

In [7]:
# Numerical columns: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('standardizer', StandardScaler()),
    ]
)

# Categorical columns: replace gaps with the literal 'missing', then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('standardizer', OneHotEncoder(handle_unknown='ignore', dtype=float)),
    ]
)

# Route each column group through its pipeline; `drop_cols` are discarded.
preprocessor = ColumnTransformer(transformers=[
    ('numericalPreprocessor', numerical_pipeline, num_cols),
    ('categoricalPreprocessor', categorical_pipeline, cat_cols),
    ('dropPreprocessor', 'drop', drop_cols),
])

# Building Pipeline

In [8]:
# Full model: preprocessing followed by a one-vs-rest wrapper. The wrapped
# estimator is only a 'passthrough' placeholder here; the parameter grid
# supplies the real estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(estimator='passthrough')),
])

In [9]:
# Sanity check: fit the preprocessor on the training split and display the
# resulting matrix so its shape and sparsity can be inspected.
preprocessor.fit_transform(X_train)

<53460x61572 sparse matrix of type '<class 'numpy.float64'>'
	with 1978020 stored elements in Compressed Sparse Row format>

# Building Parameter Grid

In [10]:
# One grid: swap XGBClassifier in for the pipeline's 'passthrough' placeholder
# estimator, with a fixed max_depth and three n_estimators values (3 candidates).
parameter_grid = [
    {
        'classifier__estimator': [XGBClassifier()],
        'classifier__estimator__max_depth': [20],
        'classifier__estimator__n_estimators': [200, 250, 300]
    }
]

# Instantiate Grid Search

In [11]:
# 5-fold cross-validated search over `parameter_grid`, scored on accuracy and
# run on all available cores with per-fit progress logging.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit Grid Search

In [12]:
# NOTE(review): the search is fitted on the full X / y, not X_train / y_train,
# so the X_test / y_test hold-out created earlier is included in the CV folds
# and is never scored in the visible cells -- confirm this is intentional
# (e.g. to train the submission model on all labelled data).
grid_search.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 139.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('numericalPreprocessor',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardizer',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'longitude',
                                                                          'latitude',
                                   

# Return Best Cross-Validated Accuracy Score from Grid Search

In [23]:
# Mean cross-validated accuracy of the best candidate (averaged over the
# 5 held-out folds), not accuracy on the training data.
grid_search.best_score_

0.8154040404040405

In [24]:
# Review Grid Search Cross Validation Results

In [25]:
# One row per candidate: fit/score timings, per-fold scores, mean, std and rank.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__estimator,param_classifier__estimator__max_depth,param_classifier__estimator__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1819.809786,15.454137,5.799799,0.083478,"XGBClassifier(max_depth=20, n_estimators=200)",20,200,{'classifier__estimator': XGBClassifier(max_de...,0.820202,0.815657,0.817003,0.81229,0.811869,0.815404,0.003093,1
1,2223.276941,21.538271,6.76622,0.374251,"XGBClassifier(max_depth=20, n_estimators=200)",20,250,{'classifier__estimator': XGBClassifier(max_de...,0.820118,0.814983,0.817003,0.812121,0.811869,0.815219,0.0031,2
2,2183.163614,385.492551,5.385731,1.457936,"XGBClassifier(max_depth=20, n_estimators=200)",20,300,{'classifier__estimator': XGBClassifier(max_de...,0.820286,0.814394,0.816414,0.812121,0.811616,0.814966,0.003164,3


# Base Model

In [26]:
# Best pipeline found by the search; with GridSearchCV's default refit
# behaviour it has been refitted on all data passed to `fit`.
base_model = grid_search.best_estimator_

# Import Validation Data

In [27]:
# Unlabelled feature table to predict on, indexed by `id` like the training data.
X_validate = pd.read_csv('../data/testing_features.csv', index_col='id')

In [28]:
# Predicted `status_group` labels; the pipeline applies its own preprocessing.
y_validate = base_model.predict(X_validate)

In [29]:
# Pair each prediction with the `id` of the row it was made for.
df_predictions = pd.DataFrame(
    data=y_validate,
    index=X_validate.index,
    columns=['status_group'],
)

In [30]:
# Preview the first rows to check the index and label column before saving.
df_predictions.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,functional
51630,functional
17168,functional
45559,non functional
49871,functional


In [31]:
# Write the predictions (index `id` + `status_group`) to disk.
# NOTE(review): the output filename is hard-coded, so re-running overwrites it.
df_predictions.to_csv('../predictions/base_model6.csv')