In [None]:
#Basics
import pandas as pd
import numpy as np

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Train Test Split
from sklearn.model_selection import train_test_split

# Imputer
from sklearn.impute import SimpleImputer

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Classifiers
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

#Pipeline
from sklearn.pipeline import Pipeline

#Grid Search
from sklearn.model_selection import GridSearchCV

# Model evaluation
from sklearn.metrics import plot_confusion_matrix

# Set Random State
# Seed reused wherever a `random_state` argument is needed (e.g. the
# train/test split below) so that results can be reproduced.
random_state = 42

# Import Data

In [None]:
# Read the feature and label tables, both indexed by their `id` column,
# then attach the labels to the features with a left join on that index.
features = pd.read_csv('../data/training_features.csv', index_col='id')
targets = pd.read_csv('../data/training_labels.csv', index_col='id')
df = features.join(targets, how='left')

# Separate the `status_group` target from the predictor columns.
y = df['status_group']
X = df.drop(columns='status_group')

# Test Train Split

In [None]:
# Hold out 10% of the rows; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

# Classifying Variables

In [None]:
# Print every training column whose values are not all of one Python type,
# together with the set of types found in it.
for column_name in X_train.columns:
    value_types = {type(value) for value in X_train[column_name]}
    if len(value_types) >= 2:
        print(column_name, value_types)

In [None]:
def classify_columns(df, drop_cols=None):
    """Split the columns of a dataframe into categorical and numerical groups.

    Columns listed in ``drop_cols`` are left out of both groups. Every other
    column is treated as numerical when pandas reports a numeric dtype for it
    (integers, floats, booleans and their nullable variants) and as
    categorical otherwise (object, string, ``category``, datetime, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. Only dtypes are inspected.
    drop_cols : collection of column labels, optional
        Columns to exclude from both returned lists. ``None`` excludes nothing.

    Returns
    -------
    cat_cols : list
        Labels of the categorical columns, in frame order.
    num_cols : list
        Labels of the numerical columns, in frame order.
    """
    if drop_cols is None:
        drop_cols = []

    cat_cols = []
    num_cols = []
    # Iterate over df.dtypes rather than df[col] so the dtype is read per
    # physical column, even if a label happens to be duplicated.
    for col, dtype in df.dtypes.items():
        if col in drop_cols:
            continue
        # A plain `dtype == object` test would send `category` and string
        # extension dtypes to the numerical list, where median imputation and
        # scaling cannot handle them.
        if pd.api.types.is_numeric_dtype(dtype):
            num_cols.append(col)
        else:
            cat_cols.append(col)
    return cat_cols, num_cols

In [None]:
# Columns excluded from both feature groups; the ColumnTransformer built
# below drops them. (Presumably these are the mixed-type columns reported by
# the type check above -- confirm against its output.)
drop_cols = ['public_meeting', 'permit']
# Remaining training columns, split by dtype into categorical and numerical.
cat_cols, num_cols = classify_columns(X_train, drop_cols)

# Building Preprocessor

In [None]:
# Categorical branch: missing entries become the literal string 'missing',
# then each category is expanded into float indicator columns. Categories not
# seen during fit are ignored at transform time instead of raising.
# NOTE: the encoder step is registered under the name 'standardizer'.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('standardizer', OneHotEncoder(dtype=float, handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column median, then standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('standardizer', StandardScaler()),
])

# Route each column group to its branch; the columns in drop_cols are
# discarded explicitly.
preprocessor = ColumnTransformer([
    ('numericalPreprocessor', numerical_pipeline, num_cols),
    ('categoricalPreprocessor', categorical_pipeline, cat_cols),
    ('dropPreprocessor', 'drop', drop_cols),
])

# Building Pipeline

In [None]:
# Full model: preprocessing followed by a one-vs-rest wrapper. The wrapped
# estimator is only the placeholder string 'passthrough' at this point; the
# parameter grid supplies the real estimator through `classifier__estimator`.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(estimator='passthrough')),
])

In [None]:
# Fit the preprocessor on the training split and display the transformed
# result; the return value is not stored.
preprocessor.fit_transform(X_train)

# Building Parameter Grid

In [None]:
# Candidate settings for the grid search. Keys follow the pipeline's
# `<step>__<param>` naming: `classifier__estimator` replaces the placeholder
# estimator inside the one-vs-rest wrapper, and the nested keys tune that
# estimator's own hyperparameters (2 depths x 3 tree counts = 6 candidates).
parameter_grid = [
    {
        # Seed the booster with the notebook-wide seed so repeated searches
        # build the same models.
        'classifier__estimator': [XGBClassifier(random_state=random_state)],
        'classifier__estimator__max_depth': [15, 20],
        'classifier__estimator__n_estimators': [100, 150, 200]
    }
]

# Instantiate Grid Search

In [None]:
# 5-fold cross-validated search over the grid, scored by accuracy, run on
# all available cores with progress logging.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit Grid Search

In [None]:
# Fit on the training split only. Fitting on the full X/y lets the search and
# the refit best estimator see the rows in X_test, so the later
# grid_search.score(X_test, y_test) is no longer an out-of-sample estimate.
# If the final submission model should use every labelled row, refit the
# chosen configuration on X/y after it has been evaluated.
grid_search.fit(X_train, y_train)

# Return Best Mean Cross-Validated Accuracy from Grid Search

In [None]:
# Mean cross-validated score (accuracy, per the `scoring` argument) of the
# best parameter combination.
grid_search.best_score_

In [None]:
# Review Grid Search Cross Validation Results

In [None]:
# Show the search's cross-validation results as a table.
pd.DataFrame(grid_search.cv_results_)

In [None]:
# Score the refit best estimator on the X_test / y_test split, using the
# search's scoring metric.
grid_search.score(X_test, y_test)

# Base Model

In [None]:
# Keep the pipeline that the search refit with its best parameter combination.
base_model = grid_search.best_estimator_

# Import Validation Data

In [None]:
# Load the features to predict on, indexed by `id` (read from
# testing_features.csv; no labels are loaded for these rows).
X_validate = pd.read_csv('../data/testing_features.csv', index_col='id')

In [None]:
# Predicted labels for the loaded rows (model output, not ground truth).
y_validate = base_model.predict(X_validate)

In [None]:
# Wrap the predictions in a one-column frame that reuses the `id` index of
# the rows they were made for.
df_predictions = pd.DataFrame(
    y_validate,
    index=X_validate.index,
    columns=['status_group'],
)

In [None]:
# Preview the first few predictions.
df_predictions.head()

In [None]:
# Write the predictions (with their `id` index) to disk; the
# ../predictions directory must already exist.
df_predictions.to_csv('../predictions/base_model5.csv')