In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
random_state = 42

In [2]:
from sklearn.model_selection import train_test_split
features = pd.read_csv('../data/training_features.csv', index_col='id')
targets = pd.read_csv('../data/training_labels.csv', index_col='id')
df = features.join(targets, how='left')
X = df.drop('status_group', axis=1)
y = df['status_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)

In [41]:
# Variables
gps = [
    'latitude',
    'longitude',
    'gps_height'
]

region = [
    'region',
    'lga',
    'ward'
]

water_table = [
    'basin',
    'quality_group',
    'quantity',
    'source_class'
    
]

waterpoint_cat = [
    'extraction_type_class',
    'waterpoint_type_group',
]

waterpoint_num = [
    'amount_tsh',
    'population'
]

management = [
    'management_group',
    'management'
]

construction = ['construction_year']

cat_vars = water_table + waterpoint_cat + management

num_vars = gps + waterpoint_num

drop_vars = list(set(X.columns).difference(set(num_vars + cat_vars)))
drop_vars

['construction_year',
 'public_meeting',
 'quantity_group',
 'funder',
 'installer',
 'district_code',
 'payment_type',
 'extraction_type_group',
 'water_quality',
 'scheme_management',
 'region_code',
 'source',
 'permit',
 'wpt_name',
 'payment',
 'source_type',
 'recorded_by',
 'subvillage',
 'date_recorded',
 'ward',
 'waterpoint_type',
 'region',
 'num_private',
 'lga',
 'scheme_name',
 'extraction_type']

In [42]:
# Pipline
from imblearn.pipeline import Pipeline

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer

# Dimensionality Reduction 
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration

# Resampler
from imblearn.over_sampling import SMOTE

# Model evaluation
from sklearn.metrics import plot_confusion_matrix

# Base estimator
from sklearn.base import BaseEstimator

class DummyEstimator(BaseEstimator):
    pass




pipeline = Pipeline(
    steps=[
        ('preprocessor', ColumnTransformer(transformers=[
            ('numericalPreprocessor', StandardScaler(), num_vars),
            ('categoricalPreprocessor', OneHotEncoder(drop='first', dtype=int), cat_vars),
            ('dropProcessor', 'drop', drop_vars)
        ])),
        #('dimensionReduction', PCA()),
        #('resampler', SMOTE(random_state=random_state)),
        ('classifier', OneVsRestClassifier(estimator='passthrough'))
    ]
)

In [50]:
import numpy as np
from sklearn.model_selection import GridSearchCV
# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
# Scoriing
from sklearn.metrics import f1_score
parameter_grid = [
    {
        #'dimensionReduction__n_components': [None, 8, 6, 5, 4, 3],
        'classifier__estimator': [XGBClassifier()],
        'classifier__max_depth': [1, 2],
        #'classifier__n_classifiers': [150, 100]
    }
]
grid_search = GridSearchCV(pipeline, parameter_grid, scoring='accuracy')

In [51]:
grid_search.fit(X_train, y_train)

ValueError: Invalid parameter max_depth for estimator OneVsRestClassifier(estimator=XGBClassifier()). Check the list of available parameters with `estimator.get_params().keys()`.

In [45]:
grid_search.score(X_train, y_train)

0.7324354657687991