In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Read the feature and label tables, each keyed by its `id` column.
features, targets = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('../data/training_features.csv', '../data/training_labels.csv')
)
# Attach the labels to the features by index; the left join keeps every feature row.
df = features.join(other=targets, how='left')

In [None]:
# Pipeline
from imblearn.pipeline import Pipeline

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer

# Dimensionality Reduction 
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration

# Resampler
from imblearn.over_sampling import SMOTE

# Model evaluation
from sklearn.metrics import plot_confusion_matrix

# Base estimator
from sklearn.base import BaseEstimator

class DummyEstimator(BaseEstimator):
    """Empty estimator that adds nothing on top of ``BaseEstimator``.

    NOTE(review): not referenced in the visible cells; presumably intended as
    a stand-in for a pipeline step that a grid search replaces — confirm.
    """

# Variables
# Column names of the features table, grouped by theme. `pass_features` and
# `drop_features` are consumed by the ColumnTransformer in the pipeline below.

# Coordinates and elevation of each record.
gps = [
    'latitude',
    'longitude',
    'gps_height',
]

# Location columns (presumably ordered coarse to fine — confirm).
region = [
    'region',
    'lga',
    'ward',
    'subvillage',
]

# Water source descriptors.
water_table = [
    'basin',
    'quality_group',
    'quantity',
    'source_class',
]

# NOTE(review): `waterpoint_quant` and `waterpoint_qual` look swapped — this
# list holds class/type columns while `waterpoint_qual` holds amount and
# population columns. The names are kept because other cells may use them.
waterpoint_quant = [
    'extraction_type_class',
    # Was 'water_point_type_group'; spelled to match 'waterpoint_type' in
    # `drop_features`, following the same kept/dropped pairing used for the
    # other column families (e.g. 'quantity' / 'quantity_group').
    'waterpoint_type_group',
]

waterpoint_qual = [
    'amount_tsh',
    'population',
]

management = [
    'management_group',
    'management',
]

construction = ['construction_year']

# Columns forwarded through the ColumnTransformer untouched (none for now).
pass_features = []

# Columns explicitly discarded by the ColumnTransformer. Several appear to be
# alternate-granularity variants of columns kept in the groups above.
drop_features = [
    'region_code',
    'district_code',
    'quantity_group',
    'water_quality',
    'source_type',
    'source',
    'extraction_type_group',
    'extraction_type',
    'waterpoint_type',
    'scheme_name',
    'scheme_management',
    'payment_type',
    'installer',
    'funder',
]



# Seed for the stochastic resampling step so reruns give the same result.
random_state = 42

# Columns routed to each preprocessing branch, assembled from the feature
# groups defined earlier in this cell. These names were previously undefined,
# so the cell raised NameError on a fresh kernel.
# NOTE(review): the numeric/categorical split is inferred from the column
# names — confirm against the dtypes of the features table.
numerical_features = gps + waterpoint_qual + construction
categorical_features = region + water_table + waterpoint_quant + management

# Modelling pipeline: preprocess -> reduce dimensionality -> oversample ->
# classify. The classifier step is a 'passthrough' placeholder that a grid
# search is expected to replace with a concrete estimator.
# NOTE(review): OneHotEncoder emits sparse output by default and
# ColumnTransformer keeps the stacked result sparse when its density falls
# below `sparse_threshold`; older scikit-learn releases of PCA reject sparse
# input — confirm this works on the installed version.
pipeline = Pipeline(
    steps=[
        ('preprocessor', ColumnTransformer(transformers=[
            # Standardise numeric columns.
            ('numericalPreprocessor', StandardScaler(), numerical_features),
            # One-hot encode categoricals, dropping the first level of each.
            ('categoricalPreprocessor', OneHotEncoder(drop='first', dtype=int), categorical_features),
            ('passProcessor', 'passthrough', pass_features),
            ('dropProcessor', 'drop', drop_features)
        ])),
        ('dimensionReduction', PCA()),
        # imblearn's Pipeline applies the resampler during fit only.
        ('resampler', SMOTE(random_state=random_state)),
        ('classifier', 'passthrough')
    ]
)

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from xgboost import XGBClassifier
# Scoring
from sklearn.metrics import f1_score
# Hyper-parameter grid searched over the pipeline. Keys use the
# `<step name>__<parameter>` convention; the bare 'classifier' key swaps the
# pipeline's placeholder classifier step for the listed estimator.
parameter_grid = [
    {
        'dimensionReduction__n_components': [None, 8, 6, 5, 4, 3],
        'classifier': [XGBClassifier()],
        'classifier__max_depth': [1, 2, 3],
        # Was 'classifier__n_classifiers', which is not an XGBClassifier
        # parameter; the number of boosting rounds is `n_estimators`.
        'classifier__n_estimators': [150, 100]
    }
]
# Exhaustive search over the grid using all available cores.
# NOTE(review): 'roc_auc' is the binary scorer; if the label column has more
# than two classes, a multiclass scorer (e.g. 'roc_auc_ovr' or 'f1_macro')
# is needed instead — confirm against the labels table.
grid_search = GridSearchCV(pipeline, parameter_grid, n_jobs=-1, scoring='roc_auc')