In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import optuna

In [None]:
class OHE(BaseEstimator, TransformerMixin):
    """One-hot encode every object-dtype column of a DataFrame.

    Columns whose dtype is ``object`` at fit time are replaced by their
    one-hot expansion; all other columns are passed through unchanged.
    Categories not seen during ``fit`` are encoded as all-zero rows
    (``handle_unknown='ignore'``).  The returned frame always has a fresh
    0..n-1 index.
    """

    def __init__(self):
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, x, y=None):
        """Learn the categories of each object-dtype column of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Training features.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.categorical_features = [
            column for column, dtype in x.dtypes.items() if dtype == object
        ]
        # Only fit the encoder when there is something to encode, so a purely
        # numeric frame does not reach it with zero columns.
        if self.categorical_features:
            self.encoder.fit(x[self.categorical_features])
        return self

    def transform(self, x):
        """Replace the categorical columns of ``x`` with one-hot columns.

        Returns a new DataFrame: the non-categorical columns in their original
        order, followed by the encoded columns.  The input index is discarded.
        """
        # drop=True discards the old index outright instead of materialising it
        # as a column, so this works whatever the index is named and even if
        # the frame already has a column called 'index'.
        passthrough = x.drop(columns=self.categorical_features).reset_index(drop=True)
        if not self.categorical_features:
            return passthrough

        encoded = self.encoder.transform(x[self.categorical_features])
        ohe = pd.DataFrame(encoded.toarray(),
                           columns=self.encoder.get_feature_names_out())
        return pd.concat([passthrough, ohe], axis=1)

    def fit_transform(self, x, y=None):
        """Fit on ``x`` and return its encoded form."""
        return self.fit(x, y).transform(x)

In [None]:
class Imputer(BaseEstimator, TransformerMixin):
    """Fill missing values in a mixed-type DataFrame.

    Columns are split by dtype at fit time: object-dtype columns are imputed
    with their most frequent value, every other column with its median.
    The output frame lists the non-object columns first, then the object
    columns, and always has a fresh 0..n-1 index.
    """

    def __init__(self):
        self.numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        self.cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

    def fit(self, x, y=None):
        """Learn the fill value of every column of ``x``.

        Parameters
        ----------
        x : pandas.DataFrame
            Training features.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.numeric_features = [
            column for column, dtype in x.dtypes.items() if dtype != object
        ]
        self.cat_features = [
            column for column, dtype in x.dtypes.items() if dtype == object
        ]
        # Skip an imputer whose column group is empty so it is never fitted on
        # a frame with zero columns.
        if self.numeric_features:
            self.numeric_imputer.fit(x[self.numeric_features])
        if self.cat_features:
            self.cat_imputer.fit(x[self.cat_features])
        return self

    def transform(self, x):
        """Return a copy of ``x`` with missing values filled in."""
        parts = []
        if self.numeric_features:
            parts.append(pd.DataFrame(
                self.numeric_imputer.transform(x[self.numeric_features]),
                columns=self.numeric_features))
        if self.cat_features:
            parts.append(pd.DataFrame(
                self.cat_imputer.transform(x[self.cat_features]),
                columns=self.cat_features))
        if not parts:
            # Nothing to impute (frame without columns): still honour the
            # "fresh index" contract of the normal path.
            return x.reset_index(drop=True)
        # Both parts are built from plain arrays, so they already share a
        # default 0..n-1 index and can be concatenated side by side directly.
        return pd.concat(parts, axis=1)

    def fit_transform(self, x, y=None):
        """Fit on ``x`` and return its imputed form."""
        return self.fit(x, y).transform(x)

In [None]:
class Model(BaseEstimator, ClassifierMixin):
    """Thin scikit-learn style wrapper around an arbitrary classifier class.

    Parameters
    ----------
    model_class : callable
        Class (or factory) of the wrapped classifier.  Instances must provide
        ``fit(x, y)`` and ``predict_proba(x)``.
    params : dict or None
        Keyword arguments forwarded to ``model_class``; ``None`` means the
        wrapped classifier is built with its defaults.
    """

    def __init__(self, model_class, params=None):
        self.model_class = model_class
        self.params = params
        # The wrapped estimator is built eagerly so that ``self.model`` is
        # available immediately after construction.
        self.model = model_class() if params is None else model_class(**params)

    def fit(self, x, y):
        """Fit the wrapped classifier and remember the label set.

        Returns
        -------
        self
        """
        self.classes_ = np.unique(y)
        self.model.fit(x, y)
        return self

    def predict_proba(self, x):
        """Return the wrapped classifier's class probabilities for ``x``."""
        return self.model.predict_proba(x)

    def predict(self, x):
        """Return the most probable class label for every row of ``x``."""
        best_column = np.argmax(self.predict_proba(x), axis=1)
        # Map column positions back to the labels seen in fit.  For labels that
        # are already 0..k-1 this is the identity.
        # NOTE(review): assumes the wrapped model orders its probability
        # columns like np.unique(y) (sorted labels) - confirm for new models.
        classes = getattr(self, 'classes_', None)
        if classes is None:
            # fit() of this wrapper was never called; fall back to positions.
            return best_column
        return classes[best_column]

In [None]:
class Tuner():
    """Impute -> one-hot encode -> classify, with Optuna hyper-parameter search.

    Parameters
    ----------
    model_class : callable
        Classifier class handed to ``Model``; it is instantiated with default
        arguments until ``tune`` or ``objective`` replaces it.
    """

    def __init__(self, model_class):
        self.imputer = Imputer()
        self.encoder = OHE()
        self.model_class = model_class
        self.model = Model(self.model_class)

    def _preprocess(self, x):
        """Apply the already-fitted imputer and encoder to raw features."""
        return self.encoder.transform(self.imputer.transform(x))

    def fit(self, x, y):
        """Fit imputer, encoder and model on raw features ``x`` and labels ``y``.

        Returns
        -------
        self
        """
        x = self.imputer.fit_transform(x)
        x = self.encoder.fit_transform(x)
        self.model.fit(x, y)
        return self

    def predict_proba(self, x, val=False):
        """Return class probabilities for ``x``.

        When ``val`` is True, ``x`` is treated as raw features and is first run
        through the fitted imputer and encoder.  When False (the default), ``x``
        is passed to the model unchanged, i.e. it must already be preprocessed.
        """
        if val:
            x = self._preprocess(x)
        return self.model.predict_proba(x)

    def predict(self, x, val=False):
        """Return predicted classes for ``x``; ``val`` works as in ``predict_proba``."""
        if val:
            x = self._preprocess(x)
        return self.model.predict(x)

    def loss(self, y_true, y_pred):
        """Log loss of predicted probabilities ``y_pred`` against ``y_true``."""
        return log_loss(y_true, y_pred)

    def tune(self, x, y, n_trails, params):
        """Search the hyper-parameter space ``params`` with Optuna.

        Parameters
        ----------
        x : pandas.DataFrame
            Raw training features.
        y : array-like
            Training labels, aligned with ``x`` by position.
        n_trails : int
            Number of Optuna trials to run.
        params : dict
            Search space, see ``objective``.

        Returns
        -------
        dict
            The best trial's hyper-parameters.  ``self.model`` is replaced by
            an *unfitted* ``Model`` built with them; call ``fit`` afterwards.
        """
        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: self.objective(trial, x, y, params), n_trials=n_trails)
        best_params = study.best_trial.params
        # Wrap in Model and unpack the dict as keyword arguments, exactly as
        # objective() does, so fit/predict keep working after tuning.
        self.model = Model(self.model_class, params=best_params)
        return best_params

    def objective(self, trial, x, y, params):
        """Optuna objective: mean 5-fold stratified CV log loss of one trial.

        ``params`` maps each hyper-parameter name to a spec dict whose
        ``'type'`` is ``'float'`` or ``'int'`` (with ``'min'``/``'max'``) or
        ``'str'`` (with ``'list'`` of choices).

        Side effects: ``self.model`` is replaced by a model built from the
        sampled values, and imputer, encoder and model are refitted per fold.

        Raises
        ------
        ValueError
            If a spec has an unknown ``'type'``.
        """
        trial_params = {}
        for key, spec in params.items():
            kind = spec['type']
            if kind == 'float':
                trial_params[key] = trial.suggest_float(key, spec['min'], spec['max'])
            elif kind == 'int':
                trial_params[key] = trial.suggest_int(key, spec['min'], spec['max'])
            elif kind == 'str':
                trial_params[key] = trial.suggest_categorical(key, spec['list'])
            else:
                # Fail loudly: silently dropping the entry would tune a
                # different space than the caller asked for.
                raise ValueError(
                    "Unknown search-space type {!r} for parameter {!r}".format(kind, key))

        self.model = Model(self.model_class, params=trial_params)

        # The fold indices are positions, so index a plain array; a pandas
        # Series with a shuffled index would otherwise be looked up by label.
        y = np.asarray(y)
        fold_losses = []
        kf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
        for train_idx, val_idx in kf.split(x, y):
            x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            self.fit(x_train, y_train)
            preds = self.predict_proba(x_val, val=True)
            fold_losses.append(self.loss(y_val, preds))

        return np.array(fold_losses).mean()

In [None]:
# Features and labels come from separate CSV files.  Both are ordered by `id`;
# later cells pair them up purely by row position.
df_x = pd.read_csv(
    '/kaggle/input/pump-it-up-challenge-driven-data/training_Set_values.csv'
).sort_values(by=['id'], ascending=True)
df_y = pd.read_csv(
    '/kaggle/input/pump-it-up-challenge-driven-data/training_Set_labels.csv'
).sort_values(by=['id'], ascending=True)['status_group']

# Per-column overview: number of distinct values, number of missing values, dtype.
pd.DataFrame({
    'uniques': df_x.nunique(axis=0),
    'na': df_x.isna().sum(axis=0),
    'type': df_x.dtypes,
})

In [None]:
# District and region codes are stored as object dtype so the pipeline treats
# them as categorical rather than numeric.
df_x = df_x.astype({'district_code': object, 'region_code': object})

# Split the recording date into day / month / year columns.
recorded = pd.to_datetime(df_x['date_recorded'])
df_x = df_x.assign(day=recorded.dt.day,
                   month=recorded.dt.month,
                   year=recorded.dt.year)

# Columns removed before modelling (the raw date has been expanded above).
df_x = df_x.drop(columns=['date_recorded', 'id', 'wpt_name', 'installer', 'funder',
                          'subvillage', 'ward', 'scheme_name', 'recorded_by'])

# Per-column placeholder value that is converted to NaN below, so that the
# imputer can fill it in later.
missing_dict = {
    'gps_height': 0,
    'longitude': 0,
    'construction_year': 0,
    'latitude': -2.00E-08,
    'scheme_management': 'None',
    'management': 'unknown',
    'management_group': 'unknown',
    'payment': 'unknown',
    'payment_type': 'unknown',
    'water_quality': 'unknown',
    'quality_group': 'unknown',
    'quantity': 'unknown',
    'quantity_group': 'unknown',
    'source': 'unknown',
    'source_class': 'unknown',
}

for column, placeholder in missing_dict.items():
    df_x[column] = df_x[column].replace(placeholder, np.nan)

# Encode the string labels as integers.
label_encoder = LabelEncoder()
df_y = label_encoder.fit_transform(df_y)

# Hold out 10% of the rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.1, random_state=1)

In [None]:
def _search_range(kind, low, high):
    """Build one search-space entry in the dict format read by Tuner.objective."""
    return {'type': kind, 'min': low, 'max': high}


# Search space for the XGBoost hyper-parameters, then 50 tuning trials.
trial_params = {
    'subsample': _search_range('float', 0.3, 1.0),
    'colsample_bytree': _search_range('float', 0.3, 1.0),
    'alpha': _search_range('float', 1e-8, 1e-4),
    'reg_lambda': _search_range('float', 1e-4, 1.0),
    'min_child_weight': _search_range('float', 1e-4, 1.0),
    'gamma': _search_range('float', 1e-8, 1e-4),
    'learning_rate': _search_range('float', 0.01, 0.3),
    'max_depth': _search_range('int', 5, 20),
    'n_estimators': _search_range('int', 50, 100),
}
xgb_tuner = Tuner(XGBClassifier)
params = xgb_tuner.tune(x_train, y_train, 50, trial_params)
params

In [None]:
# Fixed hyper-parameters for the final model.  This overwrites the `params`
# returned by the tuning cell (presumably values copied from an earlier run).
params = dict(
    subsample=0.8801220003113425,
    colsample_bytree=0.6240943927284269,
    alpha=4.6031061008851576e-05,
    reg_lambda=0.908759702721212,
    min_child_weight=0.9183609506762782,
    gamma=8.65759106718364e-05,
    learning_rate=0.1349792253295306,
    max_depth=14,
    n_estimators=95,
)

# Full pipeline: impute -> one-hot encode -> XGBoost.
model = Model(XGBClassifier, params)
pipe = Pipeline(steps=[
    ('imputer', Imputer()),
    ('ohe', OHE()),
    ('model', model),
])
pipe.fit(x_train, y_train)

# Accuracy on the held-out split.
preds = pipe.predict(x_test)
np.mean(preds == y_test)