# Tabular Playground Series -- February 2022

## Import Training Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from xgboost import XGBClassifier

# Single seed reused for the train/test split and the XGBoost model.
seed = 3165

# Labelled training rows; `row_id` becomes the index so it is never used as a feature.
bacteria_data = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)

In [2]:
# Features: every column except the label.
X = bacteria_data.drop(columns='target')

# Largest absolute feature value over the whole labelled set; the scaling
# functions below divide/multiply by this module-level constant.
MAX = np.abs(X.to_numpy()).max()

# Encode the class labels as integers; `le` is kept so that predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(bacteria_data.target)

# Stratified hold-out split (default test size) for a final sanity check.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=seed,
)

## Custom Transform

In [3]:
def binned_scaler_transform(X, y=None, *args, **kwargs):
    """Scale ``X`` by dividing it by the module-level constant ``MAX``.

    ``y`` and any extra positional/keyword arguments are accepted only for
    signature compatibility and are ignored.
    """
    scaled = X / MAX
    return scaled

def binned_scaler_inv_transform(X, y=None, *args, **kwargs):
    """Undo ``binned_scaler_transform`` by multiplying ``X`` by ``MAX``.

    ``y`` and any extra positional/keyword arguments are accepted only for
    signature compatibility and are ignored.
    """
    restored = X * MAX
    return restored

## Find Best Model

In [4]:
# Settings shared by every XGBoost candidate in the search. The targets are
# already integer-encoded with `LabelEncoder`, hence `use_label_encoder=False`.
params = dict(
    use_label_encoder=False,
    random_state=seed,
    tree_method='gpu_hist',
)
xgb_model = XGBClassifier(**params)

# Stateless scaling step built from the two module-level helper functions.
scaling_step = FunctionTransformer(
    func=binned_scaler_transform,
    inverse_func=binned_scaler_inv_transform,
)

# Scale first, then classify.
pipe = Pipeline(steps=[
    ('scaler', scaling_step),
    ('model', xgb_model),
])

In [5]:
# Hyperparameter candidates for the 'model' pipeline step: 1 x 3 x 3 = 9 combinations.
# (subsample, gamma and reg_lambda were considered but are not part of this search.)
param_grid = dict(
    model__n_estimators=[5000],
    model__max_depth=[5, 6, 7],
    model__learning_rate=[0.1, 0.25, 0.5],
)

# Extra keyword arguments forwarded to the 'model' step's fit().
fit_params = dict(model__eval_metric='mlogloss')

# 5-fold cross-validated search scored on accuracy; the best candidate is
# refit on all of X_train so that `grid` can be used directly for scoring.
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 276.1min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler',
                                        FunctionTransformer(func=<function binned_scaler_transform at 0x7fdb33e49f80>,
                                                            inverse_func=<function binned_scaler_inv_transform at 0x7fdb33a230e0>)),
                                       ('model',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      gamma=No...
                                                      num_parallel_tree=None,
                                   

In [6]:
# Winning hyperparameter combination and its cross-validation score.
best_params = grid.best_params_
print(f'Best Params: {best_params}')
cv_score = grid.best_score_
print(f'Best Score: {cv_score}')

# Score of the refit best pipeline on the rows it was trained on ...
train_score = grid.score(X_train, y_train)
print(f'Train Score: {train_score}')

# ... and on the held-out split it has never seen.
test_score = grid.score(X_test, y_test)
print(f'Test Score: {test_score}')

Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 6, 'model__n_estimators': 5000}
Best Score: 0.9914466666666668
Train Score: 1.0
Test Score: 0.99426


## Train on Complete Dataset and Predict Test Values

In [7]:
# `grid.best_estimator_['model']` is the very object that lives inside the
# refit pipeline. Fitting it directly would retrain that pipeline step in place
# on unscaled features, leaving `grid` with a scaler that feeds a model trained
# on raw inputs -- re-running any `grid.score(...)` cell afterwards would then
# silently report different numbers. `clone` returns an unfitted copy with the
# same hyperparameters, so `grid` stays exactly as the search left it.
best_xgbm = clone(grid.best_estimator_['model'])

# Final model: trained on every labelled row (train + hold-out), using the raw
# feature values, which is also what the prediction cell passes in.
best_xgbm.fit(X, y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=5000, n_jobs=2,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=3165, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=None, subsample=1, tree_method='gpu_hist',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [8]:
# Unlabelled competition rows, indexed by `row_id` like the training data.
bacteria_test_data = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col='row_id',
)

# Predict integer class codes, then map them back to the original labels.
encoded_predictions = best_xgbm.predict(bacteria_test_data)
bacteria_test_data['target'] = le.inverse_transform(encoded_predictions)

# Submission file: the `row_id` index plus the predicted `target` column.
bacteria_test_data['target'].to_csv('./submission.csv')