# Tabular Playground Series -- February 2022

## Import Training Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from xgboost import XGBClassifier

# One seed shared by the train/test split and the XGBoost model so that
# reruns produce the same results.
seed = 3165

# Labelled training rows, indexed by the `row_id` column of the CSV.
bacteria_data = pd.read_csv(
    'data/train.csv',
    index_col='row_id',
)

  from pandas import MultiIndex, Int64Index


In [2]:
# Features are every column except the label.
X = bacteria_data.drop(columns='target')

# Largest absolute feature value over the whole training frame; the custom
# scaling helpers divide / multiply by this constant.
MAX = np.abs(X.to_numpy()).max()

# Encode the class labels as integers. `le` is kept so that predictions can
# be decoded back to the original label values later.
le = LabelEncoder()
y = le.fit_transform(bacteria_data['target'])

# Stratified split using absolute sizes: 1000 rows for training and 10 rows
# held out for the final check.
split_options = dict(train_size=1000, test_size=10, stratify=y, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Custom Transform

In [3]:
def binned_scaler_transform(X, y=None, *args, **kwargs):
    """Scale `X` by dividing it by the module-level constant `MAX`.

    `y` and any extra positional / keyword arguments are accepted for
    call-signature compatibility and are ignored.
    """
    scaled = X / MAX
    return scaled

def binned_scaler_inv_transform(X, y=None, *args, **kwargs):
    """Undo `binned_scaler_transform` by multiplying `X` by `MAX`.

    `y` and any extra positional / keyword arguments are accepted for
    call-signature compatibility and are ignored.
    """
    restored = X * MAX
    return restored

## Find Best Model

In [6]:
# Constructor arguments for the classifier.
# Currently disabled option kept for reference: 'tree_method': 'gpu_hist'
params = {
    'use_label_encoder': False,
    'random_state': seed,
}
xgb_model = XGBClassifier(**params)

# Step 1 rescales the features with the custom transform pair;
# step 2 is the XGBoost classifier.
scaler_step = FunctionTransformer(
    func=binned_scaler_transform,
    inverse_func=binned_scaler_inv_transform,
)
pipe = Pipeline([
    ('scaler', scaler_step),
    ('model', xgb_model),
])

In [7]:
# Candidate hyper-parameters; the 'model__' prefix addresses the pipeline's
# 'model' step. Every list holds a single value, so the search evaluates one
# configuration across the folds.
# Disabled candidates kept for reference:
#   'model__gamma': [0, 0.33, 0.66, 1]
#   'model__reg_lambda': [0, 2.5, 5]
param_grid = {
    'model__n_estimators': [5000],
    'model__max_depth': [6],
    'model__learning_rate': [0.1],
    'model__subsample': [0.5],
}

# Extra keyword arguments passed to `fit` (also reused for the final refit).
fit_params = {'model__eval_metric': 'mlogloss'}

# 5-fold cross-validated search scored by accuracy, using all CPU cores;
# refit=True retrains the best configuration on the data passed to `fit`.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KeyboardInterrupt: 

In [None]:
# Report the selected hyper-parameters and the accuracy figures. Each value
# is looked up immediately before it is printed, so the evaluation order is
# the same as printing the expressions directly.
best_params = grid.best_params_
print(f'Best Params: {best_params}')

cv_accuracy = grid.best_score_
print(f'Best Score: {cv_accuracy}')

train_accuracy = grid.score(X_train, y_train)
print(f'Train Score: {train_accuracy}')

holdout_accuracy = grid.score(X_test, y_test)
print(f'Test Score: {holdout_accuracy}')

## Train on Complete Dataset and Predict Test Values

In [None]:
# Refit the winning configuration on every labelled row before predicting.
#
# `clone` returns an unfitted copy of the pipeline carrying the same
# hyper-parameters. Fitting the copy leaves `grid.best_estimator_` untouched,
# so `grid.score(...)` / `grid.predict(...)` keep using the model selected on
# `X_train`. Fitting `grid.best_estimator_` directly would overwrite it in
# place, and re-running the scoring cell afterwards would report a test score
# from a model that has already seen the held-out rows.
best_xgbm = clone(grid.best_estimator_)
best_xgbm.fit(X, y, **fit_params)

In [None]:
# Load the unlabelled rows, indexed by `row_id` like the training data.
bacteria_test_data = pd.read_csv(
    'data/test.csv',
    index_col='row_id',
)

# Predict encoded classes, then decode them back to the original labels.
predicted_codes = best_xgbm.predict(bacteria_test_data)
bacteria_test_data['target'] = le.inverse_transform(predicted_codes)

# Write only the index and the predicted label column.
submission_column = bacteria_test_data['target']
submission_column.to_csv('./submission.csv')