# Tabular Playground Series -- February 2022

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from xgboost import XGBClassifier

# Single seed reused for the train/test split and for the XGBoost model.
seed = 3165
# Load the training table, using the `row_id` column as the index.
bacteria_data = pd.read_csv('data/train.csv', index_col='row_id')

  from pandas import MultiIndex, Int64Index


In [2]:
# Features are every column except the label.
X = bacteria_data.drop(columns='target')

# Largest absolute feature value; the scaling helpers divide/multiply by it.
# NOTE(review): this is computed over all rows, i.e. before the train/test
# split below, so it also reflects the held-out rows.
MAX = np.abs(X.to_numpy()).max()

# Encode the class labels as integers; `le` is kept around so predictions
# can be mapped back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(bacteria_data['target'])

# Stratified hold-out split, reproducible through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=seed
)

In [3]:
def binned_scaler_transform(X, y=None, *args, **kwargs):
    """Scale ``X`` by dividing it by the module-level constant ``MAX``.

    Parameters
    ----------
    X : object supporting ``X / MAX``
        Data to scale.
    y, *args, **kwargs
        Accepted but ignored.

    Returns
    -------
    The result of ``X / MAX``.
    """
    scaled = X / MAX
    return scaled

def binned_scaler_inv_transform(X, y=None, *args, **kwargs):
    """Undo ``binned_scaler_transform`` by multiplying ``X`` by ``MAX``.

    Parameters
    ----------
    X : object supporting ``X * MAX``
        Scaled data to map back.
    y, *args, **kwargs
        Accepted but ignored.

    Returns
    -------
    The result of ``X * MAX``.
    """
    restored = X * MAX
    return restored

In [4]:
# Fixed (non-tuned) XGBoost settings shared by every candidate in the search.
params = {
    'use_label_encoder': False,
    'random_state': seed,
    'tree_method': 'gpu_hist',
}
xgb_model = XGBClassifier(**params)

# Two-step pipeline: divide the features by MAX, then fit the classifier.
# The step names ('scaler', 'model') are what the `model__...` grid keys
# refer to.
pipe = Pipeline(
    steps=[
        (
            'scaler',
            FunctionTransformer(
                func=binned_scaler_transform,
                inverse_func=binned_scaler_inv_transform,
            ),
        ),
        ('model', xgb_model),
    ]
)

In [5]:
# Search space: 3 * 3 * 2 * 2 * 2 * 2 = 144 candidates. The `model__` prefix
# routes each entry to the pipeline's 'model' step.
param_grid = {
    'model__n_estimators': [500, 1000, 5000],
    'model__max_depth': [5, 6, 7],
    'model__learning_rate': [0.1, 0.5],
    'model__subsample': [0.5, 1],
    'model__gamma': [0, 0.25],
    'model__reg_lambda': [0, 5],
}

# 5-fold cross-validated search scored by accuracy; refit=True retrains the
# best candidate on all of X_train once the search finishes.
# NOTE(review): n_jobs=-1 runs fits in parallel while the model is configured
# with tree_method='gpu_hist' — confirm the GPU can serve concurrent fits.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


KeyboardInterrupt: 

In [None]:
# Summarise the grid search: winning parameter combination and its mean
# cross-validated score, followed by the score of the refit best estimator
# on the training split and on the held-out split.
print(f'Best Params: {grid.best_params_}')
print(f'Best Score: {grid.best_score_}')
print(f'Train Score: {grid.score(X_train, y_train)}')
print(f'Test Score: {grid.score(X_test, y_test)}')

In [None]:
# Refit the winning XGBoost configuration on every labelled row before
# predicting the competition test set.
#
# clone() builds an unfitted copy carrying the same hyperparameters, so the
# fitted model stored inside `grid.best_estimator_` is left untouched.
# Fitting that object directly would silently change what grid.score() and
# grid.predict() return if the scoring cell is run again afterwards.
#
# As before, only the 'model' step is refit and it is trained on the
# unscaled X; the pipeline's 'scaler' step is not applied here.
best_xgbm = clone(grid.best_estimator_['model'])
best_xgbm.fit(X, y)

In [None]:
# Load the competition test set, indexed by `row_id` like the training data.
bacteria_test_data = pd.read_csv('data/test.csv', index_col='row_id')

# Predict encoded classes, then map them back to the original labels with the
# encoder fitted on the training targets.
bacteria_test_data['target'] = le.inverse_transform(best_xgbm.predict(bacteria_test_data))

# Write only the predicted column (the row_id index is written alongside it).
# Plain string literal: the previous f-string had no placeholders.
bacteria_test_data['target'].to_csv('./submission.csv')