# Tabular Playground Series -- February 2022

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier, plot_importance

seed = 3165
bacteria_data = pd.read_csv('data/train.csv', index_col='row_id')

  from pandas import MultiIndex, Int64Index


In [2]:
bacteria_data.head()

Unnamed: 0_level_0,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,A0T0G9C1,...,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,-1e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Streptococcus_pyogenes
1,-9.536743e-07,-1e-05,-4.3e-05,0.000886,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,-1e-05,...,-8.6e-05,-4.3e-05,0.000914,0.000914,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Salmonella_enterica
2,-9.536743e-07,-2e-06,7e-06,0.000129,0.000268,0.00027,0.000243,0.000125,1e-06,-7e-06,...,8.4e-05,4.8e-05,8.1e-05,0.000106,7.2e-05,1e-05,8e-06,1.9e-05,1.046326e-06,Salmonella_enterica
3,4.632568e-08,-6e-06,1.2e-05,0.000245,0.000492,0.000522,0.000396,0.000197,-3e-06,-7e-06,...,0.000151,0.0001,0.00018,0.000202,0.000153,2.1e-05,1.5e-05,4.6e-05,-9.536743e-07,Salmonella_enterica
4,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,-1e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Enterococcus_hirae


In [3]:
bacteria_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Columns: 287 entries, A0T0G0C10 to target
dtypes: float64(286), object(1)
memory usage: 439.5+ MB


In [4]:
le = LabelEncoder()
X = bacteria_data.drop(columns='target')
y = le.fit_transform(bacteria_data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

In [5]:
xgb_model = XGBClassifier(use_label_encoder=False, random_state=seed)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', xgb_model)
])

In [6]:
param_grid = {
    'model__max_depth': [2, 3, 4, 5, 6, 7],
    'model__n_estimators': [10, 100, 1000],
    'model__min_split_loss': [0],
    'model__min_child_weight': [1],
    'model__lambda': [1],
    'model__alpha': [0]
}
grid = GridSearchCV(pipe, param_grid=param_grid,
                    scoring='accuracy', n_jobs=-1,
                    cv=5, verbose=2, refit=True)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                        

In [7]:
print(f'Best Params: {grid.best_params_}')
print(f'Best Score: {grid.best_score_}')
print(f'Train Score: {grid.score(X_train, y_train)}')
print(f'Test Score: {grid.score(X_test, y_test)}')

Best Params: {'model__max_depth': 2, 'model__n_estimators': 10}
Best Score: 0.6941533333333333
Test Score: 0.6998
[CV] END .........model__max_depth=2, model__n_estimators=10; total time= 3.4min
[CV] END .........model__max_depth=2, model__n_estimators=10; total time= 3.4min
[CV] END .........model__max_depth=2, model__n_estimators=10; total time= 3.4min
[CV] END .........model__max_depth=2, model__n_estimators=10; total time= 3.4min
[CV] END .........model__max_depth=2, model__n_estimators=10; total time=  47.4s
