# Imports and Data overview

In [1]:
train_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/machinehack/ode_to_code_2022/data/raw/train.csv'
test_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/machinehack/ode_to_code_2022/data/raw/test.csv'
sub_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/machinehack/ode_to_code_2022/data/raw/sample_submission.csv'

In [2]:
!pip install --upgrade --quiet optuna

In [3]:
!pip install --quiet xgboost==1.5

In [4]:
import time 

import numpy as np
from scipy.stats import mode
import pandas as pd
pd.set_option("precision", 4)

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif 
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold

import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBClassifier

SEED = 2311

In [5]:
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

In [6]:
train.head()

Unnamed: 0,edible-poisonous,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,p,5.75,x,n,t,s,w,6.13,14.93,w,f,f,d,a
1,p,2.27,o,g,f,f,f,0.0,0.0,f,f,f,d,s
2,p,5.59,c,y,t,x,p,5.47,11.27,w,f,f,d,a
3,p,0.74,x,p,f,a,n,4.33,0.98,k,f,f,d,a
4,p,9.6,x,n,f,a,w,10.84,28.52,w,f,f,d,u


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42748 entries, 0 to 42747
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   edible-poisonous      42748 non-null  object 
 1   cap-diameter          42748 non-null  float64
 2   cap-shape             42748 non-null  object 
 3   cap-color             42748 non-null  object 
 4   does-bruise-or-bleed  42748 non-null  object 
 5   gill-attachment       35808 non-null  object 
 6   gill-color            42748 non-null  object 
 7   stem-height           42748 non-null  float64
 8   stem-width            42748 non-null  float64
 9   stem-color            42748 non-null  object 
 10  has-ring              42748 non-null  object 
 11  ring-type             41029 non-null  object 
 12  habitat               42748 non-null  object 
 13  season                42748 non-null  object 
dtypes: float64(3), object(11)
memory usage: 4.6+ MB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18321 entries, 0 to 18320
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   edible-poisonous      18321 non-null  object 
 1   cap-diameter          18321 non-null  float64
 2   cap-shape             18321 non-null  object 
 3   cap-color             18321 non-null  object 
 4   does-bruise-or-bleed  18321 non-null  object 
 5   gill-attachment       15377 non-null  object 
 6   gill-color            18321 non-null  object 
 7   stem-height           18321 non-null  float64
 8   stem-width            18321 non-null  float64
 9   stem-color            18321 non-null  object 
 10  has-ring              18321 non-null  object 
 11  ring-type             17569 non-null  object 
 12  habitat               18321 non-null  object 
 13  season                0 non-null      float64
dtypes: float64(4), object(10)
memory usage: 2.0+ MB


In [9]:
test.drop(['season'], axis=1, inplace=True)

# Missing values

In [10]:
train.isna().sum()

edible-poisonous           0
cap-diameter               0
cap-shape                  0
cap-color                  0
does-bruise-or-bleed       0
gill-attachment         6940
gill-color                 0
stem-height                0
stem-width                 0
stem-color                 0
has-ring                   0
ring-type               1719
habitat                    0
season                     0
dtype: int64

In [11]:
test.isna().sum()

edible-poisonous           0
cap-diameter               0
cap-shape                  0
cap-color                  0
does-bruise-or-bleed       0
gill-attachment         2944
gill-color                 0
stem-height                0
stem-width                 0
stem-color                 0
has-ring                   0
ring-type                752
habitat                    0
dtype: int64

In [12]:
train['gill-attachment'].unique(), test['gill-attachment'].unique()

(array(['s', 'f', 'x', 'a', 'e', nan, 'd', 'p'], dtype=object),
 array(['x', 'a', nan, 'd', 'e', 'p', 's', 'f'], dtype=object))

In [13]:
train['ring-type'].unique(), test['ring-type'].unique()

(array(['f', 'p', nan, 'l', 'z', 'g', 'm', 'e', 'r'], dtype=object),
 array([nan, 'l', 'f', 'm', 'e', 'z', 'r', 'p', 'g'], dtype=object))

According to the data description, 'f' corresponds to 'none' for both these features with missing values.  

Thus, we will fill missing values with 'f'.

In [14]:
train.fillna('f', axis=1, inplace=True)
test.fillna('f', axis=1, inplace=True)

# EDA and Feature Engineering

In [15]:
train['season'].value_counts()

a    21103
u    16055
w     3698
s     1892
Name: season, dtype: int64

In [16]:
features = [f for f in train.columns if f not in ('season', 'fold')]
num_features = ['cap-diameter', 'stem-height', 'stem-width']
cat_features = [f for f in features if f not in num_features]
cat_mask = [f in cat_features for f in features]

In [17]:
train[cat_features].describe()

Unnamed: 0,edible-poisonous,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-color,has-ring,ring-type,habitat
count,42748,42748,42748,42748,42748,42748,42748,42748,42748,42748
unique,2,7,12,2,7,12,13,2,8,8
top,p,x,n,f,f,w,w,f,f,d
freq,23707,18884,16940,35335,9373,12930,16014,32138,35583,30934


In [18]:
train[num_features].describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,42748.0,42748.0,42748.0
mean,6.712,6.5804,12.134
std,5.2038,3.3684,10.0589
min,0.38,0.0,0.0
25%,3.48,4.64,5.19
50%,5.84,5.95,10.14
75%,8.54,7.74,16.53
max,62.34,33.92,103.91


In [19]:
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

In [20]:
target_labels = LabelEncoder()
train['season'] = target_labels.fit_transform(train['season'])

# Hyperparameter tuning

In [21]:
base_params = {
    'objective': 'multi:softmax',
    'booster': 'gbtree',
    'use_label_encoder': False,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'enable_categorical': True,
    'n_estimators': 5000,
    'eval_metric': 'mlogloss',
    'random_state': SEED,
    'verbosity': 0
}

In [22]:
def objective(trial, xtrain, ytrain, xval, yval, base_params):
  param_grid = {
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
    'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
    'max_depth': trial.suggest_int('max_depth', 3, 12)
  }

  model = XGBClassifier(**base_params, **param_grid)
    
  model.fit(
    xtrain, ytrain,
    eval_set=[(xval, yval)],
    early_stopping_rounds=100,
    callbacks=[
      XGBoostPruningCallback(trial, 'validation_0-mlogloss')
    ],
    verbose=100
  )
    
  val_predictions = model.predict(xval)
    
  return accuracy_score(yval, val_predictions)

In [23]:
xtrain, xval, ytrain, yval = train_test_split(train[features], train['season'],
                                              test_size=0.2,
                                              stratify=train['season'],
                                              shuffle=True,
                                              random_state=SEED)

In [24]:
study = optuna.create_study(direction='minimize', 
                            sampler=TPESampler(), 
                            study_name='MH_ode_to_code')

[32m[I 2022-01-16 10:44:05,260][0m A new study created in memory with name: MH_ode_to_code[0m


In [25]:
%%time
study.optimize(
    lambda trial: objective(trial, xtrain, ytrain, xval, yval, base_params),
    n_trials=50
)

[0]	validation_0-mlogloss:1.33675
[100]	validation_0-mlogloss:0.86464
[200]	validation_0-mlogloss:0.81330
[300]	validation_0-mlogloss:0.79058
[400]	validation_0-mlogloss:0.77745
[500]	validation_0-mlogloss:0.77128
[600]	validation_0-mlogloss:0.76786
[700]	validation_0-mlogloss:0.76606
[800]	validation_0-mlogloss:0.76556
[900]	validation_0-mlogloss:0.76487
[1000]	validation_0-mlogloss:0.76464
[1100]	validation_0-mlogloss:0.76543
[1106]	validation_0-mlogloss:0.76547


[32m[I 2022-01-16 10:44:10,306][0m Trial 0 finished with value: 0.5214035087719299 and parameters: {'learning_rate': 0.11437703735509057, 'subsample': 0.6677718916854287, 'reg_lambda': 2.72715881041597e-05, 'reg_alpha': 2.891775740328728e-06, 'max_depth': 3}. Best is trial 0 with value: 0.5214035087719299.[0m


[0]	validation_0-mlogloss:1.27501
[100]	validation_0-mlogloss:0.77262
[183]	validation_0-mlogloss:0.79619


[32m[I 2022-01-16 10:44:13,435][0m Trial 1 finished with value: 0.5132163742690058 and parameters: {'learning_rate': 0.20901162959275282, 'subsample': 0.649101046910588, 'reg_lambda': 1.3454201242261047e-07, 'reg_alpha': 0.00033639349391350226, 'max_depth': 7}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.37665
[100]	validation_0-mlogloss:0.99473
[200]	validation_0-mlogloss:0.91006
[300]	validation_0-mlogloss:0.87165
[400]	validation_0-mlogloss:0.84685
[500]	validation_0-mlogloss:0.82944
[600]	validation_0-mlogloss:0.81581
[700]	validation_0-mlogloss:0.80493
[800]	validation_0-mlogloss:0.79640
[900]	validation_0-mlogloss:0.78974
[1000]	validation_0-mlogloss:0.78412
[1100]	validation_0-mlogloss:0.77957
[1200]	validation_0-mlogloss:0.77570
[1300]	validation_0-mlogloss:0.77280
[1400]	validation_0-mlogloss:0.77047
[1500]	validation_0-mlogloss:0.76853
[1600]	validation_0-mlogloss:0.76709
[1700]	validation_0-mlogloss:0.76561
[1800]	validation_0-mlogloss:0.76441
[1900]	validation_0-mlogloss:0.76350
[2000]	validation_0-mlogloss:0.76289
[2100]	validation_0-mlogloss:0.76245
[2200]	validation_0-mlogloss:0.76212
[2300]	validation_0-mlogloss:0.76150
[2400]	validation_0-mlogloss:0.76142
[2500]	validation_0-mlogloss:0.76116
[2585]	validation_0-mlogloss:0.76120


[32m[I 2022-01-16 10:44:26,867][0m Trial 2 finished with value: 0.5221052631578947 and parameters: {'learning_rate': 0.020346933910704273, 'subsample': 0.5548052754702514, 'reg_lambda': 0.09330878990288788, 'reg_alpha': 4.51821252031945e-08, 'max_depth': 4}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.28050
[100]	validation_0-mlogloss:0.76720
[200]	validation_0-mlogloss:0.76284
[262]	validation_0-mlogloss:0.76940


[32m[I 2022-01-16 10:44:28,820][0m Trial 3 finished with value: 0.5218713450292398 and parameters: {'learning_rate': 0.22674287907352655, 'subsample': 0.8871915266122464, 'reg_lambda': 1.4963588358232864e-07, 'reg_alpha': 0.0008466790752709683, 'max_depth': 5}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.25401
[100]	validation_0-mlogloss:0.76927
[171]	validation_0-mlogloss:0.79309


[32m[I 2022-01-16 10:44:31,679][0m Trial 4 finished with value: 0.5149707602339181 and parameters: {'learning_rate': 0.25491903645315, 'subsample': 0.903277082264751, 'reg_lambda': 4.8178480600398536e-08, 'reg_alpha': 1.4515753587714553e-08, 'max_depth': 7}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.35188


[32m[I 2022-01-16 10:44:31,747][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.25683
[100]	validation_0-mlogloss:0.77680
[153]	validation_0-mlogloss:0.80331


[32m[I 2022-01-16 10:44:35,111][0m Trial 6 finished with value: 0.5134502923976608 and parameters: {'learning_rate': 0.23813109140059327, 'subsample': 0.8587397498403289, 'reg_lambda': 0.08512901966737677, 'reg_alpha': 1.6498278474800564e-07, 'max_depth': 8}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.26502


[32m[I 2022-01-16 10:44:38,395][0m Trial 7 pruned. Trial was pruned at iteration 61.[0m
[32m[I 2022-01-16 10:44:38,541][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-01-16 10:44:38,600][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.22564


[32m[I 2022-01-16 10:44:40,509][0m Trial 10 pruned. Trial was pruned at iteration 57.[0m
[32m[I 2022-01-16 10:44:40,607][0m Trial 11 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.24787
[100]	validation_0-mlogloss:0.77831
[153]	validation_0-mlogloss:0.80401


[32m[I 2022-01-16 10:44:44,154][0m Trial 12 finished with value: 0.5182456140350877 and parameters: {'learning_rate': 0.2555026074679013, 'subsample': 0.8545623117968921, 'reg_lambda': 0.9601293582145107, 'reg_alpha': 0.0015856764769632138, 'max_depth': 8}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.21635


[32m[I 2022-01-16 10:44:45,948][0m Trial 13 pruned. Trial was pruned at iteration 47.[0m
[32m[I 2022-01-16 10:44:46,023][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.25998
[100]	validation_0-mlogloss:0.77303
[165]	validation_0-mlogloss:0.79505


[32m[I 2022-01-16 10:44:49,521][0m Trial 15 finished with value: 0.5177777777777778 and parameters: {'learning_rate': 0.22887544778614738, 'subsample': 0.987183707025461, 'reg_lambda': 0.0030962455195377332, 'reg_alpha': 0.00038182740997929357, 'max_depth': 8}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.28769


[32m[I 2022-01-16 10:44:49,597][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.31583


[32m[I 2022-01-16 10:44:49,701][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.26058


[32m[I 2022-01-16 10:44:49,897][0m Trial 18 pruned. Trial was pruned at iteration 12.[0m


[0]	validation_0-mlogloss:1.25545


[32m[I 2022-01-16 10:44:51,659][0m Trial 19 pruned. Trial was pruned at iteration 48.[0m
[32m[I 2022-01-16 10:44:51,736][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.25187
[100]	validation_0-mlogloss:0.77060
[172]	validation_0-mlogloss:0.79498


[32m[I 2022-01-16 10:44:54,631][0m Trial 21 finished with value: 0.515906432748538 and parameters: {'learning_rate': 0.2593861664160838, 'subsample': 0.9336852058654719, 'reg_lambda': 1.7367666435416482e-08, 'reg_alpha': 6.619480033859005e-08, 'max_depth': 7}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.25215
[100]	validation_0-mlogloss:0.78470
[155]	validation_0-mlogloss:0.81457


[32m[I 2022-01-16 10:44:58,523][0m Trial 22 finished with value: 0.5203508771929825 and parameters: {'learning_rate': 0.2458575207227933, 'subsample': 0.8293594093968368, 'reg_lambda': 9.184415447420651e-08, 'reg_alpha': 1.265055850101846e-08, 'max_depth': 8}. Best is trial 1 with value: 0.5132163742690058.[0m
[32m[I 2022-01-16 10:44:58,593][0m Trial 23 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.27933


[32m[I 2022-01-16 10:44:58,671][0m Trial 24 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.27662


[32m[I 2022-01-16 10:44:58,747][0m Trial 25 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.23792
[100]	validation_0-mlogloss:0.78943
[147]	validation_0-mlogloss:0.81665


[32m[I 2022-01-16 10:45:02,372][0m Trial 26 finished with value: 0.5133333333333333 and parameters: {'learning_rate': 0.27533743887856654, 'subsample': 0.8622192622959457, 'reg_lambda': 4.0086956943776287e-07, 'reg_alpha': 4.355154007079121e-06, 'max_depth': 8}. Best is trial 1 with value: 0.5132163742690058.[0m


[0]	validation_0-mlogloss:1.22461


[32m[I 2022-01-16 10:45:04,057][0m Trial 27 pruned. Trial was pruned at iteration 38.[0m


[0]	validation_0-mlogloss:1.28544


[32m[I 2022-01-16 10:45:04,151][0m Trial 28 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-01-16 10:45:04,244][0m Trial 29 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-01-16 10:45:04,314][0m Trial 30 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.24296


[32m[I 2022-01-16 10:45:05,663][0m Trial 31 pruned. Trial was pruned at iteration 60.[0m


[0]	validation_0-mlogloss:1.27582


[32m[I 2022-01-16 10:45:05,746][0m Trial 32 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.28644


[32m[I 2022-01-16 10:45:05,819][0m Trial 33 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.25255


[32m[I 2022-01-16 10:45:05,928][0m Trial 34 pruned. Trial was pruned at iteration 3.[0m


[0]	validation_0-mlogloss:1.22209


[32m[I 2022-01-16 10:45:07,031][0m Trial 35 pruned. Trial was pruned at iteration 43.[0m
[32m[I 2022-01-16 10:45:07,112][0m Trial 36 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.37143


[32m[I 2022-01-16 10:45:07,196][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.25611


[32m[I 2022-01-16 10:45:09,093][0m Trial 38 pruned. Trial was pruned at iteration 33.[0m
[32m[I 2022-01-16 10:45:09,165][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.22970


[32m[I 2022-01-16 10:45:10,286][0m Trial 40 pruned. Trial was pruned at iteration 33.[0m


[0]	validation_0-mlogloss:1.25642


[32m[I 2022-01-16 10:45:10,404][0m Trial 41 pruned. Trial was pruned at iteration 2.[0m


[0]	validation_0-mlogloss:1.24849


[32m[I 2022-01-16 10:45:10,827][0m Trial 42 pruned. Trial was pruned at iteration 19.[0m


[0]	validation_0-mlogloss:1.26127


[32m[I 2022-01-16 10:45:10,909][0m Trial 43 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.25710


[32m[I 2022-01-16 10:45:11,004][0m Trial 44 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.33828


[32m[I 2022-01-16 10:45:11,093][0m Trial 45 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-01-16 10:45:11,183][0m Trial 46 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.29261


[32m[I 2022-01-16 10:45:11,255][0m Trial 47 pruned. Trial was pruned at iteration 0.[0m


[0]	validation_0-mlogloss:1.23048


[32m[I 2022-01-16 10:45:12,071][0m Trial 48 pruned. Trial was pruned at iteration 46.[0m


[0]	validation_0-mlogloss:1.27269


[32m[I 2022-01-16 10:45:12,148][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m


CPU times: user 1min 7s, sys: 1.35 s, total: 1min 8s
Wall time: 1min 6s


In [26]:
print(f'Best value (Accuracy): {study.best_value:.5f}')

best_params = study.best_params
print('Best params:')
for key, value in best_params.items():
    print(f'\t{key}: {value}')

Best value (Accuracy): 0.51322
Best params:
	learning_rate: 0.20901162959275282
	subsample: 0.649101046910588
	reg_lambda: 1.3454201242261047e-07
	reg_alpha: 0.00033639349391350226
	max_depth: 7


# Cross-validation + Inference

In [27]:
N_SPLITS = 5
train['fold'] = -1

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (_, val_idx) in enumerate(skf.split(X=train, y=train['season'])):
  train.loc[val_idx, 'fold'] = fold

In [38]:
def custom_cross_val_predict(train, test, features, model):
  oof_preds = {}
  test_preds = []
  scores = []
    
  cv_start = time.time()
    
  for fold in range(N_SPLITS):
    print('-' * 40)
        
    xtrain = train[train.fold != fold].reset_index(drop=True)
    ytrain = xtrain.season

    xval = train[train.fold == fold].reset_index(drop=True)
    yval = xval.season
    val_idx = xval.index.tolist()
    
    fold_start = time.time()
        
    model.fit(
      xtrain[features], ytrain,
      eval_set=[(xval[features], yval)],
      early_stopping_rounds=100,
      verbose=100
    )
    
    val_preds = model.predict(xval[features])
    oof_preds.update(dict(zip(val_idx, val_preds)))
    acc = accuracy_score(yval, val_preds)
    scores.append(acc)
        
    fold_end = time.time()
        
    print(f'Fold #{fold}: Accuracy = {acc:.5f} \
    [Time: {fold_end - fold_start:.2f}s]')
        
    test_preds.append(model.predict(test[features]))
        
    cv_end = time.time()
  
  print(f'Average accuracy = {np.mean(scores):.5f} \
  with std. dev. = {np.std(scores):.5f}')
  
  print(f'[Total time: {cv_end - cv_start:.2f}s]')
    
  oof_preds = pd.DataFrame.from_dict(oof_preds, orient='index').reset_index()
  test_preds = mode(np.column_stack(test_preds), axis=1).mode
    
  return oof_preds, test_preds

In [39]:
model = XGBClassifier(**base_params, **best_params)

In [40]:
oof_preds, test_preds = custom_cross_val_predict(train, test, features, model)

----------------------------------------
[0]	validation_0-mlogloss:1.27623
[100]	validation_0-mlogloss:0.77336
[173]	validation_0-mlogloss:0.79483
Fold #0: Accuracy = 0.51719     [Time: 3.07s]
----------------------------------------
[0]	validation_0-mlogloss:1.27657
[100]	validation_0-mlogloss:0.77231
[174]	validation_0-mlogloss:0.79569
Fold #1: Accuracy = 0.51240     [Time: 2.94s]
----------------------------------------
[0]	validation_0-mlogloss:1.27611
[100]	validation_0-mlogloss:0.77073
[181]	validation_0-mlogloss:0.79287
Fold #2: Accuracy = 0.51357     [Time: 3.05s]
----------------------------------------
[0]	validation_0-mlogloss:1.27627
[100]	validation_0-mlogloss:0.77229
[169]	validation_0-mlogloss:0.79290
Fold #3: Accuracy = 0.51796     [Time: 2.85s]
----------------------------------------
[0]	validation_0-mlogloss:1.27483
[100]	validation_0-mlogloss:0.77258
[174]	validation_0-mlogloss:0.79662
Fold #4: Accuracy = 0.51374     [Time: 2.93s]
Average accuracy = 0.51497   with s

In [43]:
sub1 = pd.read_csv(sub_url)
sub1['season'] = target_labels.inverse_transform(test_preds.ravel())
sub1.to_csv('sub1.csv', index=False)

!head sub1.csv

season
s
u
u
u
u
u
a
u
a
