# Setup

In [1]:
import gc
import time
import warnings
import subprocess

# Make sure automatic garbage collection is on; later cells also call gc.collect() after each fold.
gc.enable()
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported below.
warnings.filterwarnings('ignore')

import pandas as pd
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

from sklearn.metrics import median_absolute_error
from sklearn.model_selection import KFold
import xgboost as xgb
import optuna

# Single seed reused for the KFold splits, the XGBoost models and the Optuna sampler.
SEED = 55

In [2]:
# Stop immediately when the installed XGBoost is not the exact release this notebook was run with.
assert xgb.__version__ == '2.0.1', 'XGBoost version differs from original notebook.' 

In [3]:
# Pick the XGBoost device: 'cuda' when the `nvidia-smi` command runs successfully,
# 'cpu' when it is missing or fails for any reason.
DEVICE = 'cpu'
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    pass
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cuda


# Data preparation

In [4]:
# Competition files: training data, test data and the submission template.
DATA_DIR = '/kaggle/input/playground-series-s3e25'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# External dataset (referred to as "original" in this notebook); it ships as two CSV files.
ORIGINAL_DIR = '/kaggle/input/prediction-of-mohs-hardness-with-machine-learning/jm79zfps6b-1'
original_train = pd.read_csv(f'{ORIGINAL_DIR}/Mineral_Dataset_Supplementary_Info.csv')
original_val = pd.read_csv(f'{ORIGINAL_DIR}/Artificial_Crystals_Dataset.csv')

In [5]:
# Fix column names: give the validation file's target the same name as in the other frames.
original_val = original_val.rename(columns={'Hardness (Mohs)': 'Hardness'})

# Drop columns that are not used for modelling (row ids, saved index, descriptive text).
train = train.drop(columns='id')
test = test.drop(columns='id')
original_train = original_train.drop(columns='Unnamed: 0')
original_val = original_val.drop(columns=['Unnamed: 0', 'Formula', 'Crystal structure'])

# Put the original frames into the same column order as the competition train set.
column_order = train.columns.tolist()
original_train = original_train.loc[:, column_order]
original_val = original_val.loc[:, column_order]

# Stack both original frames into a single table with a fresh 0..n-1 index.
original = pd.concat([original_train, original_val], ignore_index=True)

In [6]:
# Name of the column to predict.
TARGET = 'Hardness'
# Model inputs: every column of the test frame as it stands when this cell runs.
features = list(test.columns)

# Baseline

In [7]:
# Competition metric
def comp_metric(y_true, y_pred):
    """Return the median absolute error between `y_true` and `y_pred`.

    Delegates to sklearn's `median_absolute_error`; lower values are better.
    Used both as the XGBoost `eval_metric` and for fold / out-of-fold scoring.
    """
    medae = median_absolute_error(y_true, y_pred)
    return medae

In [8]:
# Baseline: 7-fold CV of an untuned XGBoost regressor trained on the competition data only.
X, y = train[features], train[TARGET]
# Maps positional row index -> out-of-fold prediction; filled fold by fold.
oof_preds = {}

cv = KFold(n_splits=7, shuffle=True, random_state=SEED)
for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
    # KFold yields positional indices, hence .iloc on both X and y.
    X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
    X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]
    
    # A fresh model per fold; up to 1000 trees, stopped early after 100 rounds
    # without improvement of comp_metric on the eval set.
    model = xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.1,
        base_score=0,
        objective='reg:absoluteerror',
        eval_metric=comp_metric,
        early_stopping_rounds=100,
        booster='gbtree',
        tree_method='hist',
        device=DEVICE,
        verbosity=0,
        n_jobs=4,
        random_state=SEED)
    
    # The validation fold is used both for early stopping and for scoring below.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False)
        
    val_preds = model.predict(X_val)
    oof_preds.update(dict(zip(val_ids, val_preds)))
    
    score = comp_metric(y_val, val_preds)
    # All fold scores are printed on one line, separated by ' | '.
    print(f'Fold #{fold}: {score:.4f}', end = ' | ')        
    _ = gc.collect()
    
# Sort by row position so the predictions line up with y for the overall score.
oof_preds = pd.Series(oof_preds).sort_index()
print(f'OOF score: {comp_metric(y, oof_preds):.4f}\n')

Fold #0: 0.5530 | Fold #1: 0.5240 | Fold #2: 0.5246 | Fold #3: 0.5139 | Fold #4: 0.4672 | Fold #5: 0.5920 | Fold #6: 0.5972 | OOF score: 0.5373



# Hyperparameter tuning

In [9]:
def objective(trial, feature_set, model, num_folds, seed, extended):
    """Optuna objective: out-of-fold MedAE of `model` for one sampled configuration.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the six hyperparameters listed in `param_grid`.
    feature_set : list of str
        Columns of the global `train` frame used as model inputs.
    model : estimator with set_params / fit / predict
        Reconfigured in place with the sampled values and refit on every fold.
    num_folds : int
        Number of KFold splits.
    seed : int
        `random_state` of the shuffled KFold.
    extended : bool
        When True, all rows of the global `original` frame are appended to each
        training fold; validation folds always contain competition rows only.

    Returns
    -------
    float
        `comp_metric` of the out-of-fold predictions against the full target.
    """
    oof_preds = {}
    X, y = train[feature_set], train[TARGET]

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 15),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1), #complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.05), #L1-reg
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True), #L2-reg
    }

    # The sampled values are identical for every fold, so configure the model once.
    model.set_params(**param_grid)

    cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for train_idx, val_idx in cv.split(X, y):
        # KFold yields positional indices: use .iloc for X as well as y so the
        # split stays correct even if `train` does not carry a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if extended: # original data added only to training folds
            X_train = pd.concat([X_train, original[feature_set]], axis=0, ignore_index=True)
            y_train = pd.concat([y_train, original[TARGET]], axis=0, ignore_index=True)

        # The validation fold doubles as the eval set handed to fit().
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        val_preds = model.predict(X_val)
        oof_preds.update(dict(zip(val_idx, val_preds)))

    # Sort by row position so the predictions line up with y.
    oof_preds = pd.Series(oof_preds).sort_index()
    return comp_metric(y, oof_preds)

In [10]:
def tune_params(feature_set, model, num_folds, seed, extended, n_trials, direction):
    """Run an in-memory Optuna study over `objective` and return the finished study.

    `feature_set`, `model`, `num_folds`, `seed` and `extended` are forwarded
    unchanged to `objective`; `seed` additionally seeds the TPE sampler.
    `n_trials` is the number of trials to run and `direction` is passed to
    `optuna.create_study` ('minimize' or 'maximize').
    """
    def _evaluate(trial):
        # Bind the experiment settings so Optuna only has to supply the trial.
        return objective(trial, feature_set, model, num_folds, seed, extended)

    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed,
    )
    # NOTE(review): a pruner only acts when the objective reports intermediate
    # values; the objective in this notebook does not appear to, so pruning
    # presumably never triggers - confirm.
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=optuna.pruners.HyperbandPruner(),
        direction=direction,
    )
    study.optimize(func=_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [11]:
def cross_validate_predict(feature_set, model, num_folds, seed, extended):
    """Fit `model` on each KFold split and collect out-of-fold and test predictions.

    Parameters
    ----------
    feature_set : list of str
        Columns of the global `train` / `test` frames used as model inputs.
    model : estimator with fit / predict and a `best_iteration` attribute
        Refit in place on every fold with its current parameters.
    num_folds : int
        Number of KFold splits.
    seed : int
        `random_state` of the shuffled KFold.
    extended : bool
        When True, all rows of the global `original` frame are appended to each
        training fold; validation folds always contain competition rows only.

    Returns
    -------
    oof_preds : pd.Series
        One out-of-fold prediction per row of `train`, keyed by row position.
    test_preds : pd.DataFrame
        Columns 'fold0', 'fold1', ... with each fold model's test predictions,
        plus a 'mean' column holding their row-wise average.
    """
    oof_preds = {}
    test_preds = {}

    X, y = train[feature_set], train[TARGET]
    X_test = test[feature_set]

    cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # KFold yields positional indices: use .iloc for X as well as y so the
        # split stays correct even if `train` does not carry a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if extended: # original data added only to training folds
            X_train = pd.concat([X_train, original[feature_set]], axis=0, ignore_index=True)
            y_train = pd.concat([y_train, original[TARGET]], axis=0, ignore_index=True)

        # The validation fold doubles as the eval set handed to fit().
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        val_preds = model.predict(X_val)
        test_preds[f'fold{fold}'] = model.predict(X_test)
        oof_preds.update(dict(zip(val_idx, val_preds)))

        score = comp_metric(y_val, val_preds)
        print(f'Fold #{fold}: {score:.5f} ({model.best_iteration} rounds)')
        _ = gc.collect()

    # Average the per-fold test predictions row by row.
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)

    # Sort by row position so the predictions line up with y.
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nOOF score: {comp_metric(y, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [12]:
def run_experiment(feature_set, num_folds=5, seed=SEED, n_trials=100, extended=False):
    """Tune an XGBoost regressor with Optuna, then cross-validate the best setup.

    Parameters
    ----------
    feature_set : list of str
        Columns used as model inputs.
    num_folds : int
        Number of KFold splits used for both tuning and the final CV.
    seed : int
        Seed handed to the model, the KFold splitter and the Optuna sampler.
    n_trials : int
        Number of Optuna trials.
    extended : bool
        Forwarded to tuning and CV; when True the original dataset is added to
        the training folds.

    Returns
    -------
    (oof_preds, test_preds) exactly as returned by `cross_validate_predict`.
    """
    # Settings that are not tuned. The objective is the 0.5 quantile (median);
    # 'reg:absoluteerror' was the previously used alternative.
    model = xgb.XGBRegressor(
        booster='gbtree',
        tree_method='hist',
        base_score=0,
        objective='reg:quantileerror',
        quantile_alpha=0.5,
        n_estimators=5000,
        eval_metric=comp_metric,
        early_stopping_rounds=50,
        device=DEVICE,
        verbosity=0,
        n_jobs=-1,
        seed=seed,
    )

    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        feature_set=feature_set,
        model=model,
        num_folds=num_folds,
        seed=seed,
        extended=extended,
        n_trials=n_trials,
        direction='minimize',  # metric: MedAE -> lower is better
    )
    tuning_seconds = time.time() - tuning_started

    best_params = study.best_params
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in best_params.items():
        print(f'{name:15} - {value}')
    print(f'\n[Time taken: {tuning_seconds:.2f}s]\n')

    print('-----Cross-validation and prediction-----')
    cv_started = time.time()
    # Re-apply the winning configuration; the shared model still holds the
    # parameters of whichever trial ran last.
    model.set_params(**best_params)
    oof_preds, test_preds = cross_validate_predict(
        feature_set, model, num_folds, seed, extended
    )
    cv_seconds = time.time() - cv_started
    print(f'[Time taken: {cv_seconds:.2f}s]\n')

    return oof_preds, test_preds

In [13]:
def create_submission_files(test_preds, config, notebook='02'):
    """Write one submission CSV per column of `test_preds`.

    Each file is a copy of the global `sample_sub` template whose TARGET column
    is replaced by that column's predictions rounded to 4 decimals. Files are
    named '<notebook>_<config>_<column>.csv' and written to the working directory.
    """
    for col in test_preds.columns:
        rounded = test_preds[col].round(4)
        out_path = f'{notebook}_{config}_{col}.csv'
        sample_sub.assign(**{TARGET: rounded}).to_csv(out_path, index=False)

In [14]:
op = {} # out-of-fold predictions, keyed by experiment config name
tp = {} # test predictions (one column per fold plus 'mean'), keyed by experiment config name

**Trial run:**

In [15]:
# Show Optuna's per-trial log lines during the short trial run below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [16]:
# Smoke test of the whole pipeline: 5 trials with the default 5 folds; both results are discarded.
_ , _ = run_experiment(feature_set=features, n_trials=5)

[I 2023-11-16 11:09:51,238] A new study created in memory with name: no-name-0cff868c-a376-48d7-af7b-fff825b192e9


----------Hyperparameter tuning----------


[I 2023-11-16 11:10:00,264] Trial 0 finished with value: 0.6261320590972899 and parameters: {'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 8, 'gamma': 4.800000000000001, 'alpha': 2.6500000000000004, 'lambda': 0.25836603306052985}. Best is trial 0 with value: 0.6261320590972899.
[I 2023-11-16 11:10:02,121] Trial 1 finished with value: 0.6737160682678223 and parameters: {'learning_rate': 0.09, 'max_depth': 3, 'min_child_weight': 3, 'gamma': 15.4, 'alpha': 0.25, 'lambda': 225.5637136810315}. Best is trial 0 with value: 0.6261320590972899.
[I 2023-11-16 11:10:08,743] Trial 2 finished with value: 0.7018554210662842 and parameters: {'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 13, 'gamma': 18.0, 'alpha': 4.95, 'lambda': 4.7801281415139325}. Best is trial 0 with value: 0.6261320590972899.
[I 2023-11-16 11:10:11,548] Trial 3 finished with value: 0.6601581573486328 and parameters: {'learning_rate': 0.04, 'max_depth': 11, 'min_child_weight': 7, 'gamma': 12.9, 'alpha':

Best trial: 0 -> Best value: 0.62613
Best hyperparameters:
learning_rate   - 0.01
max_depth       - 12
min_child_weight - 8
gamma           - 4.800000000000001
alpha           - 2.6500000000000004
lambda          - 0.25836603306052985

[Time taken: 23.86s]

-----Cross-validation and prediction-----
Fold #0: 0.67571 (297 rounds)
Fold #1: 0.60840 (626 rounds)
Fold #2: 0.54929 (624 rounds)
Fold #3: 0.58701 (571 rounds)
Fold #4: 0.68517 (497 rounds)

OOF score: 0.62613

[Time taken: 9.50s]



In [17]:
# Only let Optuna log errors from here on, so the 200-trial runs do not flood the output.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# 7-folds

In [18]:
%%time
# 7-fold experiment on the competition train set only (extended=False).
num_folds = 7
# Key under which results are stored in op/tp; also part of the submission file names.
config = f'trn_{num_folds}f'

op[config], tp[config] = run_experiment(
    feature_set=features,
    num_folds=num_folds,
    seed=SEED,
    n_trials=200,
    extended=False)

# Writes one CSV per fold column plus one for the fold mean.
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 102 -> Best value: 0.51891
Best hyperparameters:
learning_rate   - 0.09999999999999999
max_depth       - 9
min_child_weight - 14
gamma           - 0.2
alpha           - 0.55
lambda          - 133.49333146667357

[Time taken: 1062.01s]

-----Cross-validation and prediction-----
Fold #0: 0.53085 (196 rounds)
Fold #1: 0.52577 (93 rounds)
Fold #2: 0.52332 (56 rounds)
Fold #3: 0.46749 (47 rounds)
Fold #4: 0.46208 (42 rounds)
Fold #5: 0.57301 (83 rounds)
Fold #6: 0.52911 (69 rounds)

OOF score: 0.51891

[Time taken: 5.51s]

CPU times: user 21min 9s, sys: 19.6 s, total: 21min 28s
Wall time: 17min 47s


In [19]:
%%time
# 7-fold experiment with the original dataset added to every training fold (extended=True).
num_folds = 7
# Key under which results are stored in op/tp; also part of the submission file names.
config = f'ext_{num_folds}f'

op[config], tp[config] = run_experiment(
    feature_set=features,
    num_folds=num_folds,
    seed=SEED,
    n_trials=200,
    extended=True)

# Writes one CSV per fold column plus one for the fold mean.
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 117 -> Best value: 0.51588
Best hyperparameters:
learning_rate   - 0.060000000000000005
max_depth       - 12
min_child_weight - 8
gamma           - 0.2
alpha           - 0.55
lambda          - 202.72509410770408

[Time taken: 1102.56s]

-----Cross-validation and prediction-----
Fold #0: 0.53333 (156 rounds)
Fold #1: 0.50894 (102 rounds)
Fold #2: 0.52237 (171 rounds)
Fold #3: 0.49212 (97 rounds)
Fold #4: 0.44460 (68 rounds)
Fold #5: 0.54600 (255 rounds)
Fold #6: 0.52363 (208 rounds)

OOF score: 0.51588

[Time taken: 8.71s]

CPU times: user 21min 56s, sys: 19 s, total: 22min 15s
Wall time: 18min 31s


# 10-folds

In [20]:
%%time
# 10-fold experiment on the competition train set only (extended=False).
num_folds = 10
# Key under which results are stored in op/tp; also part of the submission file names.
config = f'trn_{num_folds}f'

op[config], tp[config] = run_experiment(
    feature_set=features,
    num_folds=num_folds,
    seed=SEED,
    n_trials=200,
    extended=False)

# Writes one CSV per fold column plus one for the fold mean.
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 185 -> Best value: 0.51546
Best hyperparameters:
learning_rate   - 0.08
max_depth       - 9
min_child_weight - 2
gamma           - 0.2
alpha           - 2.2
lambda          - 147.4271301019588

[Time taken: 1906.78s]

-----Cross-validation and prediction-----
Fold #0: 0.54178 (60 rounds)
Fold #1: 0.51799 (130 rounds)
Fold #2: 0.52737 (53 rounds)
Fold #3: 0.52150 (87 rounds)
Fold #4: 0.48446 (53 rounds)
Fold #5: 0.44689 (54 rounds)
Fold #6: 0.43643 (48 rounds)
Fold #7: 0.55066 (81 rounds)
Fold #8: 0.51854 (72 rounds)
Fold #9: 0.54841 (113 rounds)

OOF score: 0.51546

[Time taken: 7.72s]

CPU times: user 36min 53s, sys: 38.1 s, total: 37min 31s
Wall time: 31min 54s


In [21]:
%%time
# 10-fold experiment with the original dataset added to every training fold (extended=True).
num_folds = 10
# Key under which results are stored in op/tp; also part of the submission file names.
config = f'ext_{num_folds}f'

op[config], tp[config] = run_experiment(
    feature_set=features,
    num_folds=num_folds,
    seed=SEED,
    n_trials=200,
    extended=True)

# Writes one CSV per fold column plus one for the fold mean.
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 55 -> Best value: 0.51743
Best hyperparameters:
learning_rate   - 0.09999999999999999
max_depth       - 8
min_child_weight - 15
gamma           - 0.0
alpha           - 1.4000000000000001
lambda          - 10.534123248212456

[Time taken: 1481.70s]

-----Cross-validation and prediction-----
Fold #0: 0.55269 (51 rounds)
Fold #1: 0.52987 (57 rounds)
Fold #2: 0.51818 (250 rounds)
Fold #3: 0.49700 (43 rounds)
Fold #4: 0.48842 (43 rounds)
Fold #5: 0.47504 (96 rounds)
Fold #6: 0.44574 (41 rounds)
Fold #7: 0.54882 (163 rounds)
Fold #8: 0.55131 (102 rounds)
Fold #9: 0.54670 (66 rounds)

OOF score: 0.51743

[Time taken: 7.95s]

CPU times: user 29min 37s, sys: 28.3 s, total: 30min 6s
Wall time: 24min 49s


# 20-folds

In [22]:
%%time
# 20-fold experiment on the competition train set only (extended=False).
num_folds = 20
# Key under which results are stored in op/tp; also part of the submission file names.
config = f'trn_{num_folds}f'

op[config], tp[config] = run_experiment(
    feature_set=features,
    num_folds=num_folds,
    seed=SEED,
    n_trials=200,
    extended=False)

# Writes one CSV per fold column plus one for the fold mean.
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 185 -> Best value: 0.51752
Best hyperparameters:
learning_rate   - 0.09
max_depth       - 11
min_child_weight - 9
gamma           - 0.2
alpha           - 1.1500000000000001
lambda          - 79.75257250800276

[Time taken: 2741.63s]

-----Cross-validation and prediction-----
Fold #0: 0.52203 (119 rounds)
Fold #1: 0.52665 (76 rounds)
Fold #2: 0.57429 (38 rounds)
Fold #3: 0.49775 (66 rounds)
Fold #4: 0.48106 (61 rounds)
Fold #5: 0.57461 (49 rounds)
Fold #6: 0.49460 (57 rounds)
Fold #7: 0.50088 (39 rounds)
Fold #8: 0.48474 (50 rounds)
Fold #9: 0.49653 (65 rounds)
Fold #10: 0.46657 (44 rounds)
Fold #11: 0.49247 (71 rounds)
Fold #12: 0.44685 (46 rounds)
Fold #13: 0.43469 (47 rounds)
Fold #14: 0.51547 (70 rounds)
Fold #15: 0.58086 (85 rounds)
Fold #16: 0.53296 (156 rounds)
Fold #17: 0.53891 (80 rounds)
Fold #18: 0.51983 (117 rounds)
Fold #19: 0.57706 (78 rounds)

OOF score: 0.51752

[Time taken: 17.17s]

CPU times: user 55min 9s, sys: 56.

In [23]:
%%time
# 20-fold experiment with the original dataset added to every training fold (extended=True).
num_folds = 20
# Key under which results are stored in op/tp; also part of the submission file names.
config = f'ext_{num_folds}f'

op[config], tp[config] = run_experiment(
    feature_set=features,
    num_folds=num_folds,
    seed=SEED,
    n_trials=200,
    extended=True)

# Writes one CSV per fold column plus one for the fold mean.
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 16 -> Best value: 0.51561
Best hyperparameters:
learning_rate   - 0.09
max_depth       - 7
min_child_weight - 9
gamma           - 0.2
alpha           - 1.1
lambda          - 24.723667786983253

[Time taken: 2866.56s]

-----Cross-validation and prediction-----
Fold #0: 0.52019 (188 rounds)
Fold #1: 0.56674 (106 rounds)
Fold #2: 0.53570 (100 rounds)
Fold #3: 0.50895 (165 rounds)
Fold #4: 0.48090 (65 rounds)
Fold #5: 0.59914 (62 rounds)
Fold #6: 0.49782 (55 rounds)
Fold #7: 0.49330 (72 rounds)
Fold #8: 0.50575 (43 rounds)
Fold #9: 0.47402 (175 rounds)
Fold #10: 0.45444 (67 rounds)
Fold #11: 0.47456 (223 rounds)
Fold #12: 0.45530 (41 rounds)
Fold #13: 0.43123 (47 rounds)
Fold #14: 0.50368 (104 rounds)
Fold #15: 0.60525 (117 rounds)
Fold #16: 0.51470 (100 rounds)
Fold #17: 0.54612 (146 rounds)
Fold #18: 0.55273 (99 rounds)
Fold #19: 0.58075 (278 rounds)

OOF score: 0.51561

[Time taken: 16.34s]

CPU times: user 57min 40s, sys: 53.7 s, to

In [24]:
!head 02_ext_20f_mean.csv

id,Hardness
10407,2.5861
10408,2.5505
10409,5.9508
10410,4.2916
10411,5.0037
10412,4.773
10413,3.547
10414,5.6948
10415,2.9139


**Time to submit!**