# Setup

In [1]:
# Standard library
import os
import gc
import time
import warnings

# Turn the garbage collector on explicitly and silence all warnings.
gc.enable()
warnings.filterwarnings('ignore')

# Data handling
import pandas as pd
import numpy as np
pd.set_option('display.precision', 4)  # 4 decimal places in displayed frames


# Hyperparameter search and gradient boosting
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Cross-validation splitter and the evaluation metric
from sklearn.model_selection import KFold
from sklearn.metrics import median_absolute_error

# Global seed used throughout the notebook.
SEED = 23
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes - confirm intent.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [2]:
# Display the installed Optuna and LightGBM versions.
optuna.__version__, lgb.__version__

('3.4.0', '3.3.2')

In [3]:
# Fail fast if the library versions differ from the ones this notebook was
# written and run with. Remove this cell to run with other versions.
assert optuna.__version__ == '3.4.0', 'Change in Optuna version. Original notebook version: 3.4.0'
assert lgb.__version__ == '3.3.2', 'Change in LightGBM version. Original notebook version: 3.3.2'

# Data Preparation

In [4]:
# Competition data: training set, test set and the sample submission.
DATA_DIR = '/kaggle/input/playground-series-s3e25'
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{name}.csv')
    for name in ('train', 'test', 'sample_submission')
)

# Original dataset, shipped as separate train and validation files.
original_train = pd.read_csv(
    '/kaggle/input/prediction-of-mohs-hardness-with-machine-learning/jm79zfps6b-1/Mineral_Dataset_Supplementary_Info.csv'
)
original_val = pd.read_csv(
    '/kaggle/input/prediction-of-mohs-hardness-with-machine-learning/jm79zfps6b-1/Artificial_Crystals_Dataset.csv'
)

In [5]:
# Bring the original dataset into the same schema as the competition data.
# Every step rebinds the name instead of mutating in place and tolerates columns
# that were already removed, so re-running this cell does not raise.

# fixing column names
original_val = original_val.rename(columns={'Hardness (Mohs)': 'Hardness'})

# dropping irrelevant columns
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')
original_train = original_train.drop(columns='Unnamed: 0', errors='ignore')
original_val = original_val.drop(
    columns=['Unnamed: 0', 'Formula', 'Crystal structure'], errors='ignore')

# fixing column order (a missing required column still raises KeyError here)
column_order = list(train.columns)
original_train = original_train[column_order]
original_val = original_val[column_order]

# combining original train and validation datasets
original = pd.concat([original_train, original_val], axis=0, ignore_index=True)

In [6]:
# Column to predict.
TARGET = 'Hardness'
# Model inputs: every column of the test frame.
features = test.columns.tolist()

# Baseline

In [7]:
# competition metric
def comp_metric(y_true, y_pred):
    """Return the median absolute error between `y_true` and `y_pred`."""
    medae = median_absolute_error(y_true, y_pred)
    return medae

def lgbm_eval_metric(y_true, y_pred):
    """Median absolute error wrapped as a LightGBM `eval_metric` callable.

    Returns a 3-tuple: the metric name 'MedAE', its value, and False as the
    third element (passed to LightGBM as the higher-is-better flag).
    """
    metric_name = 'MedAE'
    higher_is_better = False
    metric_value = median_absolute_error(y_true, y_pred)
    return (metric_name, metric_value, higher_is_better)

In [8]:
%%time
# Baseline: LightGBM (GOSS boosting, quantile objective with alpha=0.5) with
# otherwise default hyperparameters, scored by 7-fold CV on the training data.
X, y = train[features], train[TARGET]
oof_preds = {}

cv = KFold(n_splits=7, shuffle=True, random_state=SEED)
for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
    X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
    X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

    model = LGBMRegressor(
        objective='quantile',
        alpha=0.5,
        boosting_type='goss',
        force_row_wise=True,
        device_type='cpu',
        random_state=SEED)
    # Early stopping and logging go through callbacks: the `early_stopping_rounds`
    # and `verbose` arguments of fit() are deprecated in LightGBM 3.3 and removed
    # in 4.x.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=lgbm_eval_metric,
        callbacks=[
            lgb.early_stopping(stopping_rounds=25, verbose=False),
            lgb.log_evaluation(period=0),
        ])

    val_preds = model.predict(X_val)
    # Keyed by positional row index so the folds can be merged afterwards.
    oof_preds.update(dict(zip(val_ids, val_preds)))

    score = comp_metric(y_val, val_preds)
    print(f'Fold #{fold}: {score:.4f}', end = ' | ')
    _ = gc.collect()

# Restore the original row order before scoring against the full target.
oof_preds = pd.Series(oof_preds).sort_index()
print(f'OOF score: {comp_metric(y, oof_preds):.4f}\n')

Fold #0: 0.5237 | Fold #1: 0.5089 | Fold #2: 0.5805 | Fold #3: 0.5213 | Fold #4: 0.5002 | Fold #5: 0.5187 | Fold #6: 0.5518 | OOF score: 0.5246

CPU times: user 10 s, sys: 3.96 s, total: 14 s
Wall time: 10.6 s


# Hyperparameter tuning

In [9]:
def objective(trial, feature_set, model, num_folds, seed, extended):
    """Optuna objective: out-of-fold MedAE of `model` for one sampled configuration.

    Parameters
    ----------
    trial : optuna Trial used to sample the hyperparameters.
    feature_set : list of column names of `train` used as model inputs.
    model : LightGBM estimator; its parameters are overwritten via `set_params`
        and it is refit on every fold (the passed object is mutated).
    num_folds : number of KFold splits.
    seed : random state of the shuffled KFold split.
    extended : if True, the `original` dataset is appended to every training
        fold (never to the validation fold).

    Returns
    -------
    float
        `comp_metric` of the out-of-fold predictions over all of `train`.

    Notes
    -----
    This function never calls `trial.report` / `trial.should_prune`, so no
    intermediate values are available to a pruner attached to the study.
    """
    X, y = train[feature_set], train[TARGET]

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, step=0.005),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.5),
        'num_leaves': trial.suggest_int('num_leaves', 20, 2000, step=5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 1000, step=2),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 10, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.65, 1.0, step=0.05),
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05)
    }
    # The sampled parameters are the same for every fold: apply them once.
    model.set_params(**param_grid)

    oof_preds = {}
    cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for train_idx, val_idx in cv.split(X, y):
        # KFold yields positional indices, so index both X and y with .iloc.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if extended: # original data added only to training folds
            X_train = pd.concat([X_train, original[feature_set]], axis=0, ignore_index=True)
            y_train = pd.concat([y_train, original[TARGET]], axis=0, ignore_index=True)

        # Callbacks replace the `early_stopping_rounds` / `verbose` fit arguments,
        # which are deprecated in LightGBM 3.3 and removed in 4.x.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric=lgbm_eval_metric,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0),
            ])

        val_preds = model.predict(X_val)
        oof_preds.update(dict(zip(val_idx, val_preds)))

    # Sort by position so the predictions line up with `y`.
    oof_preds = pd.Series(oof_preds).sort_index()
    return comp_metric(y, oof_preds)

In [10]:
def tune_params(feature_set, model, num_folds, seed, extended, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    The study uses a seeded multivariate/grouped TPE sampler and a Hyperband
    pruner. `feature_set`, `model`, `num_folds`, `seed` and `extended` are
    forwarded unchanged to `objective`; `n_trials` and `direction` configure
    the study itself.
    """
    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)
    hyperband = optuna.pruners.HyperbandPruner()
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=hyperband,
        direction=direction)

    def run_trial(trial):
        # Bind the fixed experiment settings; Optuna supplies only the trial.
        return objective(trial, feature_set, model, num_folds, seed, extended)

    study.optimize(func=run_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [11]:
def cross_validate_predict(feature_set, model, num_folds, seed, extended):
    """Cross-validate `model` on `train` and predict `test` with every fold model.

    Parameters
    ----------
    feature_set : list of column names used as model inputs.
    model : already-configured LightGBM estimator; refit on every fold.
    num_folds : number of KFold splits.
    seed : random state of the shuffled KFold split.
    extended : if True, the `original` dataset is appended to every training
        fold (never to the validation fold).

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold predictions, sorted by positional row index of `train`.
    test_preds : pd.DataFrame
        One column per fold ('fold0', 'fold1', ...) plus their row-wise 'mean'.
    """
    oof_preds = {}
    test_preds = {}

    X, y = train[feature_set], train[TARGET]
    X_test = test[feature_set]

    cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # KFold yields positional indices, so index both X and y with .iloc.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if extended: # original data added only to training folds
            X_train = pd.concat([X_train, original[feature_set]], axis=0, ignore_index=True)
            y_train = pd.concat([y_train, original[TARGET]], axis=0, ignore_index=True)

        # Callbacks replace the `early_stopping_rounds` / `verbose` fit arguments,
        # which are deprecated in LightGBM 3.3 and removed in 4.x.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric=lgbm_eval_metric,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0),
            ])

        val_preds = model.predict(X_val)
        test_preds[f'fold{fold}'] = model.predict(X_test)
        oof_preds.update(dict(zip(val_idx, val_preds)))

        score = comp_metric(y_val, val_preds)
        print(f'Fold #{fold}: {score:.5f} ({model.best_iteration_} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    # Averaged before the 'mean' column exists, so only fold columns contribute.
    test_preds['mean'] = test_preds.mean(axis=1)

    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nOOF score: {comp_metric(y, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [12]:
def run_experiment(feature_set, num_folds=5, seed=SEED, n_trials=200, extended=False):
    """Tune a LightGBM regressor, then cross-validate it and predict the test set.

    Parameters
    ----------
    feature_set : list of column names used as model inputs.
    num_folds : number of CV folds used for both tuning and final validation.
    seed : seed for the model, the KFold split and the Optuna sampler.
    n_trials : number of Optuna trials.
    extended : if True, the `original` dataset is added to the training folds.

    Returns
    -------
    (oof_preds, test_preds) as returned by `cross_validate_predict`.
    """
    base_params = {
        'objective': 'quantile',
        'alpha': 0.5,
        'n_estimators': 10000,
        'boosting_type': 'goss',
        'verbosity': -1,
        'force_row_wise': True,
        'device_type': 'cpu',
        # Use the caller's seed (not the global SEED) so the model is seeded
        # consistently with the CV split and the sampler.
        'random_state': seed
    }

    model = LGBMRegressor(**base_params)

    print('----------Hyperparameter tuning----------')
    start = time.time()
    study = tune_params(
        feature_set=feature_set,
        model=model,
        num_folds=num_folds,
        seed=seed,
        extended=extended,
        n_trials=n_trials,
        direction='minimize' #metric: MedAE -> lower is better
    )
    end = time.time()
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:15} - {v}')
    print(f'\n[Time taken: {end - start:.2f}s]\n')

    print('-----Cross-validation and prediction-----')
    start = time.time()
    # The model still holds the parameters of the last trial; reset to the best.
    model.set_params(**study.best_params)
    oof_preds, test_preds = cross_validate_predict(
        feature_set, model, num_folds, seed, extended
    )
    end = time.time()
    print(f'[Time taken: {end - start:.2f}s]\n')

    return oof_preds, test_preds

In [13]:
def create_submission_files(test_preds, config, notebook='01'):
    """Write one submission CSV per column of `test_preds`.

    Each file starts from a copy of `sample_sub`, has its TARGET column
    replaced by the column's predictions rounded to 4 decimals, and is saved
    as '<notebook>_<config>_<column>.csv' without the index.
    """
    for pred_name, pred_values in test_preds.items():
        submission = sample_sub.copy()
        submission[TARGET] = pred_values.round(4)
        out_path = f'{notebook}_{config}_{pred_name}.csv'
        submission.to_csv(out_path, index=False)

In [14]:
# Result containers, filled per experiment under its config name:
# op -> OOF preds, tp -> Test preds
op, tp = {}, {}

**Trial run**

In [15]:
# Set Optuna's log level to INFO.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [16]:
# Quick end-to-end check of the pipeline with only 5 tuning trials; both
# returned prediction objects are discarded.
_ , _ = run_experiment(feature_set=features, n_trials=5)

[I 2023-12-04 10:57:43,206] A new study created in memory with name: no-name-bc080965-18b8-413e-8748-9711c5efed8a


----------Hyperparameter tuning----------


[I 2023-12-04 10:57:44,246] Trial 0 finished with value: 0.6847227310647019 and parameters: {'learning_rate': 0.11, 'reg_alpha': 189.5, 'reg_lambda': 153.0, 'num_leaves': 580, 'max_depth': 4, 'min_child_samples': 686, 'min_split_gain': 1.67, 'colsample_bytree': 0.8, 'top_rate': 0.35, 'other_rate': 0.25}. Best is trial 0 with value: 0.6847227310647019.
[I 2023-12-04 10:57:46,975] Trial 1 finished with value: 0.8234785369424595 and parameters: {'learning_rate': 0.01, 'reg_alpha': 177.0, 'reg_lambda': 177.0, 'num_leaves': 615, 'max_depth': 7, 'min_child_samples': 980, 'min_split_gain': 8.45, 'colsample_bytree': 0.65, 'top_rate': 0.2, 'other_rate': 0.15000000000000002}. Best is trial 0 with value: 0.6847227310647019.
[I 2023-12-04 10:57:48,312] Trial 2 finished with value: 0.6749224516779684 and parameters: {'learning_rate': 0.17, 'reg_alpha': 125.5, 'reg_lambda': 22.0, 'num_leaves': 20, 'max_depth': 10, 'min_child_samples': 140, 'min_split_gain': 4.22, 'colsample_bytree': 0.75, 'top_rate'

Best trial: 2 -> Best value: 0.67492
Best hyperparameters:
learning_rate   - 0.17
reg_alpha       - 125.5
reg_lambda      - 22.0
num_leaves      - 20
max_depth       - 10
min_child_samples - 140
min_split_gain  - 4.22
colsample_bytree - 0.75
top_rate        - 0.45000000000000007
other_rate      - 0.25

[Time taken: 7.66s]

-----Cross-validation and prediction-----
Fold #0: 0.69312 (18 rounds)
Fold #1: 0.64975 (26 rounds)
Fold #2: 0.68083 (29 rounds)
Fold #3: 0.65261 (24 rounds)
Fold #4: 0.66341 (22 rounds)

OOF score: 0.67492

[Time taken: 1.62s]



In [17]:
# Lower Optuna's log level to ERROR.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# 7 - Folds

In [18]:
%%time
# Experiment: 7 folds, competition training data only.
num_folds = 7
config = f'trn_{num_folds}f'

# Tune, cross-validate and predict; keep both outputs under the config name.
op[config], tp[config] = run_experiment(
    features, num_folds=num_folds, seed=SEED, n_trials=200, extended=False
)

# One CSV per column of the test predictions.
create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 177 -> Best value: 0.52420
Best hyperparameters:
learning_rate   - 0.14500000000000002
reg_alpha       - 0.5
reg_lambda      - 46.0
num_leaves      - 235
max_depth       - 6
min_child_samples - 102
min_split_gain  - 0.05
colsample_bytree - 0.7000000000000001
top_rate        - 0.15000000000000002
other_rate      - 0.1

[Time taken: 726.99s]

-----Cross-validation and prediction-----
Fold #0: 0.51181 (89 rounds)
Fold #1: 0.50370 (237 rounds)
Fold #2: 0.55652 (130 rounds)
Fold #3: 0.53057 (124 rounds)
Fold #4: 0.50666 (84 rounds)
Fold #5: 0.51007 (202 rounds)
Fold #6: 0.55053 (36 rounds)

OOF score: 0.52420

[Time taken: 8.40s]

CPU times: user 16min 17s, sys: 3min 46s, total: 20min 4s
Wall time: 12min 15s


In [19]:
%%time
# Experiment: 7 folds, training folds extended with the original dataset.
num_folds = 7
config = f'ext_{num_folds}f'

# Tune, cross-validate and predict; keep both outputs under the config name.
op[config], tp[config] = run_experiment(
    features, num_folds=num_folds, seed=SEED, n_trials=200, extended=True
)

# One CSV per column of the test predictions.
create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 149 -> Best value: 0.52382
Best hyperparameters:
learning_rate   - 0.165
reg_alpha       - 5.5
reg_lambda      - 0.5
num_leaves      - 1180
max_depth       - 7
min_child_samples - 66
min_split_gain  - 0.2
colsample_bytree - 0.7000000000000001
top_rate        - 0.45000000000000007
other_rate      - 0.35000000000000003

[Time taken: 659.09s]

-----Cross-validation and prediction-----
Fold #0: 0.50652 (75 rounds)
Fold #1: 0.50217 (63 rounds)
Fold #2: 0.55645 (61 rounds)
Fold #3: 0.51367 (38 rounds)
Fold #4: 0.50031 (31 rounds)
Fold #5: 0.52795 (109 rounds)
Fold #6: 0.54502 (58 rounds)

OOF score: 0.52382

[Time taken: 5.71s]

CPU times: user 14min 14s, sys: 3min 31s, total: 17min 46s
Wall time: 11min 4s


# 10 - Folds

In [20]:
%%time
# Experiment: 10 folds, competition training data only.
num_folds = 10
config = f'trn_{num_folds}f'

# Tune, cross-validate and predict; keep both outputs under the config name.
op[config], tp[config] = run_experiment(
    features, num_folds=num_folds, seed=SEED, n_trials=200, extended=False
)

# One CSV per column of the test predictions.
create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 121 -> Best value: 0.51608
Best hyperparameters:
learning_rate   - 0.14500000000000002
reg_alpha       - 3.0
reg_lambda      - 7.5
num_leaves      - 1310
max_depth       - 9
min_child_samples - 126
min_split_gain  - 0.09
colsample_bytree - 0.8
top_rate        - 0.35
other_rate      - 0.3

[Time taken: 1142.32s]

-----Cross-validation and prediction-----
Fold #0: 0.48172 (178 rounds)
Fold #1: 0.50276 (173 rounds)
Fold #2: 0.48289 (117 rounds)
Fold #3: 0.54669 (37 rounds)
Fold #4: 0.51441 (196 rounds)
Fold #5: 0.55191 (146 rounds)
Fold #6: 0.46493 (143 rounds)
Fold #7: 0.51647 (64 rounds)
Fold #8: 0.53497 (29 rounds)
Fold #9: 0.52216 (116 rounds)

OOF score: 0.51608

[Time taken: 15.51s]

CPU times: user 25min 12s, sys: 6min 32s, total: 31min 45s
Wall time: 19min 18s


In [21]:
%%time
# Experiment: 10 folds, training folds extended with the original dataset.
num_folds = 10
config = f'ext_{num_folds}f'

# Tune, cross-validate and predict; keep both outputs under the config name.
op[config], tp[config] = run_experiment(
    features, num_folds=num_folds, seed=SEED, n_trials=200, extended=True
)

# One CSV per column of the test predictions.
create_submission_files(test_preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 168 -> Best value: 0.51952
Best hyperparameters:
learning_rate   - 0.125
reg_alpha       - 3.0
reg_lambda      - 14.5
num_leaves      - 1180
max_depth       - 8
min_child_samples - 24
min_split_gain  - 0.22
colsample_bytree - 0.8500000000000001
top_rate        - 0.45000000000000007
other_rate      - 0.15000000000000002

[Time taken: 1303.44s]

-----Cross-validation and prediction-----
Fold #0: 0.50900 (37 rounds)
Fold #1: 0.50687 (85 rounds)
Fold #2: 0.49673 (106 rounds)
Fold #3: 0.56337 (49 rounds)
Fold #4: 0.54099 (79 rounds)
Fold #5: 0.52730 (72 rounds)
Fold #6: 0.47261 (54 rounds)
Fold #7: 0.52604 (73 rounds)
Fold #8: 0.51382 (37 rounds)
Fold #9: 0.54236 (75 rounds)

OOF score: 0.51952

[Time taken: 14.92s]

CPU times: user 27min 59s, sys: 7min 34s, total: 35min 34s
Wall time: 21min 58s


In [22]:
# Peek at the first lines of the fold-mean submission from the extended 10-fold run.
!head 01_ext_10f_mean.csv

id,Hardness
10407,2.5303
10408,2.5949
10409,5.9703
10410,4.2679
10411,4.8864
10412,5.2637
10413,3.643
10414,5.6262
10415,3.1919


**Time to submit!**