# Setup

In [1]:
%%capture
!pip install --upgrade optuna_integration

In [2]:
import gc
gc.enable()  # make sure automatic garbage collection is switched on

import warnings
# Silence every warning from here on; installed before the heavier imports below.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)  # do not truncate columns when displaying frames

# Optuna's LightGBM integration module; used below for Dataset, early_stopping
# and LightGBMTunerCV.
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.base import clone
from lightgbm import LGBMClassifier

# Single seed shared by the LightGBM params, the Optuna tuner and the CV splitter.
SEED = 2024

In [3]:
DATA_DIR = '/kaggle/input/ml-olympiad-predicting-earthquake-damage'

# Read the three competition files in one go: labelled training rows,
# unlabelled test rows and the submission template.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/train.csv'),
    pd.read_csv(f'{DATA_DIR}/test.csv'),
    pd.read_csv(f'{DATA_DIR}/sample_submission.csv'),
)

# Data overview

In [4]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 37 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   building_id                             4000 non-null   int64 
 1   count_floors_pre_eq                     4000 non-null   int64 
 2   age                                     4000 non-null   int64 
 3   area_percentage                         4000 non-null   int64 
 4   height_percentage                       4000 non-null   int64 
 5   land_surface_condition                  4000 non-null   object
 6   foundation_type                         4000 non-null   object
 7   roof_type                               4000 non-null   object
 8   ground_floor_type                       4000 non-null   object
 9   other_floor_type                        4000 non-null   object
 10  position                                4000 non-null   object
 11  plan

In [5]:
# (rows, columns) of the training, test and sample-submission frames.
train.shape, test.shape, sample_sub.shape

((4000, 37), (1000, 36), (1000, 2))

In [6]:
# Total number of missing cells in train and in test.
train.isna().sum().sum(), test.isna().sum().sum()

(0, 0)

In [7]:
TARGET = 'damage_grade'  # name of the label column in `train`
# Relative frequency of each target class.
train[TARGET].value_counts(normalize=True)

damage_grade
2    0.49200
3    0.32575
1    0.18225
Name: proportion, dtype: float64

In [8]:
# Shift the 1-based grades to 0-based class labels (reversed in postprocessing).
# The guard keeps this cell idempotent: re-running it must not shift the
# labels a second time.
if train[TARGET].min() == 1:
    train[TARGET] = train[TARGET] - 1
train[TARGET].value_counts(normalize=True)

damage_grade
1    0.49200
2    0.32575
0    0.18225
Name: proportion, dtype: float64

In [9]:
# Number of distinct values per column (cardinality check).
train.nunique()

building_id                               4000
count_floors_pre_eq                          8
age                                        100
area_percentage                             39
height_percentage                           15
land_surface_condition                       3
foundation_type                              5
roof_type                                    3
ground_floor_type                            5
other_floor_type                             4
position                                     4
plan_configuration                          10
has_superstructure_adobe_mud                 2
has_superstructure_mud_mortar_stone          2
has_superstructure_stone_flag                2
has_superstructure_cement_mortar_stone       2
has_superstructure_mud_mortar_brick          2
has_superstructure_cement_mortar_brick       2
has_superstructure_timber                    2
has_superstructure_bamboo                    2
has_superstructure_rc_non_engineered         2
has_superstru

# Data preparation

In [10]:
# building_id is unique per row, so it is an identifier rather than a feature.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns='building_id', errors='ignore')
test = test.drop(columns='building_id', errors='ignore')

In [11]:
# One-hot encode the object (string) columns of both frames.
train = pd.get_dummies(train, dtype='int8')
test = pd.get_dummies(test, dtype='int8')

# The two frames are encoded independently, so a category level that occurs in
# only one of them would produce mismatched columns and break `train[...]`
# lookups by test column names. Align test to train's layout (minus the
# target): levels missing from test become all-zero columns, and levels never
# seen in training are dropped.
test = test.reindex(columns=train.columns.drop(TARGET), fill_value=0)

In [12]:
# All model inputs are the columns of the (encoded) test frame.
features = test.columns.tolist()
num_features = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]
# Every feature not listed as numeric is handled as categorical.
cat_features = list(filter(lambda name: name not in num_features, features))

In [13]:
# LightGBM dataset
dtrain = lgb.Dataset(
    data=train[features],
    label=train[TARGET],
    feature_name=features,
    categorical_feature=cat_features)

# Hyperparameter tuning

In [14]:
# Fixed LightGBM settings; the tuner searches the remaining hyperparameters on
# top of these.
base_params = dict(
    # task definition
    objective='multiclass_ova',
    num_class=3,
    is_unbalance=True,
    metric='multi_error',
    # boosting setup
    learning_rate=0.005,
    boosting_type='gbdt',
    force_row_wise=True,
    # runtime / reproducibility
    verbosity=-1,
    n_jobs=-1,
    deterministic=True,
    random_state=SEED,
)

In [15]:
# Stop a training run once the first metric has not improved by at least 1e-4
# for 100 consecutive rounds; stay silent about it.
early_stopping = lgb.early_stopping(
    100, first_metric_only=True, verbose=False, min_delta=1e-4)

In [16]:
BUDGET = 90 * 60  # tuning time budget in seconds (1.5 hours), kept as an int
NUM_FOLDS = 7

# Stepwise hyperparameter search with stratified, shuffled K-fold CV.
# Feature names and categorical columns are already attached to `dtrain`, so
# they are not repeated here (repeating them is redundant, and newer LightGBM
# releases deprecate passing them to cv()).
tuner = lgb.LightGBMTunerCV(
    params=base_params,
    train_set=dtrain,
    num_boost_round=5000,  # upper bound; early stopping ends runs sooner
    nfold=NUM_FOLDS,
    stratified=True,
    shuffle=True,
    seed=SEED,
    callbacks=[early_stopping],
    time_budget=BUDGET,
    optuna_seed=SEED)

[I 2024-04-20 04:06:57,021] A new study created in memory with name: no-name-3f4e57bf-806c-465a-8f1e-82fc4cac59a2


In [17]:
%%time
tuner.run()

feature_fraction, val_score: 0.449498:  14%|#4        | 1/7 [00:40<04:02, 40.46s/it][I 2024-04-20 04:07:37,500] Trial 0 finished with value: 0.4494984000237941 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.4494984000237941.
feature_fraction, val_score: 0.449498:  29%|##8       | 2/7 [01:13<02:59, 35.89s/it][I 2024-04-20 04:08:10,186] Trial 1 finished with value: 0.4519919660024739 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.4494984000237941.
feature_fraction, val_score: 0.449498:  43%|####2     | 3/7 [01:49<02:24, 36.10s/it][I 2024-04-20 04:08:46,550] Trial 2 finished with value: 0.4504930271129921 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.4494984000237941.
feature_fraction, val_score: 0.449498:  57%|#####7    | 4/7 [02:24<01:47, 35.67s/it][I 2024-04-20 04:09:21,564] Trial 3 finished with value: 0.44974377636374124 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 

CPU times: user 44min 4s, sys: 55min 40s, total: 1h 39min 44s
Wall time: 1h 30min 3s





In [18]:
# Report the best CV score found by the tuner and the matching parameter set,
# one "name - value" line per parameter.
print(f'Best score: {tuner.best_score:.5f}')
print(f'Best hyperparameters:')
for line in (f'{k:20} - {v}' for k, v in tuner.best_params.items()):
    print(line)

Best score: 0.44275
Best hyperparameters:
objective            - multiclass_ova
num_class            - 3
is_unbalance         - True
metric               - multi_error
learning_rate        - 0.005
boosting_type        - gbdt
force_row_wise       - True
verbosity            - -1
n_jobs               - -1
deterministic        - True
random_state         - 2024
feature_pre_filter   - False
lambda_l1            - 0.0
lambda_l2            - 0.0
num_leaves           - 107
feature_fraction     - 0.5
bagging_fraction     - 0.5736183015264869
bagging_freq         - 3
min_child_samples    - 20


# Cross-validation

In [19]:
def comp_metric(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

def custom_cv(estimator, seed=SEED, verbose=True):
    """Cross-validate `estimator` on `train` and predict `test` with every fold model.

    A fresh clone of `estimator` is fitted per fold of a stratified, shuffled
    K-fold split (NUM_FOLDS folds), using the held-out fold as the evaluation
    set for the shared `early_stopping` callback.

    Parameters
    ----------
    estimator
        Unfitted classifier; it must accept `eval_set` and `callbacks` in
        `fit` and expose `best_iteration_` after fitting.
    seed
        `random_state` for the fold splitter.
    verbose
        When True, print the score and number of rounds of each fold. The
        average/OOF summary is printed regardless.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold class predictions, keyed by row position in `train`.
    test_preds : pd.DataFrame
        One column of test predictions per fold (`fold0`, `fold1`, ...) plus
        a `mode` column holding the row-wise most frequent prediction.
    """
    X_full, y_full = train[features], train[TARGET]
    X_test = test[features]

    scores = []
    oof_preds = {}
    test_preds = {}

    splitter = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(train, y_full)):
        X_val, y_val = X_full.iloc[holdout_rows], y_full.iloc[holdout_rows]

        # NOTE(review): unlike the tuning Dataset, no categorical_feature is
        # passed to fit here - confirm that this is intended.
        model = clone(estimator)
        model.fit(
            X_full.iloc[fit_rows], y_full.iloc[fit_rows],
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping])

        val_preds = model.predict(X_val)
        # remember each held-out row's prediction under its row position
        oof_preds.update(zip(holdout_rows, val_preds))
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')
        gc.collect()

    test_preds = pd.DataFrame(test_preds)
    # mode of fold-wise predictions (first mode is taken when there is a tie)
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')

    # put the out-of-fold predictions back into the row order of `train`
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')

    return oof_preds, test_preds

In [20]:
%%time
# Tuned parameters with a generous cap on the number of trees; the early
# stopping callback used inside custom_cv decides the effective rounds per fold.
model = LGBMClassifier(n_estimators=5000, **tuner.best_params)
# op: out-of-fold predictions, tp: per-fold test predictions (+ their mode)
op, tp = custom_cv(estimator=model)

Fold # 0: 0.56220 ( 173 rounds)
Fold # 1: 0.53801 ( 157 rounds)
Fold # 2: 0.54660 ( 210 rounds)
Fold # 3: 0.50799 ( 142 rounds)
Fold # 4: 0.45906 ( 106 rounds)
Fold # 5: 0.54353 ( 142 rounds)
Fold # 6: 0.57510 ( 195 rounds)

Avg score: 0.53321 +/- 0.03593
OOF score: 0.53977

CPU times: user 9.68 s, sys: 0 ns, total: 9.68 s
Wall time: 9.68 s


In [21]:
def create_submission_files(preds, notebook='00'):
    """Write one submission CSV per column of `preds`.

    Each file is a copy of `sample_sub` whose TARGET column is replaced by
    the column's predictions shifted back by +1 (undoing the 0-based label
    shift applied before training). Files are named `nb{notebook}_{col}.csv`
    and written to the current working directory.

    Parameters
    ----------
    preds : pd.DataFrame
        Predicted (0-based) class labels, one column per prediction set and
        one row per row of `sample_sub`, in the same order.
    notebook : str
        Tag used in the output file names.

    Raises
    ------
    ValueError
        If `preds` does not have exactly one row per row of `sample_sub`.
    """
    if len(preds) != len(sample_sub):
        raise ValueError(
            f'preds has {len(preds)} rows but sample_sub has {len(sample_sub)}')

    for col in preds.columns:
        sub = sample_sub.copy()
        # inverse transform for the target; assigned by position so that a
        # non-default index on `preds` cannot silently introduce NaNs
        sub[TARGET] = preds[col].to_numpy() + 1
        sub.to_csv(f'nb{notebook}_{col}.csv', index=False)

In [22]:
# Writes nb00_<column>.csv for every column of `tp`: one file per fold model
# plus one for the row-wise mode of the fold predictions.
create_submission_files(tp)