<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/zindi/landslide_prevention/notebooks/04_lgbm_thresholding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade optuna
!pip install --upgrade lightgbm

In [2]:
import os
import gc
import time
import warnings
import subprocess

gc.enable()
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

optuna.logging.set_verbosity(optuna.logging.INFO)

import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [3]:
#Version guard: fails fast when the installed libraries differ from the ones
#the recorded outputs were produced with.
#remove cell to run future versions
assert optuna.__version__ == '3.0.2', \
    'Change in Optuna version. Original notebook version: 3.0.2'
assert lgb.__version__ == '3.3.2', \
    'Change in LightGBM version. Original notebook version: 3.3.2'

In [4]:
#Check GPU availability
#Any failure to run `nvidia-smi` (command missing, non-zero exit status, ...)
#is interpreted as "no GPU".
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    HAVE_GPU = False
else:
    HAVE_GPU = True

print(f'GPU available: {HAVE_GPU}')

GPU available: False


In [5]:
#Single seed reused for NumPy, the CV splitter, the Optuna sampler and LightGBM
SEED = 23
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [6]:
#Colab-only: mount Google Drive at /content/drive, the root that DATA_PATH and
#SUBMISSION_PATH below point into. force_remount=True is passed so the call
#can be repeated when the cell is re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Data Preparation

In [7]:
DATA_PATH = '/content/drive/MyDrive/data_science_competitions/zindi/landslide_prevention/data'

#files under raw/
train, test, sample_sub = (
    pd.read_csv(f'{DATA_PATH}/raw/{name}.csv')
    for name in ('train', 'test', 'sample_sub')
)

#files under processed/ (these carry the agg_/grad_ columns used in Experiment 2)
train_agg, test_agg = (
    pd.read_csv(f'{DATA_PATH}/processed/{name}.csv')
    for name in ('train_agg', 'test_agg')
)

In [8]:
TARGET = train['Label']

In [9]:
#Columns passed to LightGBM as categorical: every column of `test` whose name
#ends in 'geology', followed by two aggregated geology columns.
cat_features = [
    *(col for col in test.columns if col.endswith('geology')),
    'agg_geology_mode',
    'agg_geology_nunique',
]

In [10]:
def preprocess(df: pd.DataFrame, is_train: bool = False) -> pd.DataFrame:
    """Drop identifier/target columns and downcast numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'Sample_ID' column, and a 'Label' column when
        ``is_train`` is True.
    is_train : bool, default False
        When True the 'Label' column is dropped as well.

    Returns
    -------
    pd.DataFrame
        A new frame (the caller's frame is not modified) in which every
        int16/int32/int64/float32/float64 column is cast to the smallest
        dtype whose range strictly contains the column's min and max.
        Columns of any other dtype are left untouched.
    """
    #dtypes eligible for downcasting
    shrinkable = ['int16', 'int32', 'int64', 'float32', 'float64']
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)

    def first_fit(lo, hi, ladder, limits):
        #smallest dtype in `ladder` with limits.min < lo and hi < limits.max
        #(bounds are exclusive); None when no candidate qualifies
        for candidate in ladder:
            bounds = limits(candidate)
            if lo > bounds.min and hi < bounds.max:
                return candidate
        return None

    df = df.drop('Sample_ID', axis=1)
    if is_train:
        df = df.drop('Label', axis=1)

    #reduce memory usage
    for name in df.columns:
        current = df[name].dtypes
        if current not in shrinkable:
            continue

        lo, hi = df[name].min(), df[name].max()
        if str(current).startswith('int'):
            target = first_fit(lo, hi, int_ladder, np.iinfo)
            if target is None:
                #a value sits exactly on an int64 bound: leave the column as is
                continue
        else:
            target = first_fit(lo, hi, float_ladder, np.finfo)
            if target is None:
                #also reached when min/max are NaN or infinite
                target = np.float64
        df[name] = df[name].astype(target)

    return df

In [11]:
train = preprocess(train, is_train=True)
test = preprocess(test)

In [12]:
train_agg = preprocess(train_agg, is_train=True)
test_agg = preprocess(test_agg)

In [13]:
gc.collect()

22

# Baseline

In [14]:
def get_best_threshold(y_true, pred_probs):
    """Return the probability cut-off that maximises F1 against ``y_true``.

    Candidate cut-offs are 0, 0.005, 0.010, ... (upper bound 1 excluded).
    A sample is predicted positive when its probability is >= the cut-off.
    When several cut-offs reach the same best F1, the lowest one is returned.
    """
    grid = np.arange(0, 1, 0.005)
    f1_by_cutoff = np.empty(len(grid))
    for pos, cutoff in enumerate(grid):
        hard_labels = (pred_probs >= cutoff).astype('int')
        f1_by_cutoff[pos] = f1_score(y_true, hard_labels)
    return grid[f1_by_cutoff.argmax()]

In [15]:
%%time
scores_f1 = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train, TARGET
feature_name = list(X.columns)
categorical_feature = [f for f in feature_name if f in cat_features]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = LGBMClassifier(
        objective='binary',
        boosting_type='goss',
        device_type='cpu',
        random_state=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric='binary_logloss',
        feature_name=feature_name,
        categorical_feature=categorical_feature,
        verbose=0
    )
    val_probs = model.predict_proba(X_val)[:, 1]
    best_threshold = get_best_threshold(y_val, val_probs)
    val_preds = (val_probs >= best_threshold).astype('int')

    score = f1_score(y_val, val_preds)
    scores_f1.append(score)
    print(f'Fold #{fold}: ({model.best_iteration_} rounds) F1-score = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg F1-score = {np.mean(scores_f1):.5f} +/- {np.std(scores_f1):.5f}\n')

Fold #0: (91 rounds) F1-score = 0.68200
Fold #1: (98 rounds) F1-score = 0.71553
Fold #2: (98 rounds) F1-score = 0.70345
Fold #3: (79 rounds) F1-score = 0.69725
Fold #4: (64 rounds) F1-score = 0.71372

Avg F1-score = 0.70239 +/- 0.01220

CPU times: user 25.7 s, sys: 120 ms, total: 25.8 s
Wall time: 20 s


# Hyperparameter tuning

In [16]:
def objective(trial, data, base_params):
    """Optuna objective: mean F1 over a 5-fold stratified CV.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters below.
    data : tuple ``(X, y)`` of feature frame and target series.
    base_params : dict of fixed LGBMClassifier parameters, merged with the
        sampled ones (the two dicts must not share keys).

    Returns
    -------
    float
        Mean of the per-fold F1 scores, each computed at the threshold that
        maximises F1 on that same validation fold.

    Notes
    -----
    No intermediate value is reported to the trial, so a pruner attached to
    the study never stops a trial early.
    """
    scores = []
    X, y = data
    feature_name = list(X.columns)
    categorical_feature = [f for f in feature_name if f in cat_features]

    #the order of the suggest_* calls is kept fixed: a seeded sampler draws
    #values in call order
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 1000, step=5),
        # 'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 500, step=2),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 15, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95, step=0.05),
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5, step=0.05)
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        #cv.split yields positional indices -> use .iloc on X as well as y
        #(.loc silently selects wrong rows, or fails, on a non-default index)
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        model = LGBMClassifier(**base_params, **param_grid)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            #callbacks replace the deprecated `early_stopping_rounds` /
            #`verbose` fit arguments: stop after 100 stale rounds, no logging
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0)
            ]
        )
        val_probs = model.predict_proba(X_val)[:, 1]
        best_threshold = get_best_threshold(y_val, val_probs)
        val_preds = (val_probs >= best_threshold).astype('int')
        scores.append(f1_score(y_val, val_preds))
    
    return np.mean(scores)

In [17]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    """Run an Optuna study over `objective` and return the finished study.

    Parameters
    ----------
    data : forwarded unchanged to `objective` as its `data` argument.
    base_params : fixed model parameters, forwarded to `objective`.
    n_trials : number of trials to run.
    direction : optimisation direction handed to `optuna.create_study`.

    Notes
    -----
    The sampler is seeded with SEED. A HyperbandPruner is attached, but
    `objective` never reports intermediate values, so no trial is pruned.
    """
    def run_trial(trial):
        return objective(trial, data, base_params)

    sampler = TPESampler(seed=SEED)
    pruner = HyperbandPruner()
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction=direction)

    #garbage-collect after every trial
    study.optimize(run_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [18]:
def evaluate_model(data, model_params, verbose=True):
    """Cross-validate one LightGBM configuration and predict the test set.

    Parameters
    ----------
    data : tuple ``(X, X_test, y)`` of train features, test features and
        train target.
    model_params : dict of keyword arguments for LGBMClassifier.
    verbose : when True, print the F1 score of every fold. The average F1 is
        printed regardless.

    Returns
    -------
    oof_probs : pd.Series
        Out-of-fold predicted probability of the positive class for every
        training row, keyed by row position and sorted by it.
    test_preds : pd.DataFrame
        One column of 0/1 test predictions per fold ('fold0' ... 'fold4'),
        each thresholded with that fold's best validation threshold, plus a
        'mode' column holding the row-wise majority vote.

    Notes
    -----
    The threshold of each fold is chosen on the same validation data that is
    then scored, so the printed F1 values are optimistic.
    """
    oof_probs = {}  #out-of-fold predicted probabilities on train set
    test_preds = {} #predictions on test set for each fold
    scores_f1 = [] #F1 scores on validation set

    X, X_test, y = data
    feature_name = list(X.columns)
    categorical_feature = [f for f in feature_name if f in cat_features]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        #cv.split yields positional indices -> use .iloc on X as well as y
        #(.loc silently selects wrong rows, or fails, on a non-default index)
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        model = LGBMClassifier(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            #callbacks replace the deprecated `early_stopping_rounds` /
            #`verbose` fit arguments: stop after 100 stale rounds, no logging
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0)
            ]
        )
        val_probs = model.predict_proba(X_val)[:, 1]
        best_threshold = get_best_threshold(y_val, val_probs)
        val_preds = (val_probs >= best_threshold).astype('int')
        #store the probabilities, not the thresholded labels: the container,
        #its comment and the saved file are all named "oof_probs"
        oof_probs.update(zip(val_idx, val_probs))
        
        test_probs = model.predict_proba(X_test)[:, 1]
        test_preds[f'fold{fold}'] = (test_probs >= best_threshold).astype('int')

        f1 = f1_score(y_val, val_preds)
        scores_f1.append(f1)
        if verbose:
            print(f'Fold #{fold}: ({model.best_iteration_} rounds) F1 = {f1:.5f}')
        
        _ = gc.collect()

    print(f'\nAvg F1 = {np.mean(scores_f1):.5f} +/- {np.std(scores_f1):.5f}')
    
    oof_probs = pd.Series(oof_probs).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    #majority vote across folds (first column of the row-wise mode)
    test_preds['mode'] = test_preds.mode(axis=1)[0]

    return oof_probs, test_preds

In [19]:
def run_experiment(data, n_trials=5):
    """Tune hyperparameters on ``data``, then cross-validate the best set.

    Parameters
    ----------
    data : tuple ``(X, X_test, y)`` of train features, test features and
        train target.
    n_trials : number of Optuna trials used for tuning.

    Returns
    -------
    The ``(oof_probs, test_preds)`` pair produced by `evaluate_model` for the
    fixed parameters below merged with the best sampled hyperparameters.
    """
    X, X_test, y = data

    #settings shared by every trial and by the final cross-validation
    base_params = dict(
        objective='binary',
        n_estimators=10000,
        boosting_type='goss',
        extra_trees=True,
        verbosity=-1,
        device_type='cpu',
        random_state=SEED,
    )

    print('---------------Hyperparameter tuning---------------')
    study = tune_params(
        data=(X, y),
        base_params=base_params,
        n_trials=n_trials,
        direction='maximize',
    )
    print(f'Best trial: {study.best_trial.number} -> Best value(F1): {study.best_value:.5f}')
    print('Best hyperparameters:')
    for param_name, param_value in study.best_params.items():
        print(f'{param_name:20} - {param_value}')

    #tuned values are merged after the fixed ones
    model_params = dict(base_params)
    model_params.update(study.best_params)

    print('-----------------Cross-validation------------------')
    return evaluate_model(data=(X, X_test, y), model_params=model_params)

In [20]:
oof_probs = pd.DataFrame()

### Generating submission files

In [21]:
NOTEBOOK = '04'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/zindi/landslide_prevention/submissions/nb_{NOTEBOOK}'
#create the output folder (and any missing parents); no-op when it already exists
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [22]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: int):
    """Write one submission CSV per column of ``test_preds``.

    Each file is a copy of `sample_sub` whose 'Label' column is replaced by
    the corresponding prediction column (aligned on the index), saved as
    ``{SUBMISSION_PATH}/{expt_num}_{column}.csv`` without the index.
    """
    for col in test_preds.columns:
        out_file = f'{SUBMISSION_PATH}/{expt_num}_{col}.csv'
        submission = sample_sub.assign(Label=test_preds[col])
        submission.to_csv(out_file, index=False)

### Test experiment

In [23]:
%%time
o, t = run_experiment(
    data=(train, test, TARGET),
    n_trials=5
)

[32m[I 2022-09-30 22:49:15,235][0m A new study created in memory with name: no-name-e3e60ca7-3256-4177-b43e-22f7a0039db1[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-09-30 22:49:20,229][0m Trial 0 finished with value: 0.623006934531691 and parameters: {'learning_rate': 0.16, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 295, 'min_child_samples': 110, 'min_split_gain': 10.3, 'colsample_bytree': 0.55, 'top_rate': 0.25, 'other_rate': 0.35000000000000003, 'scale_pos_weight': 2.6500000000000004}. Best is trial 0 with value: 0.623006934531691.[0m
[32m[I 2022-09-30 22:49:43,807][0m Trial 1 finished with value: 0.6245623629584189 and parameters: {'learning_rate': 0.01, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 315, 'min_child_samples': 294, 'min_split_gain': 14.68, 'colsample_bytree': 0.9, 'top_rate': 0.1, 'other_rate': 0.15000000000000002, 'scale_pos_weight': 2.1500000000000004}. Best is trial 1 with value: 0.6245623629584189.[0m
[32m[I 2022-09-30 22:49:58,221][0m Trial 2 finished with value: 0.6392284571919771 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'min_c

Best trial: 3 -> Best value(F1): 0.66011
Best hyperparameters:
learning_rate        - 0.25
reg_alpha            - 143.6
reg_lambda           - 23.8
num_leaves           - 605
min_child_samples    - 64
min_split_gain       - 1.16
colsample_bytree     - 0.9
top_rate             - 0.30000000000000004
other_rate           - 0.1
scale_pos_weight     - 3.2
-----------------Cross-validation------------------
Fold #0: (338 rounds) F1 = 0.64972
Fold #1: (629 rounds) F1 = 0.65943
Fold #2: (444 rounds) F1 = 0.65869
Fold #3: (265 rounds) F1 = 0.65071
Fold #4: (365 rounds) F1 = 0.68202

Avg F1 = 0.66011 +/- 0.01165
CPU times: user 1min 40s, sys: 1.48 s, total: 1min 41s
Wall time: 1min 3s


### Experiment 1: Original features

In [34]:
%%time
oof_probs['4_1'], test_preds_1 = run_experiment(
    data=(train, test, TARGET),
    n_trials=250
)

[32m[I 2022-09-30 15:23:50,839][0m A new study created in memory with name: no-name-01cdf0d6-bc0c-469d-8969-38e86757feb2[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-09-30 15:23:56,677][0m Trial 0 finished with value: 0.623006934531691 and parameters: {'learning_rate': 0.16, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 295, 'min_child_samples': 110, 'min_split_gain': 10.3, 'colsample_bytree': 0.55, 'top_rate': 0.25, 'other_rate': 0.35000000000000003, 'scale_pos_weight': 2.6500000000000004}. Best is trial 0 with value: 0.623006934531691.[0m
[32m[I 2022-09-30 15:24:18,340][0m Trial 1 finished with value: 0.6245623629584189 and parameters: {'learning_rate': 0.01, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 315, 'min_child_samples': 294, 'min_split_gain': 14.68, 'colsample_bytree': 0.9, 'top_rate': 0.1, 'other_rate': 0.15000000000000002, 'scale_pos_weight': 2.1500000000000004}. Best is trial 1 with value: 0.6245623629584189.[0m
[32m[I 2022-09-30 15:24:25,552][0m Trial 2 finished with value: 0.6392284571919771 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'min_c

Best trial: 170 -> Best value(F1): 0.71173
Best hyperparameters:
learning_rate        - 0.14
reg_alpha            - 6.0
reg_lambda           - 115.10000000000001
num_leaves           - 715
min_child_samples    - 118
min_split_gain       - 0.45
colsample_bytree     - 0.6
top_rate             - 0.1
other_rate           - 0.4
scale_pos_weight     - 2.6500000000000004
-----------------Cross-validation------------------
Fold #0: (1349 rounds) F1 = 0.68634
Fold #1: (1181 rounds) F1 = 0.71875
Fold #2: (1637 rounds) F1 = 0.70690
Fold #3: (1368 rounds) F1 = 0.71240
Fold #4: (1399 rounds) F1 = 0.73426

Avg F1 = 0.71173 +/- 0.01566
CPU times: user 2h 41min 39s, sys: 1min 47s, total: 2h 43min 27s
Wall time: 1h 26min 8s


In [35]:
create_submission_files(test_preds_1, 1)

### Experiment 2: Created features

In [24]:
#columns of test_agg whose name starts with 'agg_' or 'grad_'
created_features = [
    col for col in test_agg.columns
    if col.startswith(('agg_', 'grad_'))
]

In [25]:
%%time
oof_probs['4_2'], test_preds_2 = run_experiment(
    data=(train_agg[created_features], test_agg[created_features], TARGET),
    n_trials=100
)

[32m[I 2022-09-30 22:50:32,087][0m A new study created in memory with name: no-name-8446eca5-34a8-48fd-b2e5-893fef8d1323[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-09-30 22:50:41,243][0m Trial 0 finished with value: 0.6370458471543539 and parameters: {'learning_rate': 0.16, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 295, 'min_child_samples': 110, 'min_split_gain': 10.3, 'colsample_bytree': 0.55, 'top_rate': 0.25, 'other_rate': 0.35000000000000003, 'scale_pos_weight': 2.6500000000000004}. Best is trial 0 with value: 0.6370458471543539.[0m
[32m[I 2022-09-30 22:51:21,325][0m Trial 1 finished with value: 0.6488787819005271 and parameters: {'learning_rate': 0.01, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 315, 'min_child_samples': 294, 'min_split_gain': 14.68, 'colsample_bytree': 0.9, 'top_rate': 0.1, 'other_rate': 0.15000000000000002, 'scale_pos_weight': 2.1500000000000004}. Best is trial 1 with value: 0.6488787819005271.[0m
[32m[I 2022-09-30 22:51:31,535][0m Trial 2 finished with value: 0.6615567841134624 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'min

Best trial: 94 -> Best value(F1): 0.71967
Best hyperparameters:
learning_rate        - 0.060000000000000005
reg_alpha            - 4.6000000000000005
reg_lambda           - 5.6000000000000005
num_leaves           - 25
min_child_samples    - 48
min_split_gain       - 0.55
colsample_bytree     - 0.5
top_rate             - 0.15000000000000002
other_rate           - 0.45
scale_pos_weight     - 4.300000000000001
-----------------Cross-validation------------------
Fold #0: (1896 rounds) F1 = 0.69111
Fold #1: (1885 rounds) F1 = 0.72297
Fold #2: (1769 rounds) F1 = 0.70467
Fold #3: (2128 rounds) F1 = 0.74410
Fold #4: (2421 rounds) F1 = 0.73547

Avg F1 = 0.71967 +/- 0.01948
CPU times: user 1h 27min 29s, sys: 55.4 s, total: 1h 28min 25s
Wall time: 47min


In [26]:
create_submission_files(test_preds_2, 2)

### Experiment 3: All features

In [34]:
%%time
oof_probs['4_3'], test_preds_3 = run_experiment(
    data=(train_agg, test_agg, TARGET),
    n_trials=100
)

[32m[I 2022-09-30 23:59:49,454][0m A new study created in memory with name: no-name-06e5f67d-7c3e-4135-af3f-eb600c4d699c[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-10-01 00:00:02,212][0m Trial 0 finished with value: 0.6348873850997755 and parameters: {'learning_rate': 0.16, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 295, 'min_child_samples': 110, 'min_split_gain': 10.3, 'colsample_bytree': 0.55, 'top_rate': 0.25, 'other_rate': 0.35000000000000003, 'scale_pos_weight': 2.6500000000000004}. Best is trial 0 with value: 0.6348873850997755.[0m
[32m[I 2022-10-01 00:00:58,297][0m Trial 1 finished with value: 0.6437631944129582 and parameters: {'learning_rate': 0.01, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 315, 'min_child_samples': 294, 'min_split_gain': 14.68, 'colsample_bytree': 0.9, 'top_rate': 0.1, 'other_rate': 0.15000000000000002, 'scale_pos_weight': 2.1500000000000004}. Best is trial 1 with value: 0.6437631944129582.[0m
[32m[I 2022-10-01 00:01:13,095][0m Trial 2 finished with value: 0.6617946013981475 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'min

Best trial: 64 -> Best value(F1): 0.72613
Best hyperparameters:
learning_rate        - 0.05
reg_alpha            - 14.0
reg_lambda           - 95.4
num_leaves           - 120
min_child_samples    - 150
min_split_gain       - 0.05
colsample_bytree     - 0.75
top_rate             - 0.1
other_rate           - 0.5
scale_pos_weight     - 4.15
-----------------Cross-validation------------------
Fold #0: (3919 rounds) F1 = 0.70072
Fold #1: (2988 rounds) F1 = 0.73140
Fold #2: (2743 rounds) F1 = 0.70352
Fold #3: (4532 rounds) F1 = 0.75342
Fold #4: (3910 rounds) F1 = 0.74161

Avg F1 = 0.72613 +/- 0.02083
CPU times: user 2h 10min 40s, sys: 52.6 s, total: 2h 11min 33s
Wall time: 1h 9min 31s


In [35]:
create_submission_files(test_preds_3, 3)

### Storing OOF predictions

In [36]:
oof_probs.to_csv(f'{SUBMISSION_PATH}/oof_probs.csv', index=False)