<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/zindi/trailblazers_open2all/notebooks/02_lightgbm_goss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade optuna
!pip install --upgrade lightgbm

In [2]:
#Standard library
import os
import gc
import time
import warnings
import subprocess

#Make sure automatic garbage collection is on; silence every warning category
#(this also hides deprecation warnings raised by the libraries below)
gc.enable()
warnings.filterwarnings('ignore')

#Data handling + display settings (show all columns, 4 digits of precision)
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

#Hyperparameter search
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from optuna.integration.lightgbm import LightGBMPruningCallback

#Log one line per finished trial
optuna.logging.set_verbosity(optuna.logging.INFO)

#Model
import lightgbm as lgb
from lightgbm import LGBMRegressor

#Cross-validation splitter and metric
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error

In [3]:
#remove cell to run future versions
#Fail fast (and say which version was found) when the installed libraries differ
#from the ones this notebook was written against
assert optuna.__version__ == '3.0.2', (
    f'Change in Optuna version ({optuna.__version__}). Original notebook version: 3.0.2'
)
assert lgb.__version__ == '3.3.2', (
    f'Change in LightGBM version ({lgb.__version__}). Original notebook version: 3.3.2'
)

In [4]:
#Probe for an NVIDIA GPU: assume one is present unless running the driver CLI fails
HAVE_GPU = True
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    #nvidia-smi is missing or exited with an error -> treat as "no GPU"
    HAVE_GPU = False

print(f'GPU available: {HAVE_GPU}')

GPU available: False


In [5]:
#Single seed reused by the splitter-independent random components of this notebook
SEED = 23
np.random.seed(SEED)                      #NumPy's global random generator
os.environ['PYTHONHASHSEED'] = f'{SEED}'  #exported as a string for hash seeding

In [6]:
#Colab-only: mount Google Drive at /content/drive, where the data and submission
#folders used below live; force_remount=True re-mounts even if already mounted
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
DATA_PATH = '/content/drive/MyDrive/data_science_competitions/zindi/trailblazers_open2all/data'

#Read the three raw competition files, in this order: train, test, sample submission
train, test, sample_sub = map(
    pd.read_csv,
    (
        f'{DATA_PATH}/raw/train.csv',
        f'{DATA_PATH}/raw/test.csv',
        f'{DATA_PATH}/raw/sample_sub.csv',
    ),
)

In [8]:
NOTEBOOK = '02'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/zindi/trailblazers_open2all/submissions/nb_{NOTEBOOK}'
#exist_ok=True creates the folder (and any missing parents) only when needed,
#without a separate check-then-create step that could race with another session
os.makedirs(SUBMISSION_PATH, exist_ok=True)

# Data Preparation

In [9]:
#Set these columns aside now: preprocess() removes them from the feature frames
TARGET = train['target']                 #value to predict
GROUPS = np.array(train['Place_ID'])     #for GroupKFold cross-validation
TEST_INDEX = test['Place_ID X Date']     #for submission files

In [10]:
def preprocess(df: pd.DataFrame, is_train: bool = False) -> pd.DataFrame:
    """Turn a raw competition frame into a feature frame.

    Parses ``Date``, derives ``month``, ``day`` and ``day_of_week`` from it,
    drops the identifier columns (and the target-related columns when
    ``is_train`` is True), then downcasts int64/float64 columns to a smaller
    dtype whose range strictly contains the column's min and max.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Raw frame with at least the columns 'Place_ID X Date', 'Place_ID'
            and 'Date'.
        is_train: When True, also drop 'target', 'target_min', 'target_max',
            'target_variance' and 'target_count'.

    Returns:
        A new DataFrame without the dropped columns and with the three
        date-derived columns appended.

    Raises:
        KeyError: If any column that has to be read or dropped is missing.
    """
    #Parse the dates once; assign() returns a new frame, so the caller's
    #DataFrame is left exactly as it was passed in
    dates = pd.to_datetime(df['Date'])
    df = df.assign(
        month=dates.dt.month,
        day=dates.dt.day,
        day_of_week=dates.dt.day_of_week,
    )

    #dropping non-feature columns
    non_features = ['Place_ID X Date', 'Place_ID', 'Date']
    if is_train:
        non_features += [
            'target', 'target_min', 'target_max', 'target_variance', 'target_count'
        ]
    df = df.drop(labels=non_features, axis=1)

    #reduce memory usage
    def reduce_mem(df: pd.DataFrame) -> pd.DataFrame:
        """Downcast int64/float64 columns in place and return the same frame."""
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type not in ['int64', 'float64']:
                continue

            if str(col_type).startswith('int'):
                candidates, limits = (np.int16, np.int32), np.iinfo
            else:
                #NOTE(review): only the value range is checked, so float16 is
                #chosen even when it cannot represent the values exactly
                candidates, limits = (np.float16, np.float32), np.finfo

            c_min = df[col].min()
            c_max = df[col].max()
            #Smallest candidate first; when none fits (or min/max is NaN) the
            #column keeps its original 64-bit dtype
            for smaller in candidates:
                if c_min > limits(smaller).min and c_max < limits(smaller).max:
                    df[col] = df[col].astype(smaller)
                    break
        return df

    return reduce_mem(df)

In [11]:
#Replace the raw frames with their feature versions. preprocess() drops the
#columns it reads, so re-running this cell requires reloading the raw data first.
test = preprocess(test)
train = preprocess(train, is_train=True)

gc.collect()

18

# Baseline

In [12]:
%%time
scores_rmse = []
cv = GroupKFold(n_splits=5)
X, y = train, TARGET
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, GROUPS)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = LGBMRegressor(
        objective='regression',
        boosting_type='goss',
        device_type='cpu',
        random_state=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        eval_metric='rmse',
        verbose=0
    )
    val_preds = model.predict(X_val)
    score = mean_squared_error(y_val, val_preds, squared=False)
    scores_rmse.append(score)
    print(f'Fold #{fold}: ({model.best_iteration_} rounds) RMSE = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg RMSE = {np.mean(scores_rmse):.5f} +/- {np.std(scores_rmse):.5f}\n')

Fold #0: (100 rounds) RMSE = 34.89684
Fold #1: (97 rounds) RMSE = 32.71991
Fold #2: (96 rounds) RMSE = 29.08208
Fold #3: (96 rounds) RMSE = 38.15012
Fold #4: (89 rounds) RMSE = 30.54660

Avg RMSE = 33.07911 +/- 3.21055

CPU times: user 15.5 s, sys: 107 ms, total: 15.6 s
Wall time: 8.53 s


# Hyperparameter tuning

In [13]:
def objective(trial, data, base_params):
    """Optuna objective: mean validation RMSE over 5 GroupKFold folds.

    Folds are built from the module-level ``GROUPS`` array.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.
        data: ``(X, y)`` tuple; rows are selected by position for each fold.
        base_params: Fixed LGBMRegressor keyword arguments. They are passed
            together with the sampled ones, so the two must not share keys.

    Returns:
        Mean of the per-fold RMSE values.

    NOTE(review): no intermediate value is reported to ``trial``, so a pruner
    attached to the study has nothing to act on and cannot stop a trial early.
    """
    X, y = data

    #Sampled search space. max_depth is left untuned; subsample/subsample_freq
    #are not used with GOSS, which samples through top_rate/other_rate instead.
    trial_params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.1),
        'num_leaves': trial.suggest_int('num_leaves', 100, 3000, step=5),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 1000, step=5),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 15, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95, step=0.05),
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05)
    }

    fold_scores = []
    cv = GroupKFold(n_splits=5)
    for train_idx, val_idx in cv.split(X, y, GROUPS):
        #cv.split yields positional indices, so select rows with iloc for X and y alike
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = LGBMRegressor(**base_params, **trial_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            #Early stopping via callback: the early_stopping_rounds/verbose keyword
            #arguments of fit() were removed in LightGBM 4.0
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )

        preds = model.predict(X_val)
        #RMSE as sqrt(MSE); the squared=False argument is gone from recent scikit-learn
        fold_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
    return np.mean(fold_scores)

In [14]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    """Run an Optuna study over ``objective`` and return the finished study.

    Args:
        data: ``(X, y)`` tuple forwarded unchanged to ``objective``.
        base_params: Fixed model parameters forwarded unchanged to ``objective``.
        n_trials: Number of trials to run.
        direction: Optimisation direction handed to Optuna. The default is
            'maximize', but ``objective`` returns an RMSE, so callers tuning
            with it should pass 'minimize'.

    Returns:
        The ``optuna.Study`` after optimisation.
    """
    def run_trial(trial):
        #Bind the dataset and fixed parameters; Optuna only supplies the trial
        return objective(trial, data, base_params)

    study = optuna.create_study(
        direction=direction,
        sampler=TPESampler(seed=SEED),  #seeded so the sampling sequence repeats
        pruner=HyperbandPruner(),
    )
    #gc_after_trial=True asks Optuna to run garbage collection after each trial
    study.optimize(run_trial, n_trials=n_trials, gc_after_trial=True)

    return study

# Cross-validation

In [15]:
def evaluate_model(data, model_params, verbose=True):
    """Cross-validate one parameter set and collect OOF and test predictions.

    Uses 5-fold GroupKFold with the module-level ``GROUPS`` array. One model is
    fitted per fold with early stopping on that fold's validation part.

    Args:
        data: ``(X, X_test, y)`` tuple; X and y rows are selected by position.
        model_params: Keyword arguments for ``LGBMRegressor``.
        verbose: When True, print one score line per fold. The average and
            OOF summary lines are printed regardless.

    Returns:
        ``(oof_preds, test_preds)``: a Series of out-of-fold predictions ordered
        by row position, and a DataFrame with one column ``fold{k}`` of test
        predictions per fold.
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores_rmse = [] #RMSE scores on validation set

    X, X_test, y = data

    cv = GroupKFold(n_splits=5)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, GROUPS)):
        #cv.split yields positional indices, so select rows with iloc for X and y alike
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = LGBMRegressor(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            #Early stopping via callback: the early_stopping_rounds/verbose keyword
            #arguments of fit() were removed in LightGBM 4.0
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )
        val_preds = model.predict(X_val)
        #Key by row position so the folds can be stitched back into one series
        oof_preds.update(dict(zip(val_idx, val_preds)))
        test_preds[f'fold{fold}'] = model.predict(X_test)

        #RMSE as sqrt(MSE); the squared=False argument is gone from recent scikit-learn
        score = np.sqrt(mean_squared_error(y_val, val_preds))
        scores_rmse.append(score)
        if verbose:
            print(f'Fold #{fold}: ({model.best_iteration_} rounds) RMSE = {score:.5f}')

        _ = gc.collect()

    print(f'\nAvg RMSE = {np.mean(scores_rmse):.5f} +/- {np.std(scores_rmse):.5f}')

    #Sorting by position lines the predictions up with y for the overall score
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'OOF RMSE = {np.sqrt(mean_squared_error(y, oof_preds)):.5f}')

    test_preds = pd.DataFrame.from_dict(test_preds)
    return oof_preds, test_preds

In [16]:
def run_experiment(data, n_trials=5):
    """Tune hyperparameters with Optuna, then cross-validate the best set.

    Args:
        data: ``(X, X_test, y)`` tuple of train features, test features and target.
        n_trials: Number of Optuna trials for the tuning stage.

    Returns:
        The ``(oof_preds, test_preds)`` pair produced by ``evaluate_model`` for
        the best hyperparameters merged over the fixed base parameters.
    """
    X, X_test, y = data

    #Settings held constant during tuning and reused for the final models
    base_params = {
        'objective': 'regression',
        'n_estimators': 10000,
        'boosting_type': 'goss',
        'extra_trees': True,
        'verbosity': -1,
        'random_state': SEED
    }

    #Stage 1: search (RMSE is an error, so the study minimizes)
    print(f'---------------Hyperparameter tuning---------------')
    study = tune_params(
        data=(X, y), base_params=base_params, n_trials=n_trials, direction='minimize'
    )
    print(f'Best trial: {study.best_trial.number} -> Best value(RMSE): {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')

    #Stage 2: refit per fold with the winning values layered over the base settings
    model_params = dict(base_params)
    model_params.update(study.best_params)
    print(f'-----------------Cross-validation------------------')
    return evaluate_model(data=(X, X_test, y), model_params=model_params)

In [17]:
%%time
#Full run: 50 Optuna trials followed by the final cross-validation.
#The recorded output below shows roughly 3h 25min of wall time for this cell.
oof_preds, test_preds = run_experiment(data=(train, test, TARGET), n_trials=50)

[32m[I 2022-09-25 04:36:24,551][0m A new study created in memory with name: no-name-b610ea52-344f-40b7-8709-5fc4cdfb9f4e[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-09-25 04:37:24,718][0m Trial 0 finished with value: 32.026635480181696 and parameters: {'learning_rate': 0.060000000000000005, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 920, 'min_child_samples': 220, 'min_split_gain': 10.3, 'colsample_bytree': 0.55, 'top_rate': 0.25, 'other_rate': 0.35000000000000003}. Best is trial 0 with value: 32.026635480181696.[0m
[32m[I 2022-09-25 04:38:33,366][0m Trial 1 finished with value: 32.05208494339203 and parameters: {'learning_rate': 0.05, 'reg_alpha': 0.4, 'reg_lambda': 176.8, 'num_leaves': 2670, 'min_child_samples': 300, 'min_split_gain': 8.84, 'colsample_bytree': 0.95, 'top_rate': 0.45000000000000007, 'other_rate': 0.05}. Best is trial 0 with value: 32.026635480181696.[0m
[32m[I 2022-09-25 04:40:34,785][0m Trial 2 finished with value: 31.651630706390996 and parameters: {'learning_rate': 0.03, 'reg_alpha': 57.6, 'reg_lambda': 164.5, 'num_leaves': 1915, 'min_child_samples': 110, 'min_split_gain': 0.0, 'colsample_bytree'

Best trial: 14 -> Best value(RMSE): 31.04986
Best hyperparameters:
learning_rate        - 0.01
reg_alpha            - 41.2
reg_lambda           - 128.9
num_leaves           - 2305
min_child_samples    - 30
min_split_gain       - 6.16
colsample_bytree     - 0.8500000000000001
top_rate             - 0.35
other_rate           - 0.45
-----------------Cross-validation------------------
Fold #0: (9657 rounds) RMSE = 32.81238
Fold #1: (10000 rounds) RMSE = 30.25674
Fold #2: (6770 rounds) RMSE = 27.32033
Fold #3: (9998 rounds) RMSE = 36.13793
Fold #4: (7245 rounds) RMSE = 28.72193

Avg RMSE = 31.04986 +/- 3.12898
OOF RMSE = 31.20457
CPU times: user 6h 33min 17s, sys: 8min 48s, total: 6h 42min 5s
Wall time: 3h 25min 8s


# Generating submission files

In [18]:
#One submission file per fold model, named after the fold column
for col, fold_preds in test_preds.items():
    sub = pd.DataFrame({'Place_ID X Date': TEST_INDEX, 'target': fold_preds})
    sub.to_csv(f'{SUBMISSION_PATH}/{col}.csv', index=False)

#Plus one file holding the row-wise mean of all fold predictions
mean_preds = test_preds.mean(axis=1)
sub = pd.DataFrame({'Place_ID X Date': TEST_INDEX, 'target': mean_preds})
sub.to_csv(f'{SUBMISSION_PATH}/mean.csv', index=False)