# LightGBM on Tabular Playground Series - Feb 2021

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

In [None]:
# Load the train and test CSVs of the Tabular Playground Series (Feb 2021).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-feb-2021/train.csv",
        "../input/tabular-playground-series-feb-2021/test.csv",
    )
)

In [None]:
# Column names follow a fixed pattern: cat0..cat9 and cont0..cont13.
category_features = [f"cat{i}" for i in range(10)]
continous_features = [f"cont{i}" for i in range(14)]

# Feature columns in model order: categorical columns first, then continuous.
all_features = [*category_features, *continous_features]

# Boosted trees are not very sensitive to feature engineering.

In [None]:
# Map each string-valued categorical column to integer codes stored with the
# pandas "category" dtype (LightGBM can handle categoricals, but requires them
# as integers). The encoder is fitted on train only, so a test value that never
# occurs in train makes `transform` raise instead of being silently mis-coded.
for feature in category_features:
    encoder = LabelEncoder()
    encoder.fit(train[feature])

    # Declare every code the encoder can emit, so train and test share one
    # categorical dtype even when test happens to miss some levels.
    codes = np.arange(len(encoder.classes_))

    for frame in (train, test):
        # Assigning a Categorical is positional. Wrapping the codes in a fresh
        # pd.Series would instead align on the frame's index and misplace rows
        # whenever that index is not the default 0..n-1.
        frame[feature] = pd.Categorical(
            encoder.transform(frame[feature]), categories=codes
        )

In [None]:
# Tuning guide: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

PARAMS = {
    # Task definition.
    "objective": "regression",
    "metric": "rmse",
    "boosting": "gbdt",
    # Boosting schedule.
    "num_iterations": 5000,
    "learning_rate": 0.02,
    # Regularization through tree shape and column subsampling.
    "num_leaves": 15,
    "min_data_in_leaf": 1000,  # author's rule of thumb: scale of 300000 / num_leaves
    "feature_fraction": 0.3,
    # Explicit regularization.
    "lambda_l2": 0.001,
}

# Notes from manual experiments:
# - 'gbdt' performs better than 'goss'.
# - So far num_leaves, min_data_in_leaf and feature_fraction appear to have a
#   significant effect on the result.

In [None]:
# Parameters for manual tuning via k-fold cross-validation (ideally this would
# be a proper grid search). Early stopping is layered on top of the base
# settings; 500 rounds is about 10% of num_iterations.
PARAMS_KFOLD = {**PARAMS, 'early_stopping_round': 500}

def rmse_kfold(parameters, n_fold):
    """Return the mean validation RMSE of an LGBMRegressor over k folds.

    Reads the module-level ``train`` frame and ``all_features`` list. For
    each fold a fresh model is fitted on the training split, with the
    validation split passed as ``eval_set``, and is then scored on that same
    validation split.

    Args:
        parameters: keyword arguments forwarded to ``LGBMRegressor``.
        n_fold: number of ``KFold`` splits (the splitter is not shuffled).

    Returns:
        The arithmetic mean of the per-fold RMSE values.
    """
    # Select the columns once instead of re-slicing the frame in every fold.
    features = train[all_features]
    target = train['target']

    # Named differently from the function so the local does not shadow it.
    fold_scores = np.zeros(n_fold)

    splitter = KFold(n_splits=n_fold)
    for fold, (train_index, validation_index) in enumerate(splitter.split(features)):
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_validation = features.iloc[validation_index]
        y_validation = target.iloc[validation_index]

        model = LGBMRegressor(**parameters)

        # NOTE(review): newer LightGBM releases reportedly no longer accept
        # `verbose` as a fit() argument -- confirm against the installed version.
        model.fit(X_train, y_train,
                  eval_set=[(X_validation, y_validation)],
                  verbose=-1)

        pred_validation = model.predict(X_validation)

        # Take the square root explicitly rather than passing squared=False,
        # which scikit-learn deprecated and later removed.
        fold_scores[fold] = np.sqrt(mean_squared_error(y_validation, pred_validation))

    return np.average(fold_scores)

In [None]:
# A rudimentary grid search for parameter tuning; it is too slow, however
# (even with 'goss').

num_leaves_range = [2**exponent - 1 for exponent in range(3, 7)]  # 7, 15, 31, 63
min_data_in_leaf_range = [2000, 1000, 500, 250]
feature_fraction_range = [0.3]

N_FOLD = 10

# Every (num_leaves, min_data_in_leaf, feature_fraction) combination, with
# num_leaves varying slowest and feature_fraction fastest.
search_space = [
    (leaves, min_leaf, fraction)
    for leaves in num_leaves_range
    for min_leaf in min_data_in_leaf_range
    for fraction in feature_fraction_range
]

# Collected as (mean RMSE, parameter dict) pairs, one per combination.
rmse_grid_search = []

for num_leaves, min_data_in_leaf, feature_fraction in search_space:
    # Overlay the candidate values on the k-fold base parameters.
    params = {
        **PARAMS_KFOLD,
        'num_leaves': num_leaves,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
    }

    rmse = rmse_kfold(params, N_FOLD)
    rmse_grid_search.append((rmse, params))

    print("parameters " + str(params))
    print("rmse: " + str(rmse))

In [None]:
# Pick the parameters with the best (lowest) RMSE. Each entry is a
# (rmse, params) pair; min() scans the results once instead of sorting them
# all, and on ties the earliest entry wins.
PARAMS_OPTIMIZED = min(rmse_grid_search, key=lambda result: result[0])[1]

In [None]:
# Train the final model on the full training set with the tuned parameters.
# The early-stopping setting is left out: this fit is given no validation set.
PARAMS_PRED = {
    name: value
    for name, value in PARAMS_OPTIMIZED.items()
    if name != 'early_stopping_round'
}

print('parameters for final model ' + str(PARAMS_PRED))

model = LGBMRegressor(**PARAMS_PRED)

model.fit(train[all_features], train['target'])

In [None]:
# Predict the test set and write the submission file with columns id, target.
pred = model.predict(test[all_features])

submission = test[['id']].assign(target=pred)
submission.to_csv('submission.csv', index=False)