# Tabular Playground - LightGBM

[LightGBM](https://lightgbm.readthedocs.io/en/latest/) is a popular alternative to [XGBoost](https://xgboost.readthedocs.io/en/latest/), so let's see how it performs.

# Dataset

In [None]:
import numpy  as np 
import pandas as pd 
import re
import sklearn
import lightgbm


# Cap rendered DataFrames at 6 rows so displayed frames stay compact.
pd.set_option('display.max_rows', 6)

In [None]:
# Read the competition CSVs, using the `id` column as the row index.
train_df = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv', index_col='id')
test_df  = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv',  index_col='id')

# Features are exactly the columns present in the test file; `target` is read from train only.
columns = test_df.columns
X, Y    = train_df[columns], train_df['target']
X_test  = test_df[columns]

# Hold out 10% of the training rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_valid, Y_train, Y_valid = sklearn.model_selection.train_test_split(
    X, Y,
    test_size=0.1,
    random_state=42,
)

# Show a labelled preview of the training frame (the string is displayed as a caption).
display('train_df', train_df)
# display('test_df', test_df)

# LightGBM

In [None]:
# Hyperparameter sweep harness.
# To sweep a setting, replace the active `for` line with one of the commented alternatives
# below and uncomment the matching key in `parameters`. The trailing notes on each line
# record the outcome of earlier sweeps; those best values are baked into the base
# parameters passed to lightgbm.train().
for seed in [42]:
# for boosting in ['gbdt', 'goss', 'dart']:                       # gbdt is best
# for max_depth in [1,2,4,6,8,10,12,16,32,64,-1]:                 # 4+ = 16 is best 
# for tree_learner in ['serial', 'feature', 'data', 'voting']:    # no effect
# for extra_trees in [True, False]:                               # no effect
# for learning_rate in [0.001, 0.01, 0.1, 0.5, 0.9]:              # 0.1   is best
# for max_bin in [64,128,256,512,1024,2048]:                      # 512-1 is best
# for num_leaves in [32, 64, 128, 256, 512, 1024, 2048, 4096]:    # 64-1  is best

    # DOCS: https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst
    # DOCS: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
    # Overrides for the setting currently being swept; unpacked last into the base
    # parameters below so that they take precedence. Also printed next to the score.
    parameters = {
        # 'boosting_type':   boosting,
        # 'max_depth':       max_depth, 
        # 'tree_learner':    tree_learner,
        # 'extra_trees':     extra_trees,
        # 'learning_rate':   learning_rate,
        # 'max_bin':         max_bin-1,
        # 'num_leaves':      num_leaves-1,
    }
    
    # DOCS: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html
    model = lightgbm.train(
        {
            'boosting_type':  'gbdt',  # default
            'objective':      'regression',
            'metric':         'rmse',
            'learning_rate':   0.1,                     
            'max_depth':       16,
            'max_bin':         512-1,
            'num_leaves':      64-1,
            'seed':            seed,   # use the loop variable so a seed sweep actually varies the seed
            'verbose':         -1,
            **parameters,
        },
        train_set  = lightgbm.Dataset(X_train, label=Y_train),
        valid_sets = [lightgbm.Dataset(X_valid, label=Y_valid)],   # documented as a list of Datasets
        num_boost_round = 5000,
        # The `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed from
        # lightgbm.train() in LightGBM 4.0; early stopping is configured through a callback.
        # verbose=False silences the callback's own messages, and no logging callback is
        # registered, so training stays quiet as before.
        callbacks = [lightgbm.early_stopping(stopping_rounds=100, verbose=False)],
    )

    # Validation RMSE. Taking the square root explicitly avoids the `squared=False`
    # argument, which newer scikit-learn releases deprecate and then remove.
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(Y_valid, model.predict(X_valid)))
    print(f'{rmse:.5f}', parameters)

# Submission

In [None]:
predictions   = model.predict(X_test)

submission_df = pd.read_csv('../input/tabular-playground-series-jan-2021/sample_submission.csv', index_col='id')
submission_df['target'] = predictions
submission_df.to_csv('submission.csv')
!head submission.csv

# Further Reading

This notebook is part of a series exploring the [Tabular Playground](https://www.kaggle.com/c/tabular-playground-series-jan-2021)
- 0.72935 - [scikit-learn Ensemble](https://www.kaggle.com/jamesmcguigan/tabular-playground-scikit-learn-ensemble)
- 0.71423 - [Fast.ai Tabular Solver](https://www.kaggle.com/jamesmcguigan/fast-ai-tabular-solver)
- 0.70426 - [XGBoost](https://www.kaggle.com/jamesmcguigan/tabular-playground-xgboost)
- [LightGBM](https://www.kaggle.com/jamesmcguigan/tabular-playground-lightgbm)