This is a modified version of the following notebook: https://www.kaggle.com/hamzaghanmi/lgbm-hyperparameter-tuning-using-optuna

The main differences are the following:

1. Dummy encoding of the categorical variables
2. Modified ranges of the test options and increased number of test rounds
3. Use of GPU

In [None]:
import optuna
from lightgbm import LGBMRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the competition's train set, test set and sample submission file.
train, test, sub = (
    pd.read_csv(path)
    for path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
        '../input/tabular-playground-series-feb-2021/sample_submission.csv',
    )
)

In [None]:
# Column names: cat0..cat9 and cont0..cont13.
categorical_cols = [f'cat{idx}' for idx in range(10)]
continous_cols = [f'cont{idx}' for idx in range(14)]

# Regression target taken from the training frame.
target = train['target']

In [None]:
# Take independent copies of the categorical and the continuous columns of
# both datasets so they can be processed separately.
train_cat, test_cat = (frame[categorical_cols].copy() for frame in (train, test))
train_num, test_num = (frame[continous_cols].copy() for frame in (train, test))

train_num.head()

In [None]:
# One-hot encode train and test together so that both frames end up with the
# same set of dummy columns, then split the result back at the train/test
# boundary.
data = pd.concat([train_cat, test_cat], axis=0)
data = pd.get_dummies(data)
train_cat = data.iloc[:len(train)]
# The test rows were appended after the train rows, so they start at
# position len(train); slicing from 0 would return train rows instead.
test_cat = data.iloc[len(train):]

In [None]:
# Put the categorical and the continuous feature frames side by side
# (column-wise) to form the final train and test matrices.
train = pd.concat([train_cat, train_num], axis='columns')
test = pd.concat([test_cat, test_num], axis='columns')

train.head()

In [None]:
def objective(trial, data=train, target=target):
    """Optuna objective: fit an LGBMRegressor and return its hold-out RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Feature frame; split 80/20 into a training and a hold-out part.
        target: Target values aligned with ``data``.

    Returns:
        RMSE of the fitted model on the hold-out part (lower is better).
    """
    # Fixed seed: every trial is scored on the same hold-out split.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    param = {
        # Fixed settings.
        'metric': 'rmse',
        'random_state': 48,
        'n_estimators': 1000,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        # Sampled settings.
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.2, 1.0),
        # NOTE(review): LightGBM applies 'subsample' only when
        # 'subsample_freq' > 0, which is not set here - confirm whether
        # bagging was intended.
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 0.5),
        'max_depth': trial.suggest_categorical('max_depth', [5, 10, 20, 40, 100, -1]),
        # LightGBM requires num_leaves > 1, so the range starts at 2.
        'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Recorded in the study under the name 'min_data_per_groups' but
        # passed to the model as 'cat_smooth'.
        'cat_smooth': trial.suggest_int('min_data_per_groups', 1, 100),
    }
    model = LGBMRegressor(**param)

    # Early stopping is monitored on the hold-out part.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=20,
        verbose=False,
    )

    preds = model.predict(test_x)

    # squared=False makes mean_squared_error return the RMSE.
    rmse = mean_squared_error(test_y, preds, squared=False)

    return rmse

In [None]:
%%time
# Minimise the value returned by `objective` over 100 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

finished = len(study.trials)
print('Number of finished trials:', finished)
print('Best trial:', study.best_trial.params)

In [None]:
# plot_optimization_history: shows the scores from all trials as well as the
# best score so far at each point.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Visualize the hyperparameter importances of the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Start from the best sampled hyperparameters and derive the settings for the
# final models: many more estimators with a 25x smaller learning rate.
tuned = dict(study.best_params)
# The study stores this value under 'min_data_per_groups'; the model takes it
# as 'cat_smooth'.
smoothing = tuned.pop('min_data_per_groups')
params = {
    **tuned,
    'random_state': 48,
    'n_estimators': 25000,
    'learning_rate': tuned['learning_rate'] / 25,
    'cat_smooth': smoothing,
    'metric': 'rmse',
}

In [None]:
# 5-fold cross-validation: fit one model per fold, record its validation RMSE
# and average the test-set predictions of all fold models.
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=48, shuffle=True)
rmse = []  # validation RMSE of each fold
for fold, (trn_idx, val_idx) in enumerate(kf.split(train, target), start=1):
    # KFold yields positional indices, so select rows by position in both
    # the features and the target.
    X_tr, X_val = train.iloc[trn_idx], train.iloc[val_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    model = LGBMRegressor(**params)
    # Early stopping is monitored on the fold's validation part.
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=500,
        verbose=False,
    )

    # Each fold contributes 1/n_splits of the final test prediction.
    preds += model.predict(test) / kf.n_splits

    fold_rmse = mean_squared_error(y_val, model.predict(X_val), squared=False)
    rmse.append(fold_rmse)
    print(fold, fold_rmse)

In [None]:
# Average of the values collected in `rmse`.
np.mean(rmse)

In [None]:
# Replace the 'target' column of the sample submission with the predictions
# and write the result without the index column.
sub = sub.assign(target=preds)
sub.to_csv('submission.csv', index=False)