<h3>Import libraries</h3>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna
from sklearn.metrics import log_loss,accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

<h3>Data loading</h3>

In [None]:
# Read the three competition CSVs in a fixed order: training rows, test rows,
# and the sample submission that is later filled in and written back out.
train_df, test_df, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-aug-2021/train.csv",
        "../input/tabular-playground-series-aug-2021/test.csv",
        "../input/tabular-playground-series-aug-2021/sample_submission.csv",
    )
)

In [None]:
# Report the shape of each loaded frame.
for label, frame in (('Train:', train_df), ('Test:', test_df)):
    print(label, frame.shape)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Separate model inputs from the target. The "id" column is removed from both
# frames so it is not used as a feature; "loss" is the regression target.
train_X = train_df.drop(columns=["id", "loss"])
train_Y = train_df["loss"]
test_X = test_df.drop(columns=["id"])

<h3>Optuna</h3>

In [None]:
# Hold out the last 20% of the rows as the validation set used while tuning.
# With shuffle=False the split is purely positional, so random_state is
# accepted but has no influence on which rows land in each part.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    shuffle=False,
    random_state=1234,
    stratify=None,
)

def objective(trial):
    """Optuna objective: fit LightGBM on the hold-out split and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` produced by the preceding ``train_test_split`` call.

    Args:
        trial: Optuna trial object used to sample the hyper-parameters.

    Returns:
        Root mean squared error of the model's predictions on the validation
        split (lower is better).
    """
    # The sampling calls stay in a fixed order so that a seeded sampler draws
    # the same sequence of values from run to run.
    params = {
        "objective": "regression",
        "metric": "RMSE",
        "boosting_type": "gbdt",
        # suggest_float(..., log=True) / suggest_float(...) are the current
        # spellings of the deprecated suggest_loguniform / suggest_uniform.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 16),
        "min_sum_hessian_in_leaf": trial.suggest_int("min_sum_hessian_in_leaf", 1, 10),
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # The round budget is specified only here. Putting an `n_estimators`
    # entry in `params` as well would override `num_boost_round` (LightGBM
    # warns and uses the params alias), leaving two conflicting numbers in
    # the code; 1000 is the budget that alias previously enforced.
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of lgb.train only in LightGBM < 4.0; newer releases expect
    # the lgb.early_stopping / lgb.log_evaluation callbacks instead.
    model_lgb = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=1000,
        early_stopping_rounds=20,
        verbose_eval=0,
    )

    # Score with the best iteration found by early stopping.
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    score = np.sqrt(mean_squared_error(y_valid, y_pred))

    return score

# Random search over 50 trials with a fixed sampler seed. The study minimises
# the value returned by `objective`; the best parameter set is shown as the
# cell output.
sampler = optuna.samplers.RandomSampler(seed=0)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)
study.best_params

In [None]:
# LightGBM configuration used for the cross-validated models.
# NOTE(review): the tuned values are hard-coded; presumably they were copied
# from an earlier `study.best_params` output — confirm they are still current.
lgb_params = dict(
    # Fixed settings.
    objective="regression",
    metric="RMSE",
    boosting_type="gbdt",
    random_seed=1234,
    # Tuned settings.
    learning_rate=0.01450919354574922,
    num_leaves=132,
    max_bin=361,
    bagging_fraction=0.8088355063623027,
    bagging_freq=3,
    feature_fraction=0.4773163392797921,
    min_data_in_leaf=7,
    min_sum_hessian_in_leaf=10,
)

<h3>LGBM</h3>

In [None]:
# 10-fold cross-validation: train one LightGBM model per fold, record its
# validation RMSE and store its out-of-fold predictions.
kf = KFold(n_splits=10)  # no shuffling, so folds are contiguous row ranges
models = []  # fitted booster of every fold, reused for test-time averaging
rmses = []  # validation RMSE of every fold
oof = np.zeros(len(train_X))  # out-of-fold predictions, positionally aligned with train_X

for Fold, (train_index, val_index) in enumerate(kf.split(train_X)):
    print('Fold:', Fold)
    # NOTE: these names rebind the module-level X_train / X_valid / y_train /
    # y_valid that `objective` reads; after this loop they hold the last fold.
    X_train = train_X.iloc[train_index]
    X_valid = train_X.iloc[val_index]
    y_train = train_Y.iloc[train_index]
    y_valid = train_Y.iloc[val_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of lgb.train only in LightGBM < 4.0; newer releases expect
    # the lgb.early_stopping / lgb.log_evaluation callbacks instead.
    model_lgb = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=1000,
        early_stopping_rounds=300,
        verbose_eval=False
    )

    # Evaluate with the best iteration found by early stopping.
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    # The value is a root mean squared error, so it is labelled as such.
    print("RMSE:", tmp_rmse)
    print("=" * 50)
    models.append(model_lgb)
    rmses.append(tmp_rmse)
    oof[val_index] = y_pred

In [None]:
# Arithmetic mean of the per-fold validation RMSE values.
fold_count = len(rmses)
sum(rmses) / fold_count

<h3>Prediction</h3>

In [None]:
# Predict the test set with every fold model, then average the predictions
# element-wise across models (axis 0 indexes the models).
preds = [model.predict(test_X) for model in models]

preds_array = np.array(preds)
preds_mean = preds_array.mean(axis=0)

<h3>Submission</h3>

In [None]:
# Put the fold-averaged predictions into the submission frame's "loss" column
# and write it to disk without the index column.
sub['loss'] = preds_mean
sub.to_csv(path_or_buf='submission_lgb.csv', index=False)