In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Matplotlib defaults for every figure in this notebook: draw a grid on all axes.
parameters = {'axes.grid': True}
for option, value in parameters.items():
    plt.rcParams[option] = value

import optuna
from optuna.samplers import TPESampler
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import mean_squared_error

In [None]:
# Load the competition train and test tables from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv'),
)

In [None]:
# Preview the first five rows of the training data.
df_train.head(n = 5)

In [None]:
# Remove the 'id' column from the training frame.
# The drop mutates df_train in place, so this cell can only be run once per load.

df_train.drop(columns = ['id'], inplace = True)

In [None]:
# Missing-value check: evaluates to True when no column of the training
# frame contains a null.

train_null_counts = df_train.isna().sum()
train_null_counts.max() == 0

In [None]:
# Preview the first five rows of the test data.
df_test.head(n = 5)

In [None]:
# Remove the 'id' column from the test frame.
# The drop mutates df_test in place, so this cell can only be run once per load.

df_test.drop(columns = ['id'], inplace = True)

In [None]:
# Missing-value check: evaluates to True when no column of the test
# frame contains a null.

test_null_counts = df_test.isna().sum()
test_null_counts.max() == 0

In [None]:
# Plotting KDE graphs of (up to) 15 distinct, randomly chosen features in train set.
# A seeded generator keeps the selection identical on every run, and sampling
# without replacement guarantees that no feature is drawn twice.

feature_names = df_train.columns.drop('loss')   # predictors only, never the target
n_plots = min(15, feature_names.size)
rng = np.random.RandomState(15)
chosen = rng.choice(feature_names.size, size = n_plots, replace = False)

fig, axes = plt.subplots(3, 5, figsize = (15, 10))
axes = axes.ravel()
for ax, i in zip(axes, chosen):
    sns.kdeplot(x = df_train[feature_names[i]], ax = ax)
    ax.set_title(feature_names[i])
for ax in axes[n_plots:]:
    ax.set_visible(False)   # hide panels left empty when fewer than 15 features exist
fig.tight_layout()
print(f'{n_plots} graphs of random features in train set')
plt.show()

In [None]:
# Histogram of the target column with a KDE overlay.

fig, ax = plt.subplots(figsize = (15, 5))
sns.histplot(x = df_train['loss'], kde = True, ax = ax)
ax.set_title('Distribution of target (loss)')

In [None]:
# Feature-only copy of the training frame (target column 'loss' removed).
df_train_no_target = df_train.drop(columns = ['loss'])

In [None]:
# Standardization
#
# The scaler is fitted on the training features only. The test set is then
# transformed with those same means and standard deviations, so that a given
# raw value maps to the same scaled value in both sets. Re-fitting on the test
# set would shift every feature relative to what the model was trained on.
#
# NOTE: this cell overwrites df_test with its scaled version, so re-running it
# without reloading the data would scale the test set a second time.

scaler = StandardScaler()

df_train_no_target_scal = pd.DataFrame(
    scaler.fit_transform(df_train_no_target),
    columns = df_train_no_target.columns,
)
df_test = pd.DataFrame(scaler.transform(df_test), columns = df_test.columns)

In [None]:
# Preview the first five rows of the scaled training features.
df_train_no_target_scal.head(n = 5)

In [None]:
# Preview the first five rows of the scaled test features.
df_test.head(n = 5)

In [None]:
# Modelling inputs: scaled feature matrix and the 'loss' target.
X, y = df_train_no_target_scal, df_train['loss']

In [None]:
# Hold-out split: 70% of the rows for fitting, 30% for evaluation (fixed seed).

holdout_split = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 15,
)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Baseline: LightGBM regressor with default hyper-parameters, fitted on the
# hold-out training part and used to predict both parts of the split.
# NOTE(review): `silent` appears to be deprecated in newer LightGBM releases in
# favour of `verbose` — confirm against the installed version.

model = lgb.LGBMRegressor(random_state = 10, silent = False)
model.fit(X_train, y_train)
preds_train = model.predict(X_train)
preds_test = model.predict(X_test)

In [None]:
# Root-mean-squared error of the baseline on the hold-out and training parts.

rmse_test = np.sqrt(mean_squared_error(y_test, preds_test))
rmse_train = np.sqrt(mean_squared_error(y_train, preds_train))
print(f" Test RMSE score:     {rmse_test}")
print(f" Train RMSE score:    {rmse_train}")

In [None]:
def check_model(model = model, n_splits = 10, random_state = 17):
    """Print the mean and standard deviation of the K-fold cross-validated RMSE.

    The notebook-level `X` (features) and `y` (target) are split into
    `n_splits` shuffled folds. For each fold a fresh copy of `model` is fitted
    on the training part and scored on the held-out part.

    Parameters
    ----------
    model : estimator
        Estimator exposing `fit` / `predict`. The default is whatever the
        global `model` referred to when this function was defined. The
        estimator passed in is never refitted: each fold works on an unfitted
        clone, so a model the caller has already trained stays intact.
    n_splits : int
        Number of cross-validation folds.
    random_state : int or None
        Seed for the fold shuffling. A fixed seed makes the printed scores
        repeatable and lets different models be compared on identical folds;
        pass None for a different shuffle on every call.

    Returns
    -------
    None
        The scores are only printed.
    """
    from sklearn.base import clone

    cv = KFold(n_splits, shuffle = True, random_state = random_state)
    scores = []

    for train_idx, test_idx in cv.split(X):
        fold_model = clone(model)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_preds = fold_model.predict(X.iloc[test_idx])
        scores.append(np.sqrt(mean_squared_error(y.iloc[test_idx], fold_preds)))

    print('************************************')
    print(f"Mean RMSE score:       {np.mean(scores)}")
    print(f"Std RMSE:              {np.std(scores)}")

In [None]:
# Cross-validated mean RMSE of the baseline model

check_model(model = model)

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold cross-validated RMSE of a LightGBM regressor.

    Hyper-parameters are drawn from `trial`, a regressor is built from them,
    and it is refitted on each training fold of the notebook-level `X` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean RMSE over the folds (lower is better).
    """
    # `suggest_float(step=...)` / `suggest_float(log=True)` replace the
    # deprecated `suggest_discrete_uniform` / `suggest_loguniform`; `step` is
    # passed by keyword to `suggest_int` for the same forward-compatibility reason.
    # NOTE(review): LightGBM documents that bagging (`subsample`) is only
    # applied when `subsample_freq` > 0; it is left at its default here, so
    # this dimension may have no effect — confirm.
    params = {'n_estimators': 100,
              'subsample': trial.suggest_float('subsample', 0.7, 0.9, step = 0.1),
              'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.9, step = 0.1),
              'max_depth': trial.suggest_int("max_depth", 5, 50, step = 15),
              'reg_alpha': trial.suggest_int("reg_alpha", 0, 60, step = 20),
              'reg_lambda': trial.suggest_int('reg_lambda', 0, 60, step = 20),
              'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.3, log = True)}

    model = lgb.LGBMRegressor(**params, random_state = 17)

    # Fixed seed: every trial is scored on the same folds, so differences in
    # the returned value come from the hyper-parameters, not from the shuffle.
    cv = KFold(n_splits = 10, shuffle = True, random_state = 17)
    fold_rmse = []

    for train_idx, test_idx in cv.split(X):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_preds = model.predict(X.iloc[test_idx])
        fold_rmse.append(np.sqrt(mean_squared_error(y.iloc[test_idx], fold_preds)))

    return float(np.mean(fold_rmse))

In [None]:
# Improving parameters by optuna
#
# The TPE sampler is seeded so that the sequence of proposed hyper-parameters
# does not depend on an arbitrary random state.

study = optuna.create_study(direction = "minimize", sampler = TPESampler(seed = 17))
study.optimize(objective, n_trials = 150)

In [None]:
# Hyper-parameters of the best trial found by the study (shown as cell output).
params = study.best_params
params

In [None]:
# Cross-validated mean RMSE with the tuned hyper-parameters

tuned_model = lgb.LGBMRegressor(n_estimators = 100, **params)
check_model(model = tuned_model)

In [None]:
# Final model: refit on the full training set with the tuned hyper-parameters
# and predict the competition test set.
model = lgb.LGBMRegressor(n_estimators = 100, **params)
model.fit(X, y)
# Keep the raw float predictions. Casting with astype(int) truncates toward
# zero, which shifts every positive prediction downward and can only worsen
# the RMSE this notebook optimises.
preds = model.predict(df_test)

In [None]:
# Build the submission: take the sample file as a template, overwrite its
# 'loss' column with the predictions, and show five random rows.
sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')
sub = sub.assign(loss = preds)
sub.sample(n = 5)

In [None]:
# Write the submission file without the DataFrame index.
sub.to_csv(path_or_buf = 'LightGBM+Optuna.csv', index = False)