In [None]:
# import standard libraries
import pandas as pd
import numpy as np
import lightgbm as lgbm
import optuna
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics
import seaborn as sns

# show all columns when a DataFrame is displayed
# (use the fully qualified key: the bare 'max_columns' pattern matches more than
# one option in newer pandas releases and is rejected as ambiguous)
pd.set_option('display.max_columns', None)

In [None]:
# load the train and test sets, leaving out the 'id' column of each
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv').drop(columns='id')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv').drop(columns='id')

# load the submission template (kept unchanged; it is filled in after training)
sample = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Simple EDA

In [None]:
# histograms with KDE for every column of the training set
# The grid is derived from the actual columns instead of assuming exactly
# 100 features named f0..f99 followed by 'loss'.
hist_columns = train.columns.tolist()
hist_grid_cols = 4
# ceiling division; keep at least one row so the figure height is never zero
hist_grid_rows = max(1, -(-len(hist_columns) // hist_grid_cols))

plt.figure(figsize=(24, 6 * hist_grid_rows))
for position, column in enumerate(hist_columns, start=1):
    plt.subplot(hist_grid_rows, hist_grid_cols, position)
    sns.histplot(train[column], kde=True)
plt.show()

In [None]:
# pairwise correlations between all columns, drawn as a heat map
corr = train.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, ax=heatmap_ax)
plt.show()

In [None]:
# for each column, the ten other columns with the highest (signed) correlation
cols = train.columns.tolist()
for col in cols:
    print(col)
    # Drop the column's own entry by label rather than skipping the first sorted
    # row: with ties at 1.0 the own entry is not guaranteed to sort first.
    print(corr[col].drop(col).sort_values(ascending=False).head(10))
    print('=======================')

In [None]:
# split the training frame into the target vector and the feature matrix (NumPy arrays)
y = train['loss'].values
X = train.drop(columns='loss').values

# LightGBM with Optimized Hyperparameters

In [None]:
# tuned hyperparameters
params = dict(
    min_child_weight=638.7295413674256,
    num_leaves=32,
    reg_alpha=0.7635991288488166,
    reg_lambda=93.08626337603258,
)

# settings chosen by hand
# NOTE(review): per the LightGBM parameter docs, row subsampling only takes effect
# when subsample_freq > 0; it is not set here, so subsample=0.7 may be inactive —
# confirm whether bagging was intended.
fixed_settings = dict(
    objective='rmse',
    metric='rmse',
    subsample=0.7,
    learning_rate=0.03,
    n_estimators=10000,
    n_jobs=-1,
)

# build the regressor from both groups of settings
model = lgbm.LGBMRegressor(**params, **fixed_settings)

# K-fold splitter used for cross validation
n_splits = 5
kf = model_selection.KFold(n_splits=n_splits)

# per-fold scores on the training and validation parts
scores_train, scores_valid = [], []

# accumulator for the test-set predictions, one entry per test row
preds_test_array = np.zeros(test.shape[0])

# KFold cross validation
# Convert the test frame once, outside the loop, so that prediction receives the
# same plain-array input type the model is fitted on.
test_values = test.values

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    # Fit on the raw target (no transform is applied).
    # Early stopping and periodic logging are passed as callbacks: the former
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() are no
    # longer accepted by current LightGBM releases.
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=100),
            lgbm.log_evaluation(period=100),
        ],
    )

    # clip the results so that the minimum and maximum values are 0 and 50, respectively
    preds_train = np.clip(model.predict(x_train), 0, 50)
    preds_valid = np.clip(model.predict(x_valid), 0, 50)
    preds_test = np.clip(model.predict(test_values), 0, 50)

    # each fold contributes 1/n_splits, so the array ends up holding the fold average
    preds_test_array += preds_test / n_splits

    # compute train and validation RMSE to see whether the model overfits
    score_train = np.sqrt(metrics.mean_squared_error(y_train, preds_train))
    score_valid = np.sqrt(metrics.mean_squared_error(y_valid, preds_valid))

    # print the fold's validation score
    print(score_valid)

    # keep both scores for the summary below
    scores_train.append(score_train)
    scores_valid.append(score_valid)

# mean and sample standard deviation (ddof=1) of the scores across folds
print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# populate the submission dataframe
# Replace the second column by name instead of writing into it positionally:
# in-place iloc assignment keeps the existing column's dtype, which newer pandas
# warns about (and may reject) when the stored dtype cannot hold float
# predictions. NOTE(review): the template column's dtype is not visible here.
prediction_column = sample.columns[1]
sample[prediction_column] = preds_test_array
sample.to_csv('lgbm_base_model_submission.csv', index=False)

The train and validation RMSEs are very close, so the risk of overfitting is small.