<font size="6">Simple LightGBM & KFold without feature engineering</font>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found,
# so the available datasets are visible in the cell output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the competition files. The train and test frames use their first CSV
# column as the index; the sample submission is loaded with a default index.
train_raw, test_raw = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/tabular-playground-series-aug-2021/train.csv",
        "../input/tabular-playground-series-aug-2021/test.csv",
    )
)
submission = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")

In [None]:
# Separate the "loss" target from the predictor columns. The test frame is
# copied so that later cells never modify `test_raw` itself.
train_y = train_raw["loss"]
train_X = train_raw.drop(columns=["loss"])
X_test = test_raw.copy()

<font size="5">Make model and prediction</font>

In [None]:
# K-fold training of a LightGBM regressor.
#
# For each fold a booster is trained on the training part, early-stopped on the
# held-out part, and then used to
#   * fill the out-of-fold slots of `oof_preds`, and
#   * add 1/`splits` of its test prediction to `model_preds` (fold average).
#
# Names left in scope for later cells: `model` (the booster from the LAST fold),
# `evals_results` (metric history), `oof_preds`, `model_preds`.
splits = 10
NUM_BOOST_ROUND = 500        # upper bound on boosting iterations per fold
EARLY_STOPPING_ROUNDS = 50   # stop when the validation metric has not improved for this many rounds

# The hyper-parameters are the same for every fold, so the dict is built once
# instead of being re-created inside the loop.
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": {"rmse"},
    "eta": 0.05,
    "max_depth": 40,
    "num_leaves": 10,
    "min_child_samples": 20,
    "feature_fraction": 0.5,
    "bagging_fraction": 0.5,
    "bagging_freq": 5,
    "verbose": -1
}

# Accumulators: averaged test prediction and out-of-fold prediction.
model_preds = np.zeros(X_test.shape[0])
oof_preds = np.zeros(train_X.shape[0])

# NOTE(review): stratification treats each distinct value of `train_y` as a
# class, so it only works while the target takes discrete values — confirm this
# is intended for a regression target.
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

# The same dict is handed to every fold's training run; the recording callback
# resets it when a run starts, so after the loop it holds the last fold only.
evals_results = {}

for i, (train_idx, valid_idx) in enumerate(skf.split(train_X, train_y)):
    X_train, y_train = train_X.iloc[train_idx], train_y.iloc[train_idx]
    X_valid, y_valid = train_X.iloc[valid_idx], train_y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Early stopping and metric recording go through callbacks: the
    # `early_stopping_rounds`, `evals_result` and `verbose_eval` keyword
    # arguments of `lgb.train` were removed in LightGBM 4.0.
    # `verbose=False` keeps the early-stopping callback silent, matching the
    # previous `verbose_eval=0`.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False),
            lgb.record_evaluation(evals_results),
        ],
    )

    # Predict the held-out rows once and reuse the result for the fold score.
    valid_preds = model.predict(X_valid)
    oof_preds[valid_idx] = valid_preds
    model_preds += model.predict(X_test) / splits
    print("Fold {} RMSE : {}".format(i, np.sqrt(mean_squared_error(y_valid, valid_preds))))
print("Total RMSE : {}".format(np.sqrt(mean_squared_error(train_y, oof_preds))))

In [None]:
# RMSE learning curves for the training and validation sets.
# NOTE(review): `evals_results` is the single dict passed to every fold's
# training call, so these curves presumably reflect only the final fold —
# verify before reading them as an all-fold summary.
fig, ax = plt.subplots(figsize=(8, 5))
ax.set_title("RMSE")
for split_name in ("train", "valid"):
    ax.plot(evals_results[split_name]["rmse"], label=split_name)
ax.set_xlabel("num round")
ax.set_ylabel("loss")
ax.legend()

<font size="5">Feature importance</font>

In [None]:
# Gain-based feature importance, shown for the ten highest-ranked features.
# `model` is whatever booster the training loop assigned last, i.e. the model
# of the final fold only.
feature = train_X.columns
importance = model.feature_importance(importance_type="gain")
importance_df = (
    pd.DataFrame({"feature": feature, "importance": importance})
    .sort_values("importance")
)
# Ascending sort puts the largest gains at the end, so the tail is the top 10.
(
    importance_df
    .set_index("feature")
    .tail(10)
    .plot.barh(figsize=(10, 8), title="Top 10 feature importances", fontsize=10, colormap="summer")
)

<font size="5">Distribution of predictions</font>

In [None]:
# Side-by-side histograms: averaged test predictions (left) and out-of-fold
# predictions on the training rows (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
fig.suptitle("Loss prediction distribution", fontsize=15)
fig.subplots_adjust(top=0.8)  # leave room for the figure title above the panels

panels = (
    (model_preds, "g", "Test prediction"),
    (oof_preds, "y", "Valid prediction"),
)
for axis, (values, face_color, heading) in zip(ax, panels):
    axis.hist(values, bins=20, color=face_color, edgecolor="k")
    axis.set_title(heading)
    axis.set_xlabel("Loss")

<font size="5">Submission</font>

In [None]:
# Put the fold-averaged test predictions into the "loss" column of the sample
# submission and write it out without the DataFrame index.
# NOTE(review): values are assigned by position — this assumes the sample
# submission rows are in the same order as the test file; confirm.
submission = submission.assign(loss=model_preds)
submission.to_csv("submission.csv", index=False)