In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
import optuna


# Widen the pandas display limits so that more rows/columns of a dataset are shown
pd.options.display.max_rows = 150
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = None  # no limit on the width of a cell's text
pd.options.display.float_format = "{:.5f}".format  # floats with 5 decimal places


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory
input_paths = (os.path.join(folder, name)
               for folder, _, names in os.walk('/kaggle/input')
               for name in names)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Data import**

In [None]:
# Read the competition train and test tables.
# low_memory=False makes pandas parse each file in one pass rather than in chunks.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
    low_memory=False,
)  # pass nrows=10000 as well for a quick dry run
# train["date_time"] = pd.to_datetime(train["date_time"], format="%Y-%m-%d %H:%M:%S")
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    low_memory=False,
)
# test["date_time"] = pd.to_datetime(test["date_time"], format="%Y-%m-%d %H:%M:%S")

# Column dtypes, non-null counts and the real ("deep") memory footprint of the train table
train.info(memory_usage="deep")

In [None]:
# Column dtypes, non-null counts and the real ("deep") memory footprint of the test table
test.info(memory_usage="deep")

In [None]:
# Preview the first ten rows of the train table
train.iloc[:10]

# **EDA**

In [None]:
# Palette shared by the plots below (15 entries; "sandybrown" is listed twice)
colors = [
    "lightcoral", "sandybrown", "darkorange",
    "mediumseagreen", "lightseagreen", "cornflowerblue",
    "mediumpurple", "palevioletred", "lightskyblue",
    "sandybrown", "yellowgreen", "indianred",
    "lightsteelblue", "mediumorchid", "deepskyblue",
]

In [None]:
# Pie chart comparing the number of rows in the train and test datasets
dataset_sizes = {"Train dataset": len(train), "Test dataset": len(test)}

fig, ax = plt.subplots(figsize=(5, 5))
fig.set_facecolor('white')
pie = ax.pie(list(dataset_sizes.values()),
             labels=list(dataset_sizes.keys()),
             colors=["salmon", "teal"],
             textprops={"fontsize": 15},
             autopct='%1.1f%%')  # print each wedge's share with one decimal
ax.set_title("Dataset length comparison", fontsize=18)
ax.axis("equal")  # equal axis scaling, so the pie is drawn as a circle
plt.show();

In [None]:
# Summary statistics of the train table, transposed so that each column becomes a row
train.describe().transpose()

In [None]:
# Total number of missing cells in the train and the test table, in that order
tuple(frame.isna().sum().sum() for frame in (train, test))

There are no missing values in either dataset.

Let's check target distribution.

In [None]:
# Number of training rows for each distinct value of the target ("loss")
train["loss"].value_counts()

In [None]:
# Bar chart of how often each loss value occurs; every bar is annotated with
# its share of the training rows in percent.
loss_counts = train["loss"].value_counts().sort_index()
loss_share_labels = [f"{x:2.2f}%" for x in loss_counts.values/(len(train)/100)]

fig, ax = plt.subplots(figsize=(16, 8))
bars = ax.bar(loss_counts.index, loss_counts.values,
              color=colors, edgecolor="black")
ax.bar_label(bars, loss_share_labels, padding=5, fontsize=10, rotation=90)

ax.set_title("Loss (target) distribution", fontsize=20, pad=15)
ax.set_xlabel("Loss (target) value", fontsize=14, labelpad=10)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)
ax.margins(0.025, 0.12)  # extra margin around the bars and their labels
ax.grid(axis="y")

plt.show();

Let's check the distribution of feature values in both datasets.

In [None]:
# Overlaid train/test histograms for every feature, laid out on a grid.
# The bin range of each panel is taken from the combined train+test values,
# so both histograms of a panel are drawn over the same 40 bins.
df = pd.concat([train.drop(["id", "loss"], axis=1), test.drop("id", axis=1)], axis=0)
columns = df.columns.values

cols = 3
# Ceiling division: exactly as many rows as needed (no spare, fully hidden row
# when the feature count is a multiple of `cols`), but never fewer than one.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,100),
                        sharex=False, squeeze=False)
plt.subplots_adjust(hspace = 0.3)

# (legend label, source table, bar colour) for the two overlaid histograms
datasets = (("Train Dataset", train, "deepskyblue"),
            ("Test Dataset", test, "palevioletred"))

# axs.flat walks the grid row by row, i.e. in feature order
for i, panel in enumerate(axs.flat):
    if i >= len(columns):
        # Unused trailing panels of the last row are hidden
        panel.set_visible(False)
        continue

    column = columns[i]
    value_range = (df[column].min(), df[column].max())
    for label, frame, color in datasets:
        panel.hist(frame[column].values,
                   range=value_range,
                   bins=40,
                   color=color,
                   edgecolor="black",
                   alpha=0.7,
                   label=label)
    panel.set_title(column, fontsize=14, pad=5)
    panel.tick_params(axis="y", labelsize=13)
    panel.tick_params(axis="x", labelsize=13)
    panel.grid(axis="y")
    panel.legend(fontsize=13)

#plt.suptitle("Feature values distribution in both datasets", y=0.99)
plt.show();

The feature distributions of the train and test datasets match pretty well.

In [None]:
# The five columns with the fewest distinct values
train.nunique().sort_values().head()

As you can see, feature f1 has the smallest number of unique values (289), so I don't think any feature should be treated as categorical.

Let's look at feature correlation.

In [None]:
# Pairwise correlations of all features and the target, rounded to 5 decimals
df = train.drop("id", axis=1).corr().round(5)

# Boolean mask covering the upper triangle (diagonal included): the cells
# above the diagonal only mirror those below it, so they are hidden
mask = np.triu(np.ones_like(df, dtype=bool))

# Tick-label settings shared by both axes
tick_label_style = dict(ha="right", rotation_mode="anchor", weight="normal")

plt.figure(figsize=(16,16))
ax = sns.heatmap(df, annot=False, mask=mask, cmap="RdBu",
                 annot_kws={"weight": "bold", "fontsize":13})
ax.set_title("Feature correlation heatmap", fontsize=17)
plt.setp(ax.get_xticklabels(), rotation=90, **tick_label_style)
plt.setp(ax.get_yticklabels(), rotation=0, **tick_label_style)
plt.show();

As you can see, the correlations lie between roughly -0.03 and 0.03, which is very small, so the features are only weakly correlated.

Some features have an especially low correlation with the target, even compared with the other features:

In [None]:
# Rows of the correlation matrix whose correlation with "loss" is strictly inside (-0.001, 0.001)
df.loc[df["loss"].abs() < 0.001, "loss"]

Let's visualize each feature vs loss.

In [None]:
# One scatter plot of the target ("loss") against each feature, laid out on a grid
columns = train.drop(["id", "loss"], axis=1).columns.values

cols = 4
# Ceiling division: exactly as many rows as needed (no spare, fully hidden row
# when the feature count is a multiple of `cols`), but never fewer than one.
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16,100),
                        sharex=False, squeeze=False)
plt.subplots_adjust(hspace = 0.3)

# axs.flat walks the grid row by row, i.e. in feature order
for i, panel in enumerate(axs.flat):
    if i >= len(columns):
        # Unused trailing panels of the last row are hidden
        panel.set_visible(False)
        continue

    # Each panel gets a colour drawn at random from the shared palette
    panel.scatter(train[columns[i]].values,
                  train["loss"],
                  color=random.choice(colors))
    panel.set_title(columns[i], fontsize=14, pad=5)
    panel.tick_params(axis="y", labelsize=11)
    panel.tick_params(axis="x", labelsize=11)

#plt.suptitle("Features vs loss", y=0.99)
plt.show();

# **Data preprocessing**

In [None]:
# Bucket the target into 10 equal-width bins; the bin label of each row is
# later used as the stratification key when splitting the data
target_bin_edges = np.histogram_bin_edges(train["loss"], bins=10)
# Open both outer edges so that no target value can fall outside the bins
target_bin_edges[[0, -1]] = (-np.inf, np.inf)
target_bins = pd.cut(train["loss"], bins=target_bin_edges, labels=np.arange(10))

# Number of training rows per bin
target_bins.value_counts()

In [None]:
# Standardise the features: the scaler is fitted on the train features only
# and then applied unchanged to both the train and the test features
train_features = train.drop(["id", "loss"], axis=1)
test_features = test.drop("id", axis=1)

x_scaler = StandardScaler()
x_scaler.fit(train_features)

X = pd.DataFrame(x_scaler.transform(train_features), columns=train_features.columns)
X_test = pd.DataFrame(x_scaler.transform(test_features), columns=test_features.columns)

# Independent copy of the target column
y = train["loss"].copy()

In [None]:
# Summary statistics of the scaled train features
X.describe()

In [None]:
# Summary statistics of the test features after applying the train-fitted scaler
X_test.describe()

In [None]:
# Smallest and largest value of the target
y.min(), y.max()

# **Hyperparameters optimization**

In [None]:
def train_model_optuna(trial, X_train, X_valid, y_train, y_valid):
    """
    Train an XGBoost regressor with a hyperparameter combination suggested by Optuna.

    The RMSE of the validation predictions is returned so that Optuna can
    estimate how effective the hyperparameters are.

    Parameters
    ----------
    trial
        Optuna trial; only its ``suggest_*`` methods are used.
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Features and target of the validation fold. They are passed to the
        model as the last ``eval_set`` entry and are used for the returned score.

    Returns
    -------
    float
        RMSE between ``y_valid`` and the validation predictions, after
        negative predictions have been set to 0.
    """
    # Search space. Every `suggest_categorical` below offers a single choice,
    # so those parameters are effectively fixed.
    xgb_params = {
                 "n_estimators": trial.suggest_categorical('n_estimators', [40000]),
                 "learning_rate": trial.suggest_float('learning_rate', 0.01, 1.0, step=0.01),
                 "subsample": trial.suggest_float('subsample', 0.5, 1, step=0.01),
                 "colsample_bytree": trial.suggest_float('colsample_bytree', 0.1, 1, step=0.01),
                 "max_depth": trial.suggest_int("max_depth", 1, 16),
                 "booster": trial.suggest_categorical('booster', ["gbtree"]),
                 "tree_method": trial.suggest_categorical('tree_method', ["gpu_hist"]),
                 "reg_lambda": trial.suggest_float('reg_lambda', 0.2, 100, step=0.1),
                 "reg_alpha": trial.suggest_float('reg_alpha', 0.1, 50, step=0.1),
                 "random_state": trial.suggest_categorical('random_state', [42]),
                 "n_jobs": trial.suggest_categorical('n_jobs', [4]),
#                  "min_child_weight": trial.suggest_int("min_child_weight", 10, 30),
                    }

    # Fit with early stopping on RMSE (100 rounds of patience); n_estimators
    # therefore only acts as an upper bound on the number of boosting rounds
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)

    print(f"Number of boosting rounds: {model.best_iteration}")

    # Negative predictions are floored at 0 before scoring
    oof = np.clip(model.predict(X_valid), 0, None)

    return np.sqrt(mean_squared_error(y_valid, oof))

The code below is commented out in order to save runtime.

In [None]:
# %%time
# # Splitting data into train and valid folds using target bins for stratification
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_idx, valid_idx in split.split(X, target_bins):
#     X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
#     y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
# # Uncomment the line below to make optuna show only warning messages.
# # While it stays commented out, the results of each iteration are shown.
# # optuna.logging.set_verbosity(optuna.logging.WARNING)
# time_limit = 3600 * 4
# study = optuna.create_study(direction='minimize')
# study.optimize(lambda trial: train_model_optuna(trial, X_train, X_valid,
#                                                     y_train, y_valid),
# #                n_trials = 100,
#                timeout=time_limit
#               )

# # Showing optimization results
# print('Number of finished trials:', len(study.trials))
# print('Best trial parameters:', study.best_trial.params)
# print('Best score:', study.best_value)

# **Model training**

In [None]:
# Hyperparameters optimized by Optuna
xgb_params = dict(
    n_estimators=40000,
    learning_rate=0.01,
    subsample=0.72,
    colsample_bytree=0.66,
    max_depth=6,
    booster='gbtree',
    tree_method='gpu_hist',
    reg_lambda=68.5,
    reg_alpha=21.5,
    random_state=42,
    n_jobs=4,
)

In [None]:
%%time
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0],))
preds = 0
model_fi = 0
total_mean_rmse = 0

for num, (train_idx, valid_idx) in enumerate(skf.split(X, target_bins)):
    X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
    y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
    
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)
    
    preds += model.predict(X_test) / splits
    model_fi += model.feature_importances_
    oof_preds[valid_idx] = model.predict(X_valid)
    oof_preds[oof_preds < 0] = 0
#     fold_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(np.array(y_valid).reshape(-1,1)), y_scaler.inverse_transform(np.array(oof_preds[valid_idx]).reshape(-1,1))))
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_idx]))
    print(f"Fold {num} RMSE: {fold_rmse}")
#         print(f"Trees: {model.tree_count_}")
    total_mean_rmse += fold_rmse / splits
print(f"\nOverall RMSE: {total_mean_rmse}")    

## **Feature importances**

In [None]:
# Importances summed over the fold models, rescaled to add up to 1 and
# ordered from the most to the least important feature
df = pd.DataFrame({"Feature": X.columns,
                   "Importance": model_fi / model_fi.sum()})
df = df.sort_values("Importance", axis=0, ascending=False)

x = np.arange(0, len(df["Feature"]))
height = 0.4

# Horizontal bar chart with one bar per feature
fig, ax = plt.subplots(figsize=(16, 30))
bars1 = ax.barh(x, df["Importance"], height=height,
                color="mediumorchid", edgecolor="black")

ax.set_title("Feature importances", fontsize=30, pad=15)
ax.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax.set_ylabel("Feature names", fontsize=20, labelpad=15)
ax.set_yticks(x)
ax.set_yticklabels(df["Feature"], fontsize=15)
ax.tick_params(axis="x", labelsize=15)
ax.grid(axis="x")

# Second x axis along the top edge of the plot
ax2 = ax.secondary_xaxis('top')
ax2.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax2.tick_params(axis="x", labelsize=15)

ax.margins(0.04, 0.01)
ax.invert_yaxis()  # first (largest) bar at the top

## **Submission**

In [None]:
# Submission table: the test ids next to the fold-averaged predictions
predictions = pd.DataFrame({"id": test["id"], "loss": preds})

# Written with the column names as header row and without the index column
predictions.to_csv('submission.csv', index=False)
predictions.head()