In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import random
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, KFold, LeaveOneGroupOut
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables from the read-only Kaggle input directory.
train, test = map(
    pd.read_csv,
    (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    ),
)

In [None]:
# Report the (rows, columns) shape of both tables.
for label, frame in (('train shape:', train), ('test shape:', test)):
    print(label, frame.shape)

In [None]:
# `info` is a method: without the call the cell only displays the bound-method repr
# rather than the column dtype / non-null summary.
train.info()

In [None]:
# `info` is a method: without the call the cell only displays the bound-method repr
# rather than the column dtype / non-null summary.
test.info()

In [None]:
# Preview the first 10 rows of the training table.
train.head(10)

In [None]:
# Keep a separate handle on the regression target column.
target = train['loss']

# `id` is a row identifier, not a feature, so it is removed from both tables.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution no longer raises KeyError on the
# already-removed column.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
# Number of missing entries in each column of the training table.
train.isna().sum()

There are no missing values in the datasets.

# EDA


**Target (Loss) Distribution**
* In total, there are 43 discrete losses.
* The first 15 loss values account for the vast majority of all samples.
* The percentage share of each loss value decreases as the loss value increases, with the exception of columns 1 and 2 of the bar chart.

In [None]:
# Rows per distinct loss value, ordered by loss value. Computed once and reused
# for the bar positions, the bar heights and the percentage labels.
loss_counts = train["loss"].value_counts().sort_index()
loss_share = loss_counts.values / (len(train) / 100)  # share of all rows, in percent

fig, ax = plt.subplots(figsize=(16, 8))

# One colour per bar. A shorter colour list is cycled by matplotlib, which
# breaks the blue/orange alternation every time the list wraps around.
bar_colors = ["deepskyblue" if i % 2 == 0 else "darkorange"
              for i in range(len(loss_counts))]

bars = ax.bar(loss_counts.index,
              loss_counts.values,
              color=bar_colors,
              edgecolor="black")
ax.set_title("Loss Distribution", fontsize=20, pad=15)
ax.set_ylabel("Amount of Values", fontsize=14, labelpad=15)
ax.set_xlabel("Loss Value", fontsize=14, labelpad=10)
# Annotate every bar with its percentage share of the training rows.
ax.bar_label(bars, [f"{x:2.2f}%" for x in loss_share],
             padding=5, fontsize=10, rotation=90)
ax.grid(axis="y")

plt.show()

In [None]:
# Frequency table of the target: row count per loss value plus its share in percent.
loss_counts = train["loss"].value_counts()
target_df = loss_counts.to_frame()
# Assign a Series (aligned on the loss value) rather than a one-column DataFrame,
# so the cell does not depend on the frame having exactly one column.
target_df['ratio(%)'] = loss_counts / loss_counts.sum() * 100
target_df = target_df.sort_values('ratio(%)', ascending=False)
target_df

**Data Statistics**
* The features have very different value ranges (scales).

In [None]:
# Summary statistics per column, transposed so that each column becomes one row.
train.describe().transpose()

**Feature Distribution**
* The training and test sets are well matched.
* That is, the distribution of each feature is almost identical in the training and test data.

In [None]:
# Overlay the train and test kernel density estimates of every feature on a
# 10x10 grid, one panel per feature (columns are addressed as f0 ... f99).
fig, axes = plt.subplots(10, 10, figsize=(16, 16))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    # Label both curves so the shared legend can tell train from test.
    sns.kdeplot(data=train, x=f'f{idx}',
                fill=True, label='train',
                ax=ax)
    sns.kdeplot(data=test, x=f'f{idx}',
                fill=True, label='test',
                ax=ax)
    ax.spines['left'].set_visible(False)
    ax.set_title(f'f{idx}', loc='right', weight='bold', fontsize=10)

# A single figure-level legend is enough: every panel uses the same two curves.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left', ncol=2)

fig.supxlabel('Feature value (density: train vs. test)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

**Feature Correlation**
* As the heat map shows, the pairwise correlations of the features lie between about -0.03 and 0.03.
* This is quite small. So these features are weakly correlated.

In [None]:
# Heat map of the pairwise correlations between all columns of `train`.
fig, ax = plt.subplots(figsize=(12 , 12))

corr = train.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle already shows every pair once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, ax=ax,
        square=True, center=0, linewidth=1,
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        cbar_kws={"shrink": .82},
        mask=mask
       )

ax.set_title("Feature Correlation", fontsize=15)

plt.show()

# Data Preprocessing

In [None]:
# Target vector, copied so that it is independent of `train`.
y = train["loss"].copy()

# Standardise the features: the scaler is fitted on the training features only
# and the same transformation is then applied to the test features.
x_scaler = StandardScaler()
X = train.drop(columns="loss")
X = pd.DataFrame(x_scaler.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(x_scaler.transform(test), columns=test.columns)

In [None]:
# Summary statistics of the scaled training features.
X.describe()

In [None]:
# Summary statistics of the scaled test features.
X_test.describe()

# Hyperparameters optimization

Some ideas are from https://www.kaggle.com/maximkazantsev/tps-08-21-xgboost#Data-preparation

In [None]:
def train_model_optuna(trial):
    """
    Optuna objective: fit an XGBRegressor with trial-suggested hyperparameters.

    The module-level ``X`` / ``y`` are split 60/40 into a training and a
    validation part (stratified on ``y``). The model is trained with early
    stopping on the validation part, and the validation RMSE is returned so
    Optuna can compare hyperparameter combinations.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE of the validation predictions, with negative predictions clipped to 0.
    """
    # The split is seeded so that every trial is scored on the same validation
    # rows; otherwise the score differences between trials would partly reflect
    # the random split rather than the hyperparameters.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, stratify=y, test_size=0.4, random_state=42)

    # A set of hyperparameters to optimize by optuna.
    # Single-choice categoricals are constants; they are routed through the
    # trial so that they appear in `study.best_trial.params` as well.
    xgb_params = {
                 "n_estimators": trial.suggest_categorical('n_estimators', [10000]),
                 "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.8),
                 "subsample": trial.suggest_float('subsample', 0.5, 0.95),
                 "colsample_bytree": trial.suggest_float('colsample_bytree', 0.5, 0.95),
                 "max_depth": trial.suggest_int("max_depth", 5, 16),
                 "booster": trial.suggest_categorical('booster', ["gbtree"]),
                 "tree_method": trial.suggest_categorical('tree_method', ["gpu_hist"]),
                 "reg_lambda": trial.suggest_float('reg_lambda', 2, 100),
                 "reg_alpha": trial.suggest_float('reg_alpha', 1, 50),
                 "random_state": trial.suggest_categorical('random_state', [42]),
                 "n_jobs": trial.suggest_categorical('n_jobs', [4]),
                    }

    # Model loading and training.
    # NOTE(review): newer XGBoost releases expect `eval_metric` and
    # `early_stopping_rounds` on the constructor instead of `fit` - confirm
    # against the installed version.
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)

    print(f"Number of boosting rounds: {model.best_iteration}")
    # Negative predictions are replaced by 0 before scoring.
    oof = np.clip(model.predict(X_valid), 0, None)

    return np.sqrt(mean_squared_error(y_valid, oof))

In [None]:
# Run the hyperparameter search for a fixed wall-clock budget (in seconds).
# A trial count could be used instead by passing `n_trials` to `optimize`.
time_limit = 600

# TPE is Optuna's default sampler; it is created explicitly only to seed it,
# so that the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(direction='minimize',
                            study_name='XGBRegressor',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(train_model_optuna, timeout=time_limit)

# Showing optimization results
print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

In [None]:
# Fixed XGBoost hyperparameters used to train the final cross-validated models.
xgb_params = dict(
    n_estimators=10000,
    learning_rate=0.010363073485236518,
    subsample=0.8674277491335747,
    colsample_bytree=0.8405044208369009,
    max_depth=12,
    booster='gbtree',
    tree_method='gpu_hist',
    reg_lambda=75.59819964912153,
    reg_alpha=2.336931955219609,
    random_state=42,
    n_jobs=4,
)

# Train Model

In [None]:
%%time
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0],))
preds = 0
model_fi = 0
total_mean_rmse = 0

for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
    y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
    
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)
    
    preds += model.predict(X_test) / splits
    model_fi += model.feature_importances_
    oof_preds[valid_idx] = model.predict(X_valid)
    oof_preds[oof_preds < 0] = 0
#     fold_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(np.array(y_valid).reshape(-1,1)), y_scaler.inverse_transform(np.array(oof_preds[valid_idx]).reshape(-1,1))))
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_idx]))
    print(f"Fold {num} RMSE: {fold_rmse}")
#         print(f"Trees: {model.tree_count_}")
    total_mean_rmse += fold_rmse / splits
print(f"\nOverall RMSE: {total_mean_rmse}")    

# Feature Importance

In [None]:
# Table of features with their importance normalised to sum to 1,
# ordered from most to least important.
df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model_fi / model_fi.sum(),
}).sort_values("Importance", axis=0, ascending=False)

In [None]:
# Horizontal bar chart of the normalised feature importances.
# `df` (features sorted by descending importance) is built in the preceding
# cell, so it is reused here instead of being rebuilt.
x = np.arange(0, len(df["Feature"]))
height = 0.4

fig, ax = plt.subplots(figsize=(16, 30))
bars1 = ax.barh(x, df["Importance"], height=height,
                color="mediumorchid", edgecolor="black")
ax.set_title("Feature importances", fontsize=30, pad=15)
ax.set_ylabel("Feature names", fontsize=20, labelpad=15)
ax.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax.set_yticks(x)
ax.set_yticklabels(df["Feature"], fontsize=15)
ax.tick_params(axis="x", labelsize=15)
ax.grid(axis="x")
# Repeat the x axis on top so that values can be read at both ends of the tall figure.
ax2 = ax.secondary_xaxis('top')
ax2.set_xlabel("Feature importance", fontsize=20, labelpad=15)
ax2.tick_params(axis="x", labelsize=15)
ax.margins(0.04, 0.01)
# Put the most important feature at the top of the chart.
ax.invert_yaxis()

plt.show()

# Prediction

In [None]:
# Re-read the raw test file: the `id` column was dropped from `test` earlier,
# and the ids are needed again to build the submission.
test1 = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2021/test.csv")

In [None]:
# Submission table: one row per test id with the fold-averaged predicted loss.
predictions = pd.DataFrame({"id": test1["id"], "loss": preds})
predictions.head()

In [None]:
# Write the submission file. The header row defaults to the column names,
# and the row index is left out.
predictions.to_csv('xgbsubmission.csv', index=False)