# TPS (August-2021)

## Importing libraries and loading the data into pandas DataFrames

In [None]:
import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import seaborn as sns
import optuna

import os
# Show every file that is available under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Read the competition files: training data, test data and the submission template.
df_train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-aug-2021/train.csv'
)
df_test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-aug-2021/test.csv'
)
df_sample_submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv'
)

print('Dataframes Created')

In [None]:
# Display the (rows, columns) shape of the training and test frames as the cell output.
df_train.shape, df_test.shape

## EDA

In [None]:
from scipy import stats
from scipy.stats import norm

# Two side-by-side panels for the target column `loss`:
#   left  - histogram with a KDE overlay,
#   right - probability (Q-Q) plot produced by scipy.stats.probplot.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

sns.histplot(x=df_train.loss, kde=True, ax=ax[0])
ax[0].set_title('Distribution of loss')

# Pass the target Axes explicitly instead of the pyplot module, so the plot
# lands in the right-hand panel regardless of which axes is "current".
res = stats.probplot(df_train['loss'], plot=ax[1])


In [None]:
# cols = 4
# rows = int(len(features)/(cols+1))
# f,ax = plt.subplots(nrows=rows,ncols=cols,figsize=(80,160),sharex=False)
# plt.subplots_adjust(hspace = 0.3)

# i=0
# for r in range(0,rows,1):
#     for c in range(0,cols,1):
#         if i>=len(features):
#             ax[r, c].set_visible(False)
#         else:
#             scatter = ax[r, c].scatter(df_train[features[i]].values,
#                                         df_train["loss"],
#                                         )
#             ax[r, c].set_title(features[i], fontsize=14, pad=5)
#             ax[r, c].tick_params(axis="y", labelsize=11)
#             ax[r, c].tick_params(axis="x", labelsize=11)
                                  
#         i+=1
# plt.show()

In [None]:
# Feature columns: everything except the target (`loss`), the fold label
# (`kfold`, if it has already been added) and the row identifier (`id`).
features = [c for c in df_train.columns if c not in ('loss','kfold','id')]

# Size the subplot grid from the number of features instead of hard-coding
# the row count, so no more rows of axes are allocated than are needed.
n_hist_cols = 4
n_hist_rows = max(1, int(np.ceil(len(features) / n_hist_cols)))

hist_features = df_train[features].hist(
    figsize=(130, 160),
    bins=50,
    grid=False,
    xlabelsize=8,
    ylabelsize=8,
    layout=(n_hist_rows, n_hist_cols),
)

This looks like a Tweedie distribution.

## ML Model

In [None]:
# Kfold (fold=5)
# Columns that must never be fed to the model: target, fold label, row id.
non_feature_cols = {'loss', 'kfold', 'id'}
features = [col for col in df_train.columns if col not in non_feature_cols]

# Keep only the model features in the test frame.
df_test = df_test.loc[:, features]

# 5-fold split: every row gets the number of the fold in which it is used
# for validation. -1 is only a placeholder written before the split.
df_train['kfold'] = -1
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (idx_train, idx_valid) in enumerate(kf.split(df_train)):
    # NOTE(review): idx_valid holds row positions but is used as .loc labels;
    # this assumes df_train still has its default 0..n-1 index - confirm.
    df_train.loc[idx_valid, 'kfold'] = fold

df_train.head(3)

### Hyperparameter Optimization using Optuna

In [None]:
def hp_optim(trial, folds=None):
    """Optuna objective: mean validation RMSE of an XGBoost regressor.

    One set of hyperparameters is drawn from `trial`, a model is trained once
    per requested fold (rows with `kfold != fold` are used for training, rows
    with `kfold == fold` for early stopping and scoring), and the RMSE values
    are averaged.

    Reads the module-level `df_train` (must contain `loss` and `kfold`) and
    `features`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    folds : iterable of int, optional
        Fold labels to evaluate on. Defaults to every distinct value of
        `df_train['kfold']`; note that the cost of a trial grows linearly
        with the number of folds. Pass e.g. `[0]` (through a lambda or
        functools.partial) for a cheaper single-fold objective.

    Returns
    -------
    float
        Mean RMSE over the evaluated folds (lower is better).

    Raises
    ------
    ValueError
        If `folds` is empty.
    """
    if folds is None:
        folds = sorted(df_train['kfold'].unique())
    folds = list(folds)
    if not folds:
        raise ValueError("hp_optim needs at least one fold to evaluate on")

    # Sample the hyperparameters once per trial so that every fold is scored
    # with the same configuration.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 10000, step=400),
        "random_state": 42,
        # GPU training/prediction: this objective needs a CUDA-enabled runtime.
        "tree_method": 'gpu_hist',
        "predictor": "gpu_predictor",
        "objective": "reg:squarederror",
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.05),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        # Each parameter needs its own name: re-using "lambda" here would make
        # gamma a copy of lambda and hide it from study.best_params.
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
        "max_depth": trial.suggest_int("max_depth", 4, 12)
    }

    scores = []
    for fold in folds:
        train_part = df_train[df_train.kfold != fold].reset_index(drop=True)
        valid_part = df_train[df_train.kfold == fold].reset_index(drop=True)

        X_train, y_train = train_part[features], train_part.loss
        X_valid, y_valid = valid_part[features], valid_part.loss

        model = XGBRegressor(**params)
        # Stop adding trees once the validation metric has not improved for
        # 300 rounds; progress is logged every 1000 rounds.
        model.fit(X_train, y_train, early_stopping_rounds=300, eval_set=[(X_valid, y_valid)], verbose=1000)

        ypred_valid = model.predict(X_valid)
        rmse = mean_squared_error(y_valid, ypred_valid, squared=False)
        scores.append(rmse)

    return np.mean(scores)

print('created')

In [None]:
# TPE is Optuna's default sampler; creating it explicitly only adds a fixed
# seed so that the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)

# Minimise the RMSE returned by hp_optim over 5 trials.
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(hp_optim, n_trials=5)

In [None]:
# Report how many trials the study holds and the hyperparameters of its best trial.
print('Number of finished trials:', len(study.trials))
print("\nBest params: ",study.best_params)

In [None]:
# Plot the objective value of each trial in the study (rendered as the cell output).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Show which hyperparameters mattered most for the objective, as a guide for
# narrowing or widening the suggested search ranges.
optuna.visualization.plot_param_importances(study)

### Final Model

In [None]:
# Fixed hyperparameters for the final models.
# NOTE(review): presumably copied from an earlier study.best_params - confirm.
# No tree_method is set here, so XGBoost's default is used.
params = {
    'n_estimators': 3600,
    'learning_rate': 0.018352422312250018,
    'colsample_bytree': 0.30280947434330335,
    'subsample': 0.65778738291577,
    'alpha': 0.016255685356625964,
    'lambda': 5.112552884270486e-08,
    'min_child_weight': 68.33661984654934,
    'max_depth': 5,
}

# The test matrix is the same for every fold, so it is prepared once.
X_test = df_test.copy()

# One model per fold: train on the other folds, report RMSE on the held-out
# fold, and keep that model's test-set predictions.
fold_preds = []
for fold in sorted(int(k) for k in df_train['kfold'].unique()):
    # The loop variable itself selects the split; with a different loop
    # variable every iteration would silently re-use one stale fold.
    train_part = df_train[df_train.kfold != fold].reset_index(drop=True)
    valid_part = df_train[df_train.kfold == fold].reset_index(drop=True)

    X_train, y_train = train_part[features], train_part.loss
    X_valid, y_valid = valid_part[features], valid_part.loss

    model = XGBRegressor(**params, random_state=42)
    model.fit(X_train, y_train)

    ypred_valid = model.predict(X_valid)
    fold_preds.append(model.predict(X_test))

    print(fold, mean_squared_error(y_valid, ypred_valid, squared=False))

# Final prediction = element-wise mean of the per-fold test predictions.
preds = np.mean(np.column_stack(fold_preds), axis=1)

In [None]:
# Preview the first rows of the submission template.
df_sample_submission.head()

In [None]:
# Fill the template's `loss` column with the averaged predictions.
# Bracket assignment always targets a column; attribute assignment would only
# set a plain Python attribute if the column did not already exist.
df_sample_submission['loss'] = preds

# Write the submission file without the DataFrame index.
df_sample_submission.to_csv("submission_TPS_1.csv", index=False)