In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1) Load Library & Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: pick the 'fivethirtyeight' matplotlib style, then apply seaborn
# settings with a doubled font scale.
# NOTE(review): sns.set() also applies seaborn's own theme, which presumably
# overrides much of the style selected on the previous line — confirm which look is intended.
plt.style.use('fivethirtyeight')
sns.set(font_scale=2)

import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
# Set up Plotly for notebook output before any iplot call is made.
init_notebook_mode(connected=True)

import missingno as msno

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides numerical RuntimeWarnings (e.g. invalid values
# produced by log transforms), so problems in later cells will not be reported.
warnings.filterwarnings('ignore')

%matplotlib inline

# Let pandas show up to 100 columns when displaying wide frames.
pd.set_option('display.max_columns', 100)

In [None]:
# Read the competition's train and test tables from the input directory.
train_path = '../input/tabular-playground-series-aug-2021/train.csv'
test_path = '../input/tabular-playground-series-aug-2021/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Report the dimensions of both tables.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print("Train Dataset : {} rows x {} columns".format(train_rows, train_cols))
print("Test  Dataset : {} rows x {} columns".format(test_rows, test_cols))

In [None]:
# Feature matrix: every column except the row id and the target.
X_train = train.drop(['id', 'loss'], axis=1)
# Target vector: the `loss` column taken from the id-less frame.
Y_train = train.drop('id', axis=1)['loss']

In [None]:
# Preview the first five rows of the feature matrix.
X_train.head(5)

In [None]:
# Keep the test ids aside, then drop them to obtain the test feature matrix.
TestID = test.loc[:, 'id']
X_test = test.drop('id', axis=1)

There are no missing values.

In [None]:
# Total number of missing cells across all columns of train.
train.isnull().sum().sum()

In [None]:
# Total number of missing cells across all columns of test.
test.isnull().sum().sum()

In [None]:
# Print pandas' structural summary of train (DataFrame.info()).
train.info()

# 2) EDA

In [None]:
# Descriptive statistics for train's columns (DataFrame.describe()).
train.describe()

## Target Value Distribution
- The target `loss` (Y) is distributed as follows:

In [None]:
# Number of rows for each distinct loss value.
Y_train.value_counts()

In [None]:
# Count how often each loss value occurs, ordered by loss value.
loss_counts = train['loss'].value_counts().sort_index()

# One bar per loss value; bar height is the number of rows with that value.
data = [go.Bar(x=loss_counts.index, y=loss_counts.values)]
layout = go.Layout(title='Target Variable(loss) Distribution')
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

- Target Value's Ratio

In [None]:
# Share of each loss value in percent, ordered by loss value.
# The counts are computed once and reused rather than re-counted per printed line.
loss_counts = train["loss"].value_counts().sort_index()
ratio = [(x / train.shape[0] * 100) for x in loss_counts.values]

# Pair every loss value with its share and print one line per value.
for loss_value, r in zip(loss_counts.index, ratio):
  print("{} : {:2.2f}%".format(loss_value, r))

In [None]:
# One legend label per pie wedge, taken from the loss values actually present.
# A hard-coded range(0, 43) would mislabel wedges whenever a value is missing
# from, or additional to, that range. The order matches `ratio`, which is also
# built from value_counts().sort_index().
labels = train['loss'].value_counts().sort_index().index.tolist()
frequency = ratio

# One distinct hue per label.
colors = sns.color_palette('hls', len(labels))

fig = plt.figure(figsize=(25,25))
fig.set_facecolor('white')
ax = fig.add_subplot()

# Wedges start at 12 o'clock and run clockwise; each is annotated with its percentage.
pie = ax.pie(frequency, startangle=90, counterclock=False, autopct=lambda p : '{:.2f}%'.format(p), colors = colors)

# pie[0] holds the wedge patches, which the legend pairs with the labels.
plt.legend(pie[0], labels)
plt.show()

## Feature Distribution

In [None]:
from sklearn.preprocessing import StandardScaler
StdScaler = StandardScaler()

cols = ['f{}'.format(i) for i in range(100)]
# Learn mean/std on train only and reuse them for test, so both sets are
# expressed on the same scale and their distributions remain comparable.
train[cols] = StdScaler.fit_transform(train[cols])
test[cols] = StdScaler.transform(test[cols])

# Standardised values are frequently <= -1, where a plain log1p yields NaN/-inf.
# The sign-preserving form sign(x) * log1p(|x|) compresses the tails the same
# way but is defined for every real x.
train[cols] = np.sign(train[cols]) * np.log1p(np.abs(train[cols]))
test[cols] = np.sign(test[cols]) * np.log1p(np.abs(test[cols]))

In [None]:
# Disabled code kept as a string literal: a single 10x10 grid overlaying the
# train (red) and test (blue) KDE of every feature. Nothing in it is executed.
"""
fig, ax = plt.subplots(10, 10, figsize=(18,18))

for i in range(0, 10):
  for j in range(0, 10):
    sns.kdeplot(data=train, x='f{}'.format(i*10+j), fill=True, color='Red', shade=True, ax=ax[i,j])
    sns.kdeplot(data=test, x='f{}'.format(i*10+j), fill=True, color='Blue', shade=True, ax=ax[i,j])
    
    ax[i,j].set_xticks([])
    ax[i,j].set_yticks([])
    ax[i,j].set_xlabel('')
    ax[i,j].set_ylabel('')
    ax[i,j].set_title('f{}'.format(i*10+j), loc='center', fontsize=15)
"""

- Compare the distribution of Train and Test.

In [None]:
# f0 ~ f9: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f10 ~ f19: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+10)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f20 ~ f29: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+20)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f30 ~ f39: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+30)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f40 ~ f49: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+40)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f50 ~ f59: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+50)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f60 ~ f69: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+60)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f70 ~ f79: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+70)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f80 ~ f89: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+80)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

In [None]:
# f90 ~ f99: train KDEs on the top row (red), test KDEs on the bottom row (blue)
fig, ax = plt.subplots(2, 10, figsize=(18,4))

for i in range(0,10):
    col = 'f{}'.format(i+90)
    # fill=True already shades the curve; the deprecated `shade` alias is omitted.
    sns.kdeplot(data=train, x=col, fill=True, color='Red', ax=ax[0,i])
    sns.kdeplot(data=test, x=col, fill=True, color='Blue', ax=ax[1,i])

    # Strip ticks and axis labels so only the feature name remains on each panel.
    for panel in ax[:, i]:
        panel.set_xticks([])
        panel.set_yticks([])
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(col, loc='center', fontsize=15)

# 3) Base Modeling

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
def eval_rmse(model, x, y):
  """Return the root-mean-squared error of `model`'s predictions for `x` against `y`."""
  mse = mean_squared_error(y, model.predict(x))
  return np.sqrt(mse)

In [None]:
# Split the training data into fit and validation parts with a fixed seed;
# no test_size is passed, so train_test_split's default proportion applies.
tr_X, val_X, tr_Y, val_Y = train_test_split(X_train, Y_train, random_state=26)

In [None]:
# Baseline XGBoost regressor with hand-picked hyper-parameters.
base_params = dict(
    n_estimators=2000,
    learning_rate=0.08,
    colsample_bytree=0.22,
    subsample=0.99,
    random_state=1,
    reg_alpha=5,
    tree_method='gpu_hist',
)
base_model = XGBRegressor(**base_params)

# Validation RMSE is tracked on the hold-out split, with a 70-round early-stopping window.
base_model.fit(
    tr_X,
    tr_Y,
    eval_set=[(val_X, val_Y)],
    eval_metric="rmse",
    early_stopping_rounds=70,
    verbose=False,
)

In [None]:
# Report the baseline model's RMSE on the validation split.
print(f"Base Model(XGB Regressor) RMSE : {eval_rmse(base_model, val_X, val_Y)}")

# 4) Optuna (XGBRegressor)

In [None]:
import optuna

In [None]:
def objectiveXGB(trial,data,target):
    """Optuna objective: fit an XGBRegressor with trial-suggested settings and return hold-out RMSE.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameter values.
    data : feature table; split 75/25 into fit and hold-out parts with a fixed seed.
    target : target values aligned with `data`.

    Returns
    -------
    RMSE of the fitted model on the 25% hold-out part.
    """
    X_tr, X_tst, y_tr, y_tst = train_test_split(data, target, test_size=0.25,random_state=42)
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 4000),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "gamma": trial.suggest_int("gamma", 1, 3),
        # The step size is tuned under its scikit-learn name only. Passing a fixed
        # `learning_rate` next to a tuned `eta` (XGBoost's alias for the same
        # parameter) sets one knob twice and leaves it unclear which value is used;
        # it also meant the fixed value never reached study.best_trial.params.
        "learning_rate": trial.suggest_loguniform("learning_rate",1e-3,0.1),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree",0.5,1,0.1),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.6, 1, 0.1),
        "reg_alpha": trial.suggest_int("reg_alpha",1,50),
        "reg_lambda": trial.suggest_int("reg_lambda",5,100),
    }
    model = XGBRegressor(**params, tree_method='gpu_hist', random_state=42)
    model.fit(X_tr,y_tr,eval_set=[(X_tst,y_tst)],verbose = False,eval_metric='rmse')

    # Score the trial on the hold-out part.
    y_preds = model.predict(X_tst)
    loss = np.sqrt(mean_squared_error(y_tst, y_preds))

    return loss

In [None]:
# Run 50 Optuna trials minimising the value returned by objectiveXGB.
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objectiveXGB(trial, X_train, Y_train), n_trials=50)

# Summarise the best trial between two separator lines.
separator = "========================================================================================================================"
best_xgb_trial = study.best_trial
print(separator)
print('Best trial - XGBoost')
print('score : {}'.format(best_xgb_trial.value))
print('params : {}'.format(best_xgb_trial.params))
print(separator)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Start from the best Optuna parameters and add the fixed training settings.
xgb_params = study.best_trial.params
xgb_params.update(tree_method='gpu_hist', random_state=156)
preds=None
n_folds = 10

kf = StratifiedKFold(n_splits = n_folds , shuffle = True , random_state = 156)

# Plain arrays are extracted once and indexed by fold below.
train_features = X_train.values
train_target = Y_train.values
test_features = X_test.values

for fold_no, (tr_idx , val_idx) in enumerate(kf.split(train_features , train_target), start=1):

    print("Fold {}".format(fold_no))
    X_tr, Y_tr = train_features[tr_idx], train_target[tr_idx]
    X_val, Y_val = train_features[val_idx], train_target[val_idx]

    model = XGBRegressor(**xgb_params)
    model.fit(X_tr, Y_tr, eval_set = [(X_val, Y_val)], verbose = False)

    # Per-fold validation RMSE.
    val_preds = model.predict(X_val)
    print(np.sqrt(mean_squared_error(Y_val, val_preds)))

    # Accumulate this fold's test predictions.
    fold_test_preds = model.predict(test_features)
    preds = fold_test_preds if preds is None else preds + fold_test_preds

# Average the accumulated test predictions over the folds.
preds = preds / n_folds

In [None]:
# Disabled code kept as a string literal: a GridSearchCV run over XGBRegressor
# settings scored with a negated RMSE scorer. Nothing in it is executed.
"""
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.metrics import make_scorer
def root_mean_squared_error(y_true, y_predict): #RMSE
    return np.sqrt(mean_squared_error(y_true, y_predict)) 
rmse = make_scorer(root_mean_squared_error,greater_is_better=False)

xgb1 = XGBRegressor()
parameters = {'nthread':[4],
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07],
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500],
              'tree_method': ['gpu_hist']}

xgb_grid = GridSearchCV(xgb1,param_grid=parameters, scoring=rmse, cv=5, n_jobs=5, verbose=True)

xgb_grid.fit(X_train, Y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
"""

In [None]:
# Disabled code kept as a string literal: test predictions from the best
# grid-search estimator. Nothing in it is executed.
"""
best_estimator = xgb_grid.best_estimator_
preds = best_estimator.predict(X_test)
"""

In [None]:
# Build the submission table: test ids next to the fold-averaged predictions.
pred_df = pd.DataFrame({"id": test["id"], "loss": preds})

# Write it without the index, with the column names as the header row.
pred_df.to_csv('submission_xgb.csv', index=False, header=True)
pred_df.head(10)

# 5) Optuna (LightGBM)

In [None]:
from lightgbm import LGBMRegressor

def objectiveLGBM(trial,data,target):
    """Optuna objective: fit an LGBMRegressor with trial-suggested settings and return hold-out RMSE.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameter values.
    data : feature table; split 75/25 into fit and hold-out parts with a fixed seed.
    target : target values aligned with `data`.

    Returns
    -------
    RMSE of the fitted model on the 25% hold-out part.
    """
    X_tr, X_tst, y_tr, y_tst = train_test_split(data, target, test_size=0.25,random_state=42)
    params = {
        'metric': 'rmse',
        'random_state': 48,
        "n_estimators": trial.suggest_int("n_estimators", 500, 4000),
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.2, 1.0),
        # NOTE(review): LightGBM presumably applies `subsample` only when
        # `subsample_freq` is > 0 — confirm whether bagging is meant to be active.
        'subsample': trial.suggest_loguniform('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 0.5),
        'max_depth': trial.suggest_categorical('max_depth', [5,10,20,40,100, -1]),
        # LightGBM requires num_leaves > 1, so the search range starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # The Optuna name matches the LightGBM parameter so that
        # study.best_trial.params can be passed to LGBMRegressor unchanged.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)
    }
    model = LGBMRegressor(**params)
    model.fit(X_tr,y_tr,eval_set=[(X_tst,y_tst)],verbose = False, early_stopping_rounds=30)

    # Score the trial on the hold-out part.
    y_preds = model.predict(X_tst)
    loss = np.sqrt(mean_squared_error(y_tst, y_preds))

    return loss

In [None]:
# Run 50 Optuna trials minimising the value returned by objectiveLGBM.
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objectiveLGBM(trial, X_train, Y_train), n_trials=50)

# Summarise the best trial between two separator lines.
separator = "========================================================================================================================"
best_lgbm_trial = study.best_trial
print(separator)
print('Best trial - LightGBM')
print('score : {}'.format(best_lgbm_trial.value))
print('params : {}'.format(best_lgbm_trial.params))
print(separator)

In [None]:
from sklearn.model_selection import StratifiedKFold

lgbm_params = dict(study.best_trial.params)
# Optuna records only the *suggested* values, so the constants that
# objectiveLGBM fixed during the search are restored here; otherwise the final
# models would be trained with a different metric, seed and device than the
# configuration that was tuned.
lgbm_params.update({
    'metric': 'rmse',
    'random_state': 48,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
})
# A study may have stored the cat_smooth value under the name
# 'min_data_per_groups'; hand it to LightGBM under the parameter it tuned.
if 'min_data_per_groups' in lgbm_params:
    lgbm_params['cat_smooth'] = lgbm_params.pop('min_data_per_groups')

preds=None
n_folds = 10

kf = StratifiedKFold(n_splits = n_folds , shuffle = True , random_state = 156)

# Plain arrays are extracted once and indexed by fold below.
train_features = X_train.values
train_target = Y_train.values
test_features = X_test.values

for fold, (tr_idx , val_idx) in enumerate(kf.split(train_features , train_target)):

    print("Fold {}".format(fold + 1))
    X_tr, X_val = train_features[tr_idx] , train_features[val_idx]
    Y_tr, Y_val = train_target[tr_idx] , train_target[val_idx]

    eval_set = [(X_val, Y_val)]

    model = LGBMRegressor(**lgbm_params)
    model.fit(X_tr, Y_tr, eval_set = eval_set, verbose = False, early_stopping_rounds=30)

    # Per-fold validation RMSE.
    val_preds = model.predict(X_val)
    print(np.sqrt(mean_squared_error(Y_val, val_preds)))

    # Accumulate this fold's test predictions.
    if preds is None:
        preds = model.predict(test_features)
    else:
        preds += model.predict(test_features)

# Average the accumulated test predictions over the folds.
preds = preds / n_folds

In [None]:
# Build the submission table: test ids next to the fold-averaged predictions.
pred_df = pd.DataFrame({"id": test["id"], "loss": preds})

# Write it without the index, with the column names as the header row.
pred_df.to_csv('submission_lgbm.csv', index=False, header=True)
pred_df.head(10)