In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score,KFold,StratifiedKFold
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the three competition CSV files in one pass: training data, test data,
# and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame, transposed so each column becomes a row.
train.describe().T

In [None]:
# Dimensions of the training frame as (rows, columns).
print("train shape: ",train.shape)

In [None]:
# Dimensions of the test frame as (rows, columns).
test.shape

In [None]:
# DataFrame.info() prints its report and returns None. Calling the two as
# separate statements keeps the cell from displaying a stray `(None, None)`
# tuple after the reports.
train.info()
test.info()

In [None]:
# Target column, then the feature matrices with the 'id' column (and, for the
# training frame, the target) removed.
y = train['loss']
X = train.drop(columns=['id', 'loss'])
X_test = test.drop(columns=['id'])

In [None]:
train["loss"].describe()

In [None]:
sum(train.isnull().sum())

In [None]:
sum(test.isnull().sum())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
fig = plt.figure(figsize= (14,8))
target_cnt = train.loss.value_counts().sort_index()
sns.barplot(x=target_cnt.index,y=target_cnt)

In [None]:
fig = plt.figure(figsize= (14,8))
target_cnt = train.loss.value_counts().sort_index()
sns.regplot(x=target_cnt.index,y=target_cnt)

In [None]:
ss = StandardScaler()
X = ss.fit_transform(X)
X_test = ss.transform(X_test)

In [None]:
y_min = y.min()
y_max = y.max()

In [None]:
from sklearn.metrics import mean_squared_error
def my_rmse(y_true, y_hat, lower=None, upper=None):
    """Root-mean-squared error after clipping both inputs to a value range.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_hat : array-like
        Predicted values, same shape as ``y_true``.
    lower, upper : scalar, optional
        Clipping bounds. When omitted they fall back to the module-level
        ``y_min`` / ``y_max`` (the training-target range).

    Returns
    -------
    float
        RMSE between the clipped ``y_true`` and the clipped ``y_hat``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.

    Notes
    -----
    The inputs are never modified: clipping is done on fresh arrays rather than
    by masked assignment into the caller's Series/array.
    """
    lower = y_min if lower is None else lower
    upper = y_max if upper is None else upper

    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_hat, dtype=float)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            'y_true and y_hat must have the same shape, got {} and {}'.format(
                true_arr.shape, pred_arr.shape))
    if true_arr.size == 0:
        raise ValueError('y_true and y_hat must not be empty')

    # np.clip without `out=` returns new arrays, so the caller's data is untouched.
    true_clipped = np.clip(true_arr, lower, upper)
    pred_clipped = np.clip(pred_arr, lower, upper)

    # Computed directly with NumPy so the result does not depend on the
    # `squared=` keyword of sklearn's mean_squared_error.
    return float(np.sqrt(np.mean((true_clipped - pred_clipped) ** 2)))

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
import xgboost as xgb
import numpy as np

def objective(trial, split_seed=42):
    """Optuna objective: fit one XGBoost Tweedie regressor and score it on a hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    split_seed : int, optional
        Seed for the train/validation split. Keeping it fixed means every trial
        is scored on the same hold-out rows, so trial values are comparable.

    Returns
    -------
    float
        Clipped RMSE (see ``my_rmse``) of the fitted model on the 20% hold-out split.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=split_seed
    )

    # suggest_float(step=...) / suggest_float(log=True) replace the deprecated
    # suggest_discrete_uniform / suggest_loguniform with the same ranges;
    # `step` is passed by keyword to suggest_int as well.
    param_grid = {
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.0, 2.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400),
        'eta': trial.suggest_float('eta', 0.007, 0.013),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9, step=0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 50),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 50),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
    }

    # NOTE(review): 'gpu_hist' / 'predictor' look like pre-2.0 XGBoost spellings
    # (newer releases use device='cuda' with tree_method='hist') - confirm the
    # installed version before changing them.
    reg = xgb.XGBRegressor(
        objective='reg:tweedie',
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        n_jobs=4,
        # Set on the estimator rather than in fit(); fit(eval_metric=...) is
        # deprecated in the XGBoost scikit-learn API.
        eval_metric='rmse',
        **param_grid
    )

    reg.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)], verbose=False)

    return my_rmse(y_valid, reg.predict(X_valid))

In [None]:
from optuna.samplers import TPESampler
# Wall-clock budget for the hyper-parameter search, in seconds.
train_time = 60 * 60

# Minimise the objective with a TPE sampler until the time budget is used up.
study = optuna.create_study(
    study_name='XGBRegressor',
    direction='minimize',
    sampler=TPESampler(),
)
study.optimize(objective, timeout=train_time)

# Report how many trials ran and the best one found.
print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    line = '\t\t{}: {}'.format(key, value)
    print(line)

In [None]:
# Parallel-coordinate view of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Objective value over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Per-parameter slice plots for the study.
optuna.visualization.plot_slice(study)

In [None]:
# Estimated importance of each hyper-parameter for the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Final model settings: the best hyper-parameters from the study plus the fixed
# XGBoost options used during the search. The dict is copied so the extra keys
# are not written into the trial's own parameter dict.
xgb_params = dict(trial.params)
xgb_params['objective'] = 'reg:tweedie'
xgb_params['tree_method'] = 'gpu_hist'
xgb_params['predictor'] = 'gpu_predictor'
xgb_params['n_jobs'] = 4
# Set on the estimator rather than in fit(); fit(eval_metric=...) is deprecated
# in the XGBoost scikit-learn API.
xgb_params['eval_metric'] = 'rmse'

n_splits = 10
test_preds = None
kf_rmse = []

# A fixed seed makes the fold assignment (and hence the scores) reproducible.
kfold = KFold(n_splits, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(X, y)):
    # X is a NumPy array after scaling; y is still a pandas Series, so it is
    # indexed positionally with .iloc instead of by label.
    X_train, y_train = X[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X[valid_idx], y.iloc[valid_idx]

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              verbose=False)
    valid_pred = model.predict(X_valid)
    rmse = my_rmse(y_valid, valid_pred)
    print(f'Fold {fold+1}/{n_splits} RMSE: {rmse:.4f}')
    kf_rmse.append(rmse)

    # Accumulate test predictions across folds; they are averaged after the loop.
    if test_preds is None:
        test_preds = model.predict(X_test)
    else:
        test_preds += model.predict(X_test)

test_preds /= n_splits
print(f'Average KFold RMSE : {np.mean(np.array(kf_rmse)):.5f}')
         
    

In [None]:
# Mean of the per-fold validation RMSE values collected during cross-validation.
mean_cv_rmse = np.mean(kf_rmse)
print(f'Average KFold RMSE : {mean_cv_rmse:.5f}')

In [None]:
# Clamp the averaged test predictions into the range of the training target
# (y_min / y_max are the min and max of the training 'loss' column). The masked
# assignment works in place, so the array keeps its existing dtype.
test_preds[test_preds < y_min] = y_min
test_preds[test_preds > y_max] = y_max
# Put the predictions into the submission template and write it to the working directory.
sample_submission['loss'] = test_preds
sample_submission.to_csv('submission.csv', index=False)
# Display the final submission frame as the cell output.
sample_submission