In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields a (directory, sub-directories, files) triple for every directory
# below the root; the sub-directory list is not needed here, so it is bound to `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print each file's path, joined onto the directory that contains it.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split,KFold
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder , RobustScaler , StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error as mse
import optuna

In [None]:
# Training frame; later cells read its `kfold` and `target` columns, so the CSV is
# expected to already carry a fold assignment for every row.
df = pd.read_csv('../input/10fold-30daysml/10fold_30dayml.csv')
# test.csv from the 30-days-of-ml input dataset; subset to the model features further down.
test = pd.read_csv('../input/30-days-of-ml/test.csv')
# sample_submission.csv from the same dataset (not referenced by the cells shown here).
ss = pd.read_csv('../input/30-days-of-ml/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
df.head()

In [None]:
# Model inputs: every column except the row identifier, the label and the fold marker.
useful_features = [col for col in df.columns if col not in {"id", "target", "kfold"}]
# Columns stored with object dtype are the ones handed to the ordinal encoder.
cat_lists = df.select_dtypes(include='object').columns.tolist()
# Restrict the test frame to the same input columns that the model is trained on.
df_test = test[useful_features]

In [None]:
def run(trial):
    """Optuna objective: mean validation RMSE of an XGBoost regressor over 10 folds.

    Hyperparameters are sampled once from ``trial`` and reused for every fold.
    For fold ``i`` the rows of the global ``df`` with ``kfold != i`` are used to
    fit the preprocessing and the model, and the rows with ``kfold == i`` are
    used as the early-stopping evaluation set and for scoring.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values.
    """
    # Sampled once per trial: none of these values depends on the fold, so the
    # dict does not need to be rebuilt inside the loop.
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 10), # Extremely prone to overfitting!
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000, step=100), # Extremely prone to overfitting!
        'eta': trial.suggest_float('eta', 0.007, 0.013), # Most important parameter.
        # suggest_float(step=...) / suggest_float(log=True) replace the deprecated
        # suggest_discrete_uniform / suggest_loguniform with the same search space.
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True), # I've had trouble with LB score until tuning this.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True), # L2 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True), # L1 regularization
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
        'tree_method':'gpu_hist',
        'predictor':'gpu_predictor'
    }

    # Continuous columns, grouped by the scaler applied to them.
    standard_scaled = ['cont1','cont2','cont3','cont4','cont5','cont7','cont9','cont10','cont11','cont12','cont13']
    robust_scaled = ['cont0','cont6','cont8']

    total_rmse = []
    for i in range(10):
        x_train = df[df.kfold != i].reset_index(drop=True)
        x_valid = df[df.kfold == i].reset_index(drop=True)

        ytrain = x_train.target
        yvalid = x_valid.target

        xtrain = x_train[useful_features]
        xvalid = x_valid[useful_features]

        # Preprocessing: a fresh transformer per fold so that no statistics
        # carry over from one fold to the next.
        ct = make_column_transformer(
            (OrdinalEncoder(), cat_lists),
            (StandardScaler(), standard_scaled),
            (RobustScaler(), robust_scaled),
            remainder='passthrough')
        xtrain = ct.fit_transform(xtrain)
        # Only transform the validation fold: re-fitting here would encode and
        # scale it with different statistics than the data the model was
        # trained on. NOTE: with the default OrdinalEncoder, a category seen
        # only in the validation fold now raises instead of being silently
        # mapped to an unrelated code.
        xvalid = ct.transform(xvalid)

        model = XGBRegressor(**params)
        model.fit(xtrain,ytrain,eval_set=[(xvalid,yvalid)],early_stopping_rounds=300,verbose=1000)
        preds_valid = model.predict(xvalid)
        # RMSE as sqrt(MSE); avoids the `squared=False` keyword, which newer
        # scikit-learn releases deprecate and remove.
        rmse = np.sqrt(mse(yvalid, preds_valid))
        total_rmse.append(rmse)
    return np.mean(total_rmse)
        

In [None]:
# TPE is Optuna's default sampler; it is passed explicitly only to fix its seed so
# that the sequence of proposed hyperparameters can be repeated on a re-run
# (given identical objective values).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Each trial trains one model per fold inside `run`, so this is a long-running cell.
study.optimize(run,n_trials=50)

In [None]:
# Hyperparameters of the best trial found by the study (shown as the cell output).
study.best_params

In [None]:
# NOTE(review): this cell is a single string literal, so nothing in it is executed
# and `objective` is never defined. The text is an alternative objective that fits
# each fold with eval_metric='rmse' and verbose=False and without early stopping.
# Before re-enabling it, note that it calls `ct.fit_transform(xvalid)`, which
# re-fits the preprocessing on the validation fold instead of reusing the fit
# from the training fold.
'''def objective(trial):
    total_rmse = []
    for i in range(10):
        params = {
            'max_depth': trial.suggest_int('max_depth', 6, 10), # Extremely prone to overfitting!
            'n_estimators': trial.suggest_int('n_estimators', 500,3000, 100), # Extremely prone to overfitting!
            'eta': trial.suggest_float('eta', 0.007, 0.013), # Most important parameter.
            'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 0.9, 0.1),
            'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.1),
            'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 0.9, 0.1),
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 1e4), # I've had trouble with LB score until tuning this.
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1e4), # L2 regularization
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1e4), # L1 regularization
            'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e4),
            'tree_method':'gpu_hist',
            'predictor':'gpu_predictor'
        }
        x_train = df[df.kfold !=i].reset_index(drop=True)
        x_valid = df[df.kfold ==i].reset_index(drop=True)
        
        ytrain = x_train.target
        yvalid = x_valid.target
        
        xtrain = x_train[useful_features]
        xvalid = x_valid[useful_features]
        
        #preprocessing
        ct = make_column_transformer(
            (OrdinalEncoder(),cat_lists),
            (StandardScaler(),['cont1','cont2','cont3','cont4','cont5','cont7','cont9','cont10','cont11','cont12','cont13']),
            (RobustScaler(),['cont0','cont6','cont8']),
            remainder='passthrough')
        xtrain = ct.fit_transform(xtrain)
        xvalid = ct.fit_transform(xvalid)
        
        model = XGBRegressor(**params)
        model.fit(xtrain,ytrain,eval_set=[(xvalid,yvalid)],eval_metric='rmse',verbose=False)
        preds_valid = model.predict(xvalid)
        rmse = mse(yvalid,preds_valid,squared=False)
        total_rmse.append(rmse)
    return np.mean(total_rmse)'''

In [None]:
# NOTE(review): disabled code kept as a string literal; `CallBack` is never defined.
# The text describes an Optuna callback that remembers the last seen best value in
# the study's user attributes and prints the trial number and value whenever the
# study's best value differs from the remembered one.
'''def CallBack(study,frozen_trial):
        previous_best_value = study.user_attrs.get('previous_best_value',None)
        if previous_best_value != study.best_value:
            study.set_user_attr('previous_best_value',study.best_value)
            print( "Trial {} finished with best value: {}. ".format(
                frozen_trial.number,
                frozen_trial.value))'''

In [None]:
# NOTE(review): disabled code kept as a string literal; nothing here is executed.
# If it is ever re-enabled: the text creates `study_1` but then calls
# `study.optimize(...)`, i.e. it would continue the earlier `study` rather than the
# newly created one, and the `func` lambda only forwards its argument to `objective`.
'''study_1 = optuna.create_study(direction='minimize',study_name='30ml')
func = lambda trial : objective(trial)
study.optimize(func,timeout=60*20,callbacks=[CallBack])'''