In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-aug-2021/train.csv",
        "../input/tabular-playground-series-aug-2021/test.csv",
    )
)

In [None]:
# Total number of missing cells across the whole training frame.
train.isna().to_numpy().sum()

In [None]:
# Display the training DataFrame as the cell output.
train

In [None]:
# Total number of missing cells across the whole test frame.
test.isna().to_numpy().sum()

In [None]:
# Separate the target (`loss`) from the predictors. `id` is excluded from the
# feature matrices; the test ids are kept aside for the submission file.
y_train = train['loss']
x_train = train.drop(columns=['id', 'loss'])
test_id = test['id']
x_test = test.drop(columns=['id'])

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna
from warnings import filterwarnings
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


filterwarnings("ignore", category=DeprecationWarning) 
filterwarnings("ignore", category=FutureWarning) 
filterwarnings("ignore", category=UserWarning)

In [None]:
# Hold out a validation set with a fixed seed so the split is repeatable.
# test_size=0.25 is scikit-learn's default, spelled out here for the reader.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    random_state=26,
)

In [None]:
def check_rmse(model, x_val, y_val):
    """Return the root-mean-squared error of `model` on a labelled data set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_val : features passed to ``model.predict``.
    y_val : true target values compared against the predictions.

    Returns
    -------
    The square root of the mean squared error between ``y_val`` and the predictions.
    """
    predictions = model.predict(x_val)
    mse = mean_squared_error(y_val, predictions)
    return np.sqrt(mse)

## Base models with default params

In [None]:
# Baseline regressors with library-default hyper-parameters. The first three
# are asked to use every available core (n_jobs=-1); CatBoost is left at its defaults.
knn_reg, xgb_reg, lgbm_reg = (
    estimator_cls(n_jobs=-1)
    for estimator_cls in (KNeighborsRegressor, XGBRegressor, LGBMRegressor)
)
cat_reg = CatBoostRegressor()

In [None]:
# Keep each baseline paired with its display name in a single mapping, then
# expose the two parallel lists used by the fitting loop and the chart.
baseline_models = {
    'knn_reg': knn_reg,
    'xgb_reg': xgb_reg,
    'lgbm_reg': lgbm_reg,
    'cat_reg': cat_reg,
}
models = list(baseline_models.values())
models_name = list(baseline_models.keys())

In [None]:
# Fit every baseline on the training split and record its RMSE on the
# validation split, in the same order as `models_name`.
rmse_error = []
for model_label, model in zip(models_name, models):
    model.fit(X_train, Y_train)
    model_rmse = check_rmse(model, X_val, Y_val)
    rmse_error.append(model_rmse)
    print(f"Model : {model_label}   rmse = {model_rmse}")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Horizontal bar chart comparing the validation RMSE of the baseline models.
fig, ax = plt.subplots()
ax.barh(models_name, rmse_error)
ax.set_ylabel("Models")
ax.set_xlabel("RMSE")
plt.show()

In [None]:
def submission(model, filename):
    """Predict `loss` for the test set and write a submission CSV.

    Reads the notebook-level `x_test` (test features) and `test_id` (test ids).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    filename : suffix inserted into the output name
        ``Submission_file_{filename}.csv`` (written to the working directory).

    The file has two columns, ``id`` and ``loss``, one row per test row.
    """
    # Pair ids and predictions by position rather than by index label:
    # the predictions come back as a plain array, so a label-based concat
    # would produce NaNs/misaligned rows if `test_id` ever carried a
    # non-default index (e.g. after filtering or shuffling `test`).
    predictions = np.ravel(model.predict(x_test))
    sub = pd.DataFrame({'id': test_id.to_numpy(), 'loss': predictions})
    sub.to_csv(f"Submission_file_{filename}.csv", index=False)

In [None]:
# Rebuild the train/validation split from the untouched x_train / y_train (same
# seed as the earlier split, so X_val / Y_val are unchanged) before carving out
# the early-stopping set. Without this, every re-run of the cell would split the
# already-reduced X_train again and silently shrink the training data.
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, random_state=26)
# Reserve 10% of the training split as the evaluation set used for early stopping.
X_train, X_eval, Y_train, Y_eval = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [None]:
def objective(trial):
    """Optuna objective: train CatBoost with sampled hyper-parameters and score it.

    The model is fitted on the notebook-level (X_train, Y_train) with
    (X_eval, Y_eval) supplied as the evaluation set, and the value returned to
    Optuna is the RMSE on the separate (X_val, Y_val) hold-out.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.

    Returns
    -------
    RMSE of the fitted regressor on (X_val, Y_val).
    """
    param = {
        # --- sampled hyper-parameters -------------------------------------
        # suggest_float(..., step=) replaces the deprecated
        # suggest_discrete_uniform. The upper bound is 0.195 because 0.2 is
        # not on the 0.01 grid that starts at 0.005; 0.195 is the largest
        # grid point, so the effective search space is unchanged.
        'learning_rate': trial.suggest_float("learning_rate", 0.005, 0.195, step=0.01),
        'depth': trial.suggest_int('depth', 2, 4),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 4.0, 6.0, step=0.5),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [1, 2, 4]),
        # --- fixed settings -----------------------------------------------
        'grow_policy': 'Depthwise',
        'iterations': 8000,
        'use_best_model': True,
        'eval_metric': 'RMSE',
        # NOTE(review): early_stopping_rounds passed to fit() below presumably
        # takes precedence over od_wait here - confirm against the CatBoost docs.
        'od_type': 'iter',
        'od_wait': 50,
        'random_state': 42,
        'logging_level': 'Silent',
    }
    regressor = CatBoostRegressor(**param)
    regressor.fit(X_train.copy(), Y_train.copy(),
                  eval_set=[(X_eval.copy(), Y_eval.copy())],
                  early_stopping_rounds=80)
    return check_rmse(regressor, X_val, Y_val)


In [None]:
%%time
# Search for the hyper-parameters that minimise the objective's validation RMSE.
# The search stops after 10000 trials or 24000 seconds, whichever comes first.
study = optuna.create_study(study_name='catboost-seed', direction='minimize')
study.optimize(
    func=objective,
    n_trials=10000,
    timeout=24000,
    n_jobs=-1,
)

In [None]:
# Show the hyper-parameter values of the study's best trial.
study.best_params

In [None]:
%%time
# Refit CatBoost using the hyper-parameters of the study's best trial
# (learning_rate, depth, l2_leaf_reg, min_child_samples) together with the same
# fixed settings that the objective function used.
optimized_regressor = CatBoostRegressor(**study.best_params,
                                        grow_policy='Depthwise',
                                        iterations=8000,
                                        use_best_model=True,
                                        eval_metric='RMSE',
                                        od_type='iter',
                                        od_wait=50,
                                        random_state=42,
                                        logging_level='Silent')
# NOTE(review): the objective fitted with early_stopping_rounds=80, this refit
# uses 100 - confirm the difference is intentional.
optimized_regressor.fit(X_train.copy(), Y_train.copy(),
                        eval_set=[(X_eval.copy(), Y_eval.copy())],
                        early_stopping_rounds=100)
# Predict on the training split once and reuse the result for the training
# RMSE instead of running a second full prediction through check_rmse.
pred_train = optimized_regressor.predict(X_train.copy())
print(f"rmse on training set : {np.sqrt(mean_squared_error(Y_train, pred_train))}")
print(f"rmse on valid set : {check_rmse(optimized_regressor,X_val,Y_val)}")

In [None]:
# Write the submission file for the default-parameter CatBoost baseline.
submission(model=cat_reg, filename="base_catboost")

In [None]:
# Write the submission file for the Optuna-tuned CatBoost model.
submission(model=optimized_regressor, filename="Optimized_catboost")