In [None]:
!pip install xgboost -q
!pip install catboost -q
!pip install lightgbm -q
!pip install hyperopt -q

In [None]:
from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import clear_output

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVR

import lightgbm
import catboost
import xgboost

In [None]:
# Print the installed XGBoost version so it is recorded in the notebook output.
print(xgboost.__version__)

# Read the data

In [None]:
# Load the train and test tables, using the `id` column as the row index of each.
X_train, X_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of the train and test frames as loaded.
print(f"Train Shape: {X_train.shape}\nTest Shape: {X_test.shape}")

### Separate target from predictors

In [None]:
# Split the target off the feature frame: `pop` returns the `loss` column and
# removes it from X_train in place, leaving only the predictors behind.
y_train = X_train.pop('loss')

# Preprocess

In [None]:
# Print a summary of the predictor frame (columns, dtypes, non-null counts).
X_train.info()

In [None]:
# Partition the predictor columns by dtype:
#   cat_cols - object-typed columns, encoded further below
#   num_cols - int64 / float64 columns, used for outlier capping
cat_cols = [name for name, dtype in X_train.dtypes.items() if dtype == "O"]
num_cols = [name for name, dtype in X_train.dtypes.items() if dtype in ("int64", "float64")]

## Cap outliers

In [None]:
# Cap outliers: clip every numeric feature to mean +/- 1.5 standard deviations.
# The bounds are estimated on the training data only and are then applied to
# BOTH the training and the test frame, so the two sets go through the same
# preprocessing (previously only X_train was clipped).
# NOTE: this cell mutates X_train / X_test in place. Re-running it recomputes
# the bounds from already-clipped training data, so run it once per session.
outlier_cols = num_cols
for col in outlier_cols:
    std = 1.5 * X_train[col].std()
    mean = X_train[col].mean()
    floor, ceil = mean - std, mean + std
    X_train[col] = X_train[col].clip(floor, ceil)
    # Guard against a column that exists only in the training frame.
    if col in X_test.columns:
        X_test[col] = X_test[col].clip(floor, ceil)

In [None]:
# Store the numeric training features as 32-bit floats.
X_train[num_cols] = X_train[num_cols].astype(np.float32)

## Encode categorical columns

In [None]:
# Fit the ordinal encoder on the training categoricals only; the same fitted
# encoder is reused for the test set in the next cell.
enc = OrdinalEncoder()
enc.fit(X_train[cat_cols])
print(X_train.shape)

# Wrap the encoded array in a frame carrying the original row index and column
# names so it lines up with the remaining (non-categorical) columns.
X_cat_transformed = pd.DataFrame(
    enc.transform(X_train[cat_cols]),
    index=X_train.index,
    columns=X_train[cat_cols].columns,
)

# Replace the raw categorical columns with their encoded counterparts by
# joining on the shared row index.
X_train_final = X_train.drop(columns=cat_cols).merge(
    X_cat_transformed,
    left_index=True,
    right_index=True,
)
X_train_final.shape

In [None]:
# Encode the test categoricals with the encoder fitted on the training data,
# keeping the test row index and the original column names.
X_cat_transformed = pd.DataFrame(
    enc.transform(X_test[cat_cols]),
    index=X_test.index,
    columns=X_test[cat_cols].columns,
)

# Swap the raw categorical columns for the encoded ones via an index join.
X_test_final = X_test.drop(columns=cat_cols).merge(
    X_cat_transformed,
    left_index=True,
    right_index=True,
)
X_test_final.shape

# Modelling strategy

```python
# Hold out 10% of the encoded training rows for validation. The seed is fixed
# so the split, and the hyperparameter search built on it, is reproducible.
# NOTE: this rebinds X_train / y_train to the 90% subset. Cells that later fit
# on X_train_final together with y_train need the full, un-split target.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_final, y_train, train_size = 0.9, random_state = 42)
```

### Hyperopt for finding the best hyperparameters for the model

```python
# Every `*_para` dict below has the same three entries:
#   'reg_params' - constructor arguments for the regressor (the search space)
#   'fit_params' - keyword arguments forwarded to `.fit()`
#   'loss_func'  - callable (y_true, y_pred) -> RMSE used to score a trial

# ---------------------------------------------------------------- XGBoost
xgb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.01, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(3, 12, 1, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 5, 1, dtype=int)),
    colsample_bytree=hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    subsample=hp.uniform('subsample', 0.5, 1),
    n_estimators=2000,  # fixed, not searched
    reg_lambda=hp.choice('reg_lambda', [0, 0.1, 1.0, 5.0, 10.0, 50.0, 100.0]),
    reg_alpha=hp.choice('reg_alpha', [0, 0.1, 1.0, 5.0, 10.0, 50.0, 100.0]),
)
xgb_fit_params = dict(
    eval_metric='rmse',
    early_stopping_rounds=100,
    verbose=False,
)
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}


# ---------------------------------------------------------------- LightGBM
lgb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(3, 12, 1, dtype=int)),
    min_child_weight=hp.choice('min_child_weight', np.arange(1, 5, 1, dtype=int)),
    colsample_bytree=hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    subsample=hp.uniform('subsample', 0.5, 1),
    n_estimators=2000,  # fixed, not searched
    reg_lambda=hp.choice('reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]),
    reg_alpha=hp.choice('reg_alpha', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]),
)
lgb_fit_params = dict(
    eval_metric='l2',
    early_stopping_rounds=100,
    verbose=False,
)
lgb_para = {
    'reg_params': lgb_reg_params,
    'fit_params': lgb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}


# ---------------------------------------------------------------- CatBoost
# The evaluation metric is a constructor argument here, so it sits in
# 'reg_params' rather than in 'fit_params'.
ctb_reg_params = dict(
    learning_rate=hp.choice('learning_rate', np.arange(0.05, 0.31, 0.05)),
    max_depth=hp.choice('max_depth', np.arange(3, 12, 1, dtype=int)),
    colsample_bylevel=hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
    n_estimators=2000,  # fixed, not searched
    eval_metric='RMSE',
)
ctb_fit_params = dict(
    early_stopping_rounds=100,
    verbose=False,
)
ctb_para = {
    'reg_params': ctb_reg_params,
    'fit_params': ctb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}
```

```python
class HPOpt:
    """Hyperopt driver for XGBoost, LightGBM and CatBoost regressors.

    The training split is used for fitting. The held-out split is passed to
    ``fit`` as an evaluation set and is also used to score each trial. The
    lowest loss seen per model key and the parameters that produced it are
    kept in ``self.loss`` and ``self.best_params``.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Store the train / hold-out data and start with empty result tables."""
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.best_params = {}  # model key -> 'reg_params' of the best trial
        self.loss = {}         # model key -> lowest loss observed so far

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run ``fmin`` using the objective method named ``fn_name``.

        Returns:
            ``(best, trials)`` when the search completes. If the search
            raises, the exception is not propagated: a dict holding
            ``STATUS_FAIL`` and the exception text is returned instead, so
            callers have to check which shape they received.
        """
        objective = getattr(self, fn_name)
        try:
            best = fmin(fn=objective, space=space, algo=algo,
                        max_evals=max_evals, trials=trials)
        except Exception as err:
            return {'status': STATUS_FAIL, 'exception': str(err)}
        else:
            return best, trials

    def xgb_reg(self, para):
        """Objective for XGBoost: build a GPU regressor from ``para`` and score it."""
        estimator = xgboost.XGBRegressor(
            **para['reg_params'],
            predictor="gpu_predictor",
            tree_method='gpu_hist',
        )
        return self.train_reg(estimator, para, 'xgb')

    def lgb_reg(self, para):
        """Objective for LightGBM: build a GPU regressor from ``para`` and score it."""
        estimator = lightgbm.LGBMRegressor(**para['reg_params'], device_type="gpu")
        return self.train_reg(estimator, para, 'lgb')

    def ctb_reg(self, para):
        """Objective for CatBoost: build a regressor from ``para`` and score it."""
        estimator = catboost.CatBoostRegressor(**para['reg_params'])
        return self.train_reg(estimator, para, 'ctb')

    def train_reg(self, reg, para, model):
        """Fit ``reg``, score it on the hold-out split and record a new best.

        Args:
            reg: estimator exposing ``fit(X, y, eval_set=..., **kw)`` and ``predict``.
            para: dict with 'reg_params', 'fit_params' and 'loss_func'.
            model: key under which the loss and best parameters are tracked.

        Returns:
            The hyperopt result dict ``{'loss': ..., 'status': STATUS_OK}``.
        """
        # Both splits are handed over as evaluation sets, training split first.
        eval_sets = [(self.x_train, self.y_train), (self.x_test, self.y_test)]
        reg.fit(self.x_train, self.y_train, eval_set=eval_sets, **para['fit_params'])

        loss = para['loss_func'](self.y_test, reg.predict(self.x_test))

        # Keep the parameters only for the first trial or a strictly better one.
        is_new_best = model not in self.loss or self.loss[model] > loss
        if is_new_best:
            self.loss[model] = loss
            self.best_params[model] = para['reg_params']
        return {'loss': loss, 'status': STATUS_OK}

```python
# Wire the optimiser to the train / validation split.
obj = HPOpt(x_train=X_train, x_test=X_valid, y_train=y_train, y_test=y_valid)

# Run 100 TPE trials for XGBoost, then 100 for LightGBM, each with a fresh
# Trials history. The best parameters per model accumulate in obj.best_params.
xgb_opt = obj.process(
    fn_name='xgb_reg',
    space=xgb_para,
    trials=Trials(),
    algo=tpe.suggest,
    max_evals=100,
)
lgb_opt = obj.process(
    fn_name='lgb_reg',
    space=lgb_para,
    trials=Trials(),
    algo=tpe.suggest,
    max_evals=100,
)
```

```python
obj.best_params
{  
    'xgb': {
        'colsample_bytree': 0.6,
        'learning_rate': 0.01,
        'max_depth': 10,
        'min_child_weight': 4,
        'n_estimators': 1500,
        'reg_alpha': 5.0,
        'reg_lambda': 50.0,
        'subsample': 0.850
    },
    'lgb': {'colsample_bytree': 0.7,
        'learning_rate': 0.05,
        'max_depth': 8,
        'min_child_weight': 2,
        'n_estimators': 2000,
        'reg_alpha': 0.1,
        'reg_lambda': 100.0,
        'subsample': 0.875
     },
    'ctb': {  
        'max_depth': 5,   
        'iterations': 5000,   
        'learning_rate': 0.1,  
        'eval_metric': 'RMSE'   
    }  
}  
```

In [None]:
# Hard-coded constructor parameters for the three final models, keyed by model.
# Keeping them here lets the final fit run without repeating the search.
best_params = dict(
    xgb={
        'colsample_bytree': 0.6,
        'learning_rate': 0.01,
        'max_depth': 10,
        'min_child_weight': 4,
        'n_estimators': 1500,
        'reg_alpha': 5.0,
        'reg_lambda': 50.0,
        'subsample': 0.850,
    },
    lgb={
        'colsample_bytree': 0.7,
        'learning_rate': 0.05,
        'max_depth': 8,
        'min_child_weight': 2,
        'n_estimators': 2000,
        'reg_alpha': 0.1,
        'reg_lambda': 100.0,
        'subsample': 0.875,
    },
    ctb={
        'max_depth': 5,
        'iterations': 10000,
        'learning_rate': 0.01,
        'eval_metric': 'RMSE',
    },
)

In [None]:
# XGBoost (GPU): fit on the full encoded training set, then predict the test set.
model_xgb = xgboost.XGBRegressor(
    predictor="gpu_predictor",
    tree_method='gpu_hist',
    **best_params['xgb'],
)
model_xgb.fit(X_train_final, y_train)
pred_xgb = model_xgb.predict(X_test_final)

# LightGBM (GPU): same procedure with its own parameter set.
model_lgb = lightgbm.LGBMRegressor(
    device_type="gpu",
    **best_params['lgb'],
)
model_lgb.fit(X_train_final, y_train)
pred_lgb = model_lgb.predict(X_test_final)

In [None]:
# CatBoost (GPU): fit on the full encoded training set, then predict the test set.
# NOTE(review): `early_stopping_rounds` is passed to fit() but no `eval_set` is,
# so presumably early stopping has nothing to monitor here; confirm against the
# CatBoost documentation.
model_ctb = catboost.CatBoostRegressor(
    task_type="GPU",
    devices='0:1',
    verbose=1,
    **best_params['ctb'],
)
model_ctb.fit(
    X_train_final,
    y_train,
    early_stopping_rounds=100,
    verbose=0,
)
pred_ctb = model_ctb.predict(X_test_final)

## Predictions from each model

In [None]:
# Gather the per-model test predictions into one frame, one column per model.
pred = dict(xgb=pred_xgb, lgb=pred_lgb, ctb=pred_ctb)
pred_df = pd.DataFrame(data=pred)

## Creating submissions

In [None]:
# Pair each output name with the prediction series it should contain: the
# three-model average, the XGBoost+LightGBM average, and each model on its own.
submit_map = [
    ('final_submission', pred_df.mean(axis=1)),
    ('submission_lgb+xgb', pred_df[['xgb', 'lgb']].mean(axis=1)),
    ('submission_lgb', pred_df.lgb),
    ('submission_xgb', pred_df.xgb),
    ('submission_ctb', pred_df.ctb),
]

# Build one two-column (`id`, `loss`) frame per entry, keyed by output name.
Id = X_test.index
submissions = {
    f_name: pd.DataFrame({'id': Id, 'loss': target})
    for f_name, target in submit_map
}
    

## Saving submissions

In [None]:
# Write every submission frame to disk as `<name>.csv`. The files were
# previously written without an extension, so they were not recognisable as CSV.
for key, value in submissions.items():
    # Log the name, shape and column names of each frame being saved.
    print(f'{key}\n{value.shape}\t{list(value.columns)}')
    value.to_csv(f'{key}.csv', index = False)