In [None]:
# Libraries
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Read data
# Both competition files are loaded the same way, with the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
    )
)

In [None]:
# Predictors & Target
# The last column of the training frame is used as the label; every column before it is a feature.
predictors, target = train.columns[:-1], train.columns[-1]

In [None]:
# Processing
# Any column whose name contains 'cat' is cast to pandas' categorical dtype,
# in the training frame and in the test frame alike.
cat_cols = [name for name in train.columns if 'cat' in name]
train[cat_cols], test[cat_cols] = (
    train[cat_cols].astype('category'),
    test[cat_cols].astype('category'),
)

In [None]:
# Bagging LGBM
# Hand-rolled instead of sklearn's BaggingRegressor to keep direct control over how
# each bag is drawn, fitted with early stopping, and scored on its out-of-bag rows.
class BaggingLGBM():
    """Bagged ensemble of LGBMRegressor models.

    Every estimator is trained on its own bootstrap sample of the rows (drawn
    with replacement) and is early-stopped and scored on the rows left out of
    that sample (the out-of-bag rows). Predictions are the unweighted mean of
    the individual estimators' predictions.

    Parameters
    ----------
    n_estimators : int, default 5
        Number of LGBMRegressor models in the ensemble. This is distinct from
        any ``n_estimators`` key inside ``params``, which is forwarded to each
        LGBMRegressor.
    max_sample : float, default 0.8
        Size of each bootstrap sample, as a fraction of the number of rows.
    params : dict or None, default None
        Keyword arguments passed unchanged to every LGBMRegressor. All
        estimators therefore share the same hyper-parameters; they differ
        only through the rows in their bag.
    random_state : int or None, default None
        Seed for the bootstrap sampling. With None, NumPy's global random
        state is used, so the bags change from run to run unless the caller
        seeds it with ``np.random.seed``.
    """

    def __init__(self, n_estimators=5, max_sample=0.8, params=None, random_state=None):
        # Copy so the caller's dict is never shared with this object.
        params = dict(params) if params else {}
        self.estimators = [LGBMRegressor(**params) for _ in range(n_estimators)]
        self.max_sample = max_sample
        self.random_state = random_state


    def fit(self, X, y):
        """Fit every estimator on its own bootstrap sample.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature rows.
        y : pandas.Series
            Targets, row-aligned with ``X`` (same length, same order).

        Returns
        -------
        list of float
            One out-of-bag RMSE per estimator. The same out-of-bag rows drive
            early stopping, so these scores are optimistic compared with a
            truly unseen hold-out set.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length, if ``max_sample`` yields an
            empty bag, or if a bag leaves no out-of-bag rows to evaluate on.
        """
        n_rows = len(X)
        if len(y) != n_rows:
            raise ValueError(
                f"X and y must have the same number of rows, got {n_rows} and {len(y)}."
            )

        n_draws = int(round(self.max_sample * n_rows))
        if n_draws < 1:
            raise ValueError(
                f"max_sample={self.max_sample!r} selects no rows out of {n_rows}."
            )

        # One generator for the whole ensemble: each estimator gets a different
        # bag, yet the full sequence of bags is repeatable for a given seed.
        if self.random_state is None:
            sampler = np.random
        else:
            sampler = np.random.RandomState(self.random_state)

        scores = []

        for estimator in self.estimators:
            # Create bag. Rows are picked by position rather than by index
            # label, so duplicate labels in X.index cannot inflate the bag or
            # wrongly empty the out-of-bag set.
            in_bag = sampler.choice(n_rows, size=n_draws, replace=True)
            oob_mask = np.ones(n_rows, dtype=bool)
            oob_mask[in_bag] = False
            if not oob_mask.any():
                raise ValueError(
                    "Bootstrap sample covers every row; no out-of-bag rows are "
                    "left for early stopping. Use a smaller max_sample."
                )

            X_train, y_train = X.iloc[in_bag], y.iloc[in_bag]
            X_test, y_test = X.iloc[oob_mask], y.iloc[oob_mask]

            # Fit model
            # NOTE(review): `early_stopping_rounds` / `verbose` are fit-time
            # keywords of the LightGBM release this notebook was written for;
            # newer releases may expect callbacks instead. Confirm against the
            # installed version.
            estimator.fit(X_train,
                          y_train,
                          eval_set=(X_test,
                                    y_test),
                          eval_metric='rmse',
                          early_stopping_rounds=500,
                          verbose=False
                         )

            # Test model. RMSE is computed with NumPy so the score does not rely
            # on `mean_squared_error(..., squared=False)`, which newer
            # scikit-learn releases no longer accept.
            y_pred = estimator.predict(X_test)
            residuals = np.asarray(y_test, dtype=float) - np.asarray(y_pred, dtype=float)
            rmse = float(np.sqrt(np.mean(np.square(residuals))))
            scores.append(rmse)

        return scores


    def predict(self, X):
        """Return the unweighted mean of all estimators' predictions for ``X``.

        With an empty ensemble the result is an all-zero array of length
        ``X.shape[0]``.
        """
        pred = np.zeros(X.shape[0])

        for estimator in self.estimators:
            pred += estimator.predict(X) / len(self.estimators)

        return pred

In [None]:
# Test
# One seed drives both LightGBM and the bootstrap sampling, so that
# "Restart & Run All" reproduces the same scores and the same submission.
SEED = 48

params = {
    'reg_alpha': 6.147694913504962,
    'reg_lambda': 0.002457826062076097,
    'colsample_bytree': 0.3,
    # NOTE(review): LightGBM's docs say row subsampling needs a non-zero
    # `subsample_freq`, which is not set here - confirm whether per-model
    # bagging was intended.
    'subsample': 0.8,
    'learning_rate': 0.008,
    'max_depth': 20,
    'num_leaves': 111,
    'min_child_samples': 285,
    'random_state': SEED,
    # Upper bound on boosting rounds; early stopping inside BaggingLGBM.fit
    # is expected to cut training short well before this.
    'n_estimators': 20000,
    'metric': 'rmse',
    'cat_smooth': 39
}

# BaggingLGBM draws its bootstrap samples from NumPy's global RNG (no explicit
# seed is passed below), so seed it right before fitting.
np.random.seed(SEED)

model = BaggingLGBM(n_estimators=15, max_sample=0.90, params=params)
scores = model.fit(train[predictors], train[target])

# Mean out-of-bag RMSE over the bagged models.
np.mean(scores)

In [None]:
# Submission
# Store the ensemble's averaged predictions as a new column on the test frame,
# then write that single column (with the frame's index) as the submission file.
test[target] = model.predict(test.loc[:, predictors])
test.loc[:, target].to_csv('submission.csv')