This notebook is a copy of [Tabular Playground Series Jan 2021 - Models](https://www.kaggle.com/gunesevitan/tabular-playground-series-jan-2021-models) by [@gunesevitan](https://www.kaggle.com/gunesevitan) adapted for [February Tabular Playground Series](https://www.kaggle.com/c/tabular-playground-series-feb-2021). Will update with Neural Networks Section in the future

In [None]:
import warnings
warnings.filterwarnings('ignore')
import random
import os
from tqdm import tqdm

Installing XGBoost 1.3+ which provides gpu speedup

# Setting up Libraries

***Installing XGBoost 1.3 version***

In [None]:
# Upgrade XGBoost with pip; no version is pinned, so the newest available release is installed.
!pip install --upgrade xgboost

***Re-compile LGBM with GPU support***

https://www.kaggle.com/dromosys/gpu-accelerated-lightgbm-full/notebook

In [None]:
# Remove the preinstalled LightGBM package directory, then fetch the LightGBM sources with their submodules.
# NOTE(review): the site-packages path is hard-coded to python3.6 — confirm it matches the kernel's Python.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Install the Boost development libraries without prompting (-y) and with minimal output (-qq).
# NOTE(review): presumably a build dependency of the GPU-enabled LightGBM — confirm.
!apt-get install -y -qq libboost-all-dev

In [None]:
%%bash
# Configure and compile LightGBM with GPU (OpenCL) support in a fresh build directory.
# Abort on the first failing command so that a failed `cd` can never make the
# following rm/mkdir/cmake run in the wrong directory.
set -euo pipefail
cd LightGBM
# -f: a missing build directory (first run) is not an error.
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j"$(nproc)"

In [None]:
!cd LightGBM/python-package/;python3 setup.py install --precompile

## Tabular Playground Series - Feb 2021

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import probplot, kurtosis, skew, gmean
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomTreesEmbedding

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-feb-2021/train.csv',
        '../input/tabular-playground-series-feb-2021/test.csv',
    )
)

In [None]:
# Feature groups are selected by column-name prefix from the training table.
continuous_features = list(filter(lambda column: column.startswith('cont'), df_train.columns))
categorical_features = list(filter(lambda column: column.startswith('cat'), df_train.columns))
target = 'target'

In [None]:
# Report the shape and the summed per-column memory usage (bytes / 1024**2) of both raw tables.
print(f'Training Set Shape = {df_train.shape}')
print(f'Training Set Memory Usage = {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Set Shape = {df_test.shape}')
print(f'Test Set Memory Usage = {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

## 1. Target

`target` is the name of target feature. It follows an extremely left tailed bimodal distribution. Target mean and median are very close to each other because there are very few outliers which can be seen from the probability plot. Those two outliers are 0 and 3.7, and they should be dealt with.

The bimodal distribution can be broken into two components with a Gaussian mixture model, but it is not possible to predict the components of the test set.

In [None]:
def plot_target(target):
    """Print summary statistics of a training-set column and plot its distribution.

    Reads the notebook-level `df_train`. Draws a 2x2 figure: the distribution
    with mean and median markers, a probability plot, the distribution again,
    and the distributions of the two components found by a 2-component
    Gaussian mixture fitted on the column.

    Side effect: the mixture component predicted for every row is stored in
    `df_train[f'{target}_class']`.

    Parameters
    ----------
    target : str
        Name of the column in `df_train` to analyse.
    """
    values = df_train[target]
    n_missing = df_train[values.isnull()].shape[0]
    n_samples = df_train.shape[0]

    print(f'Target feature {target} Statistical Analysis\n{"-" * 42}')
    print(f'Mean: {values.mean():.4}  -  Median: {values.median():.4}  -  Std: {values.std():.4}')
    print(f'Min: {values.min():.4}  -  25%: {values.quantile(0.25):.4}  -  50%: {values.quantile(0.5):.4}  -  75%: {values.quantile(0.75):.4}  -  Max: {values.max():.4}')
    print(f'Skew: {values.skew():.4}  -  Kurtosis: {values.kurtosis():.4}')
    print(f'Missing Values: {n_missing}/{n_samples} ({n_missing * 100 / n_samples:.4}%)')

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(24, 12), dpi=100)
    top_left, top_right = axes[0]
    bottom_left, bottom_right = axes[1]

    # Top row: raw distribution with location markers, and the probability plot
    sns.distplot(values, label=target, ax=top_left)
    top_left.axvline(values.mean(), label='Target Mean', color='r', linewidth=2, linestyle='--')
    top_left.axvline(values.median(), label='Target Median', color='b', linewidth=2, linestyle='--')
    probplot(values, plot=top_right)

    # Split the column into two mixture components and keep the assignment on df_train
    column_vector = values.values.reshape(-1, 1)
    mixture = GaussianMixture(n_components=2, random_state=42)
    mixture.fit(column_vector)
    component_column = f'{target}_class'
    df_train[component_column] = mixture.predict(column_vector)

    # Bottom row: distribution before and after the split
    sns.distplot(values, label=target, ax=bottom_left)
    for component_id, component_label in enumerate(['Component 1', 'Component 2']):
        component_values = df_train.loc[df_train[component_column] == component_id, target]
        sns.distplot(component_values, label=component_label, ax=bottom_right)

    top_left.legend(prop={'size': 15})
    bottom_right.legend(prop={'size': 15})

    for ax in axes.flat:
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)
        ax.set_xlabel('')
        ax.set_ylabel('')

    title_suffixes = (
        'Distribution in Training Set',
        'Probability Plot',
        'Distribution Before GMM',
        'Distribution After GMM',
    )
    for ax, title_suffix in zip(axes.flat, title_suffixes):
        ax.set_title(f'{target} {title_suffix}', fontsize=15, pad=12)
    plt.show()


plot_target(target)

## 2. Features

### 2.1 Continuous Features
There are 14 continuous features that are named from `cont0` to `cont13`. All the continuous features have multimodal distributions, which means they have multiple peaks. The number of peaks changes from feature to feature. None of the features have any missing values and their distributions are very similar in training and test sets.

In [None]:
def plot_continuous(continuous_feature):
    """Print train/test summary statistics of a continuous feature and plot it.

    Reads the notebook-level `df_train`, `df_test` and `target`. The left panel
    overlays the feature's distribution in the training and test sets; the
    right panel is a scatter plot of the feature against the target in the
    training set.

    Parameters
    ----------
    continuous_feature : str
        Name of a column present in both `df_train` and `df_test`.
    """
    train_values = df_train[continuous_feature]
    test_values = df_test[continuous_feature]

    print(f'Continuous feature {continuous_feature} Statistical Analysis\n{"-" * 42}')

    print(f'Training Mean: {float(train_values.mean()):.4}  - Training Median: {float(train_values.median()):.4} - Training Std: {float(train_values.std()):.4}')
    print(f'Test Mean: {float(test_values.mean()):.4}  - Test Median: {float(test_values.median()):.4} - Test Std: {float(test_values.std()):.4}')
    print(f'Training Min: {float(train_values.min()):.4}  - Training Max: {float(train_values.max()):.4}')
    # The second value is the maximum of the test set, so it is labelled as such.
    print(f'Test Min: {float(test_values.min()):.4}  - Test Max: {float(test_values.max()):.4}')
    print(f'Training Skew: {float(train_values.skew()):.4}  - Training Kurtosis: {float(train_values.kurtosis()):.4}')
    print(f'Test Skew: {float(test_values.skew()):.4}  - Test Kurtosis: {float(test_values.kurtosis()):.4}')

    training_missing_values_count = int(train_values.isnull().sum())
    test_missing_values_count = int(test_values.isnull().sum())
    training_samples_count = df_train.shape[0]
    test_samples_count = df_test.shape[0]
    print(f'Training Missing Values: {training_missing_values_count}/{training_samples_count} ({training_missing_values_count * 100 / training_samples_count:.4}%)')
    print(f'Test Missing Values: {test_missing_values_count}/{test_samples_count} ({test_missing_values_count * 100 / test_samples_count:.4}%)')

    fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100, constrained_layout=True)
    title_size = 18
    label_size = 18

    # Continuous Feature Training and Test Set Distribution
    sns.distplot(train_values, label='Training', ax=axes[0])
    sns.distplot(test_values, label='Test', ax=axes[0])
    axes[0].set_xlabel('')
    axes[0].tick_params(axis='x', labelsize=label_size)
    axes[0].tick_params(axis='y', labelsize=label_size)
    axes[0].legend()
    axes[0].set_title(f'{continuous_feature} Distribution in Training and Test Set', size=title_size, pad=title_size)

    # Continuous Feature vs target
    # x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
    sns.scatterplot(x=train_values, y=df_train[target], ax=axes[1])
    axes[1].set_title(f'{continuous_feature} vs {target}', size=title_size, pad=title_size)
    axes[1].set_xlabel('')
    axes[1].set_ylabel('')
    axes[1].tick_params(axis='x', labelsize=label_size)
    axes[1].tick_params(axis='y', labelsize=label_size)

    plt.show()


# Plot the features in numeric order of their suffix (cont2 before cont10).
for continuous_feature in sorted(continuous_features, key=lambda x: int(x.split('cont')[-1])):
    plot_continuous(continuous_feature)

### 2.2 Categorical Features
There are 10 categorical features that are named from `cat0` to `cat9`. `cat0`, `cat1` and `cat2` are binary; the rest have multiple categories.

In [None]:
# Stack the feature columns of both sets (original row labels are kept) and tag
# every row with the set it came from. `pd.concat` is used because
# `DataFrame.append` is no longer available in pandas 2.x.
all_data = pd.concat([
    df_train[categorical_features + continuous_features],
    df_test[categorical_features + continuous_features],
])
all_data["set"] = ["Train"] * len(df_train) + ["Test"] * len(df_test)

In [None]:
def plot_categorical(categorical_feature):
    """Print missing-value counts of a categorical feature and plot its category counts.

    Reads the notebook-level `df_train`, `df_test` and `all_data`; the bars are
    split by the `set` column of `all_data` ("Train" / "Test").

    Parameters
    ----------
    categorical_feature : str
        Name of a column present in `df_train`, `df_test` and `all_data`.
    """
    print(f'Categorical feature {categorical_feature} Statistical Analysis\n{"-" * 42}')
    training_missing_values_count = int(df_train[categorical_feature].isnull().sum())
    test_missing_values_count = int(df_test[categorical_feature].isnull().sum())
    training_samples_count = df_train.shape[0]
    test_samples_count = df_test.shape[0]
    print(f'Training Missing Values: {training_missing_values_count}/{training_samples_count} ({training_missing_values_count * 100 / training_samples_count:.4}%)')
    print(f'Test Missing Values: {test_missing_values_count}/{test_samples_count} ({test_missing_values_count * 100 / test_samples_count:.4}%)')

    title_size = 18

    # Categorical Feature Training and Test Set Category Counts
    # Columns are referenced by name through `data=`; recent seaborn releases
    # no longer accept the plotted vector as a positional argument.
    sns.countplot(data=all_data, x=categorical_feature, hue='set')
    plt.xlabel(categorical_feature)
    plt.ylabel("count")
    plt.legend()
    plt.title(f'{categorical_feature}', size=title_size, pad=title_size)
    plt.show()


# Plot the features in numeric order of their suffix (cat2 before cat10).
for categorical_feature in sorted(categorical_features, key=lambda x: int(x.split('cat')[-1])):
    plot_categorical(categorical_feature)

## 3. Preprocessing

`Preprocessor` class incorporates the preprocessing steps such as cross-validation folds creation, outlier removal, standardization and feature engineering.

* Created 5 random split folds for cross-validation
* Continuous features are standardized for linear and neural network models, but it is not necessary for tree-based models
* `argmax` and `argmin` features are created for continuous features
* Continuous features are discretized with distribution components extracted by a gaussian mixture model 

In [None]:
class Preprocessor:
    """Build model-ready copies of the training and test sets.

    `transform` always assigns cross-validation folds and then optionally adds
    argmin/argmax features, Gaussian-mixture component features, scales the
    continuous features and label-encodes the categorical features.

    The class reads the notebook-level `continuous_features` and
    `categorical_features` lists. The frames passed in are deep-copied and
    never modified.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw training and test sets.
    n_splits : int
        Number of cross-validation folds.
    shuffle : bool
        Whether rows are shuffled before being split into folds.
    random_state : int or None
        Seed for the fold split (when shuffling) and for the Gaussian mixtures.
    scaler : callable or None
        Scaler class (e.g. `StandardScaler`); it is instantiated without
        arguments. A falsy value disables scaling.
    discretize_features : bool
        Whether to add the `cont{i}_class` mixture-component features.
    create_features : bool
        Whether to add the `cont_argmin` / `cont_argmax` features.
    encoder : bool
        Whether to label-encode the categorical features.
    """

    # Number of mixture components fitted per continuous feature, keyed by the
    # numeric suffix of the column name (`cont0` ... `cont13`).
    GMM_COMPONENTS = {
        0: 4,
        1: 8,
        2: 5,
        3: 4,
        4: 3,
        5: 5,
        6: 3,
        7: 4,
        8: 5,
        9: 4,
        10: 5,
        11: 2,
        12: 3,
        13: 7
    }

    def __init__(self, train, test, n_splits, shuffle, random_state, scaler, discretize_features, create_features, encoder):
        self.train = train.copy(deep=True)
        self.test = test.copy(deep=True)

        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        self.scaler = scaler() if scaler else None
        self.discretize_features = discretize_features
        self.create_features = create_features
        self.encoder = encoder

    def create_folds(self):
        """Add a 1-based `fold` column (uint8) naming the fold in which each row is validated."""
        # KFold rejects a random_state when shuffle is False, so only pass it when it is used.
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle,
                   random_state=self.random_state if self.shuffle else None)
        # KFold yields positional indices; filling an array keeps the assignment
        # correct even when the frame's index is not 0..n-1.
        folds = np.zeros(len(self.train), dtype=np.uint8)
        for fold, (_, val_idx) in enumerate(kf.split(self.train), 1):
            folds[val_idx] = fold
        self.train['fold'] = folds

    def scale(self):
        """Fit the scaler on train and test together and scale the continuous features of both."""
        df_all = pd.concat([self.train[continuous_features], self.test[continuous_features]], ignore_index=True, axis=0)
        self.scaler.fit(df_all[continuous_features])
        self.train.loc[:, continuous_features] = self.scaler.transform(self.train.loc[:, continuous_features].values)
        self.test.loc[:, continuous_features] = self.scaler.transform(self.test.loc[:, continuous_features].values)

        print(f'Scaled {len(continuous_features)} features with {self.scaler.__class__.__name__}')

    def create_idx_features(self):
        """Add the positions of the smallest and largest continuous feature of every row."""
        for df in [self.train, self.test]:
            df['cont_argmin'] = np.argmin(df[continuous_features].values, axis=1)
            df['cont_argmax'] = np.argmax(df[continuous_features].values, axis=1)

        idx_features = ['cont_argmin', 'cont_argmax']
        print(f'Created {len(idx_features)} idx features with argmin and argmax')

    def encode(self):
        """Replace every categorical feature with integer codes shared by train and test."""
        for feature in categorical_features:
            le = LabelEncoder()
            # Fit on both sets so a category that only occurs in the test set does
            # not make `transform` fail; the codes are unchanged when the test
            # categories are a subset of the training ones.
            le.fit(pd.concat([self.train[feature], self.test[feature]], axis=0))
            self.train[feature] = le.transform(self.train[feature])
            self.test[feature] = le.transform(self.test[feature])

    def create_gmm_features(self):
        """Add a `cont{i}_class` column holding the mixture component of every `cont{i}` value."""
        gmm_features = []
        for i, n_components in self.GMM_COMPONENTS.items():
            feature = f'cont{i}'
            gmm = GaussianMixture(n_components=n_components, random_state=self.random_state)
            # The mixture is fitted on train and test values together
            gmm.fit(pd.concat([self.train[feature], self.test[feature]], axis=0).values.reshape(-1, 1))

            self.train[f'{feature}_class'] = gmm.predict(self.train[feature].values.reshape(-1, 1))
            self.test[f'{feature}_class'] = gmm.predict(self.test[feature].values.reshape(-1, 1))
            gmm_features.append(f'{feature}_class')

        print(f'Created {len(gmm_features)} gmm features with GaussianMixture')

    def transform(self):
        """Run the enabled steps and return deep copies of the processed (train, test) frames."""
        self.create_folds()
        # Index and mixture features are derived from the unscaled values,
        # so they are created before scaling.
        if self.create_features:
            self.create_idx_features()
        if self.discretize_features:
            self.create_gmm_features()
        if self.scaler:
            self.scale()
        if self.encoder:
            self.encode()

        return self.train.copy(deep=True), self.test.copy(deep=True)


This is an example usage of the preprocessing pipeline. It returns processed training and test sets ready for model training, but the dataframes created in this cell are only used for storing model predictions. Models are trained with their own custom datasets that are designed specifically for them.

In [None]:
# Seed shared by every Preprocessor instance in the notebook.
cross_validation_seed = 0
# Fold assignment, argmin/argmax features, mixture features and label encoding; no scaling.
preprocessor = Preprocessor(train=df_train, test=df_test,
                            n_splits=5, shuffle=True, random_state=cross_validation_seed,
                            scaler=None,
                            create_features=True, discretize_features=True, encoder=True)
df_train_processed, df_test_processed = preprocessor.transform()

# Report the shape and the summed per-column memory usage (bytes / 1024**2) of the processed frames.
print(f'\nPreprocessed Training Set Shape = {df_train_processed.shape}')
print(f'Preprocessed Training Set Memory Usage = {df_train_processed.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Preprocessed Test Set Shape = {df_test_processed.shape}')
print(f'Preprocessed Test Set Memory Usage = {df_test_processed.memory_usage().sum() / 1024 ** 2:.2f} MB\n')

## 4. Tree-based Models

4 tree-based models are used in the ensemble and they are LightGBM, CatBoost, XGBoost, and Random Forest. All of them are trained with at least 3 seeds for diversity. Only raw continuous features are used in tree-based models without any transformation. Even though the training logs are not present in this version of the notebook, those are the parameters used for getting the scores displayed below each cell.

In [None]:
class TreeModels:
    """Train one tree-based model family with fold-wise validation and seed averaging.

    `model` selects the trainer: 'LGB' (LightGBM), 'CB' (CatBoost), 'XGB'
    (XGBoost) or 'RF' (cuML random forest). For every seed and every value of
    the `fold` column of the training frame, a model is fitted on the remaining
    folds. Out-of-fold predictions are averaged over seeds, test predictions
    over seeds and folds.

    The averaged predictions are written to the notebook-level
    `df_train_processed` / `df_test_processed` frames as `<model>Predictions`
    columns and diagnostic plots are shown. The row labels of the training
    frame are used as positions into the prediction arrays.

    Parameters
    ----------
    predictors : list of str
        Columns used as model inputs.
    target : str
        Name of the target column in `df_train_processed`.
    model : str
        One of 'LGB', 'CB', 'XGB', 'RF'.
    model_parameters : dict
        Parameters forwarded to the library; it is copied, and the seed-related
        keys of the copy are overwritten for every seed.
    boosting_rounds, early_stopping_rounds : int or None
        Forwarded to `lgb.train` / `xgb.train`; not used by the other trainers.
    seeds : list of int
        Seeds to average over.
    """

    def __init__(self, predictors, target, model, model_parameters, boosting_rounds, early_stopping_rounds, seeds):

        self.predictors = predictors
        self.target = target

        self.model = model
        # Copy so the per-seed overrides written during training stay out of the caller's dict.
        self.model_parameters = dict(model_parameters)
        self.boosting_rounds = boosting_rounds
        self.early_stopping_rounds = early_stopping_rounds
        self.seeds = seeds

    @staticmethod
    def _align_importance(scores, predictors):
        """Return the values of the `scores` mapping ordered like `predictors`, with 0 for missing names."""
        return np.array([scores.get(feature, 0.0) for feature in predictors], dtype=float)

    def _train_and_predict_lgb(self, X_train, y_train, X_test):
        """Train LightGBM for every seed and fold; store and plot the averaged predictions and importances."""
        n_seeds = len(self.seeds)
        n_folds = X_train['fold'].nunique()
        seed_avg_oof_predictions = np.zeros(X_train.shape[0])
        seed_avg_test_predictions = np.zeros(X_test.shape[0])
        seed_avg_importance = pd.DataFrame(data=np.zeros(len(self.predictors)), index=self.predictors, columns=['Importance'])

        for seed in self.seeds:
            print(f'{"-" * 30}\nRunning LightGBM model with seed: {seed}\n{"-" * 30}\n')
            for seed_parameter in ('seed', 'feature_fraction_seed', 'bagging_seed', 'drop_seed', 'data_random_seed'):
                self.model_parameters[seed_parameter] = seed

            for fold in sorted(X_train['fold'].unique()):

                trn_idx, val_idx = X_train.loc[X_train['fold'] != fold].index, X_train.loc[X_train['fold'] == fold].index
                trn = lgb.Dataset(X_train.loc[trn_idx, self.predictors], label=y_train.loc[trn_idx])
                val = lgb.Dataset(X_train.loc[val_idx, self.predictors], label=y_train.loc[val_idx])

                model = lgb.train(params=self.model_parameters,
                                  train_set=trn,
                                  valid_sets=[trn, val],
                                  num_boost_round=self.boosting_rounds,
                                  early_stopping_rounds=self.early_stopping_rounds,
                                  verbose_eval=500)

                val_predictions = model.predict(X_train.loc[val_idx, self.predictors])
                seed_avg_oof_predictions[val_idx] += (val_predictions / n_seeds)
                test_predictions = model.predict(X_test[self.predictors])
                seed_avg_test_predictions += (test_predictions / n_folds / n_seeds)
                seed_avg_importance['Importance'] += (model.feature_importance(importance_type='gain') / n_folds / n_seeds)

                fold_score = mean_squared_error(y_train.loc[val_idx], val_predictions, squared=False)
                print(f'\nLGB Fold {int(fold)} - X_trn: {X_train.loc[trn_idx, self.predictors].shape} X_val: {X_train.loc[val_idx, self.predictors].shape} - Score: {fold_score:.6} - Seed: {seed}\n')

        df_train_processed['LGBPredictions'] = seed_avg_oof_predictions
        df_test_processed['LGBPredictions'] = seed_avg_test_predictions
        oof_score = mean_squared_error(y_train, df_train_processed['LGBPredictions'], squared=False)
        print(f'{"-" * 30}\nLGB OOF RMSE: {oof_score:.6} ({n_seeds} Seed Average)\n{"-" * 30}')

        self._plot_importance(seed_avg_importance)
        self._plot_predictions(df_train_processed[self.target], df_train_processed['LGBPredictions'], df_test_processed['LGBPredictions'])

    def _train_and_predict_cb(self, X_train, y_train, X_test):
        """Train CatBoost for every seed and fold; store and plot the averaged predictions and importances."""
        n_seeds = len(self.seeds)
        n_folds = X_train['fold'].nunique()
        seed_avg_oof_predictions = np.zeros(X_train.shape[0])
        seed_avg_test_predictions = np.zeros(X_test.shape[0])
        seed_avg_importance = pd.DataFrame(data=np.zeros(len(self.predictors)), index=self.predictors, columns=['Importance'])

        for seed in self.seeds:
            print(f'{"-" * 30}\nRunning CatBoost model with seed: {seed}\n{"-" * 30}\n')
            self.model_parameters['random_seed'] = seed

            for fold in sorted(X_train['fold'].unique()):

                trn_idx, val_idx = X_train.loc[X_train['fold'] != fold].index, X_train.loc[X_train['fold'] == fold].index
                trn = cb.Pool(X_train.loc[trn_idx, self.predictors], label=y_train.loc[trn_idx])
                val = cb.Pool(X_train.loc[val_idx, self.predictors], label=y_train.loc[val_idx])

                model = cb.CatBoostRegressor(**self.model_parameters)
                model.fit(X=trn, eval_set=val)

                val_predictions = model.predict(val)
                seed_avg_oof_predictions[val_idx] += (val_predictions / n_seeds)
                test_predictions = model.predict(cb.Pool(X_test[self.predictors]))
                seed_avg_test_predictions += (test_predictions / n_folds / n_seeds)
                seed_avg_importance['Importance'] += (model.get_feature_importance() / n_folds / n_seeds)

                fold_score = mean_squared_error(y_train.loc[val_idx], val_predictions, squared=False)
                print(f'\nCB Fold {int(fold)} - X_trn: {X_train.loc[trn_idx, self.predictors].shape} X_val: {X_train.loc[val_idx, self.predictors].shape} - Score: {fold_score:.6} - Seed: {seed}\n')

        df_train_processed['CBPredictions'] = seed_avg_oof_predictions
        df_test_processed['CBPredictions'] = seed_avg_test_predictions
        oof_score = mean_squared_error(y_train, df_train_processed['CBPredictions'], squared=False)
        print(f'{"-" * 30}\nCB OOF RMSE: {oof_score:.6} ({n_seeds} Seed Average)\n{"-" * 30}')

        self._plot_importance(seed_avg_importance)
        self._plot_predictions(df_train_processed[self.target], df_train_processed['CBPredictions'], df_test_processed['CBPredictions'])

    def _train_and_predict_xgb(self, X_train, y_train, X_test):
        """Train XGBoost for every seed and fold; store and plot the averaged predictions and importances."""
        n_seeds = len(self.seeds)
        n_folds = X_train['fold'].nunique()
        seed_avg_oof_predictions = np.zeros(X_train.shape[0])
        seed_avg_test_predictions = np.zeros(X_test.shape[0])
        seed_avg_importance = pd.DataFrame(data=np.zeros(len(self.predictors)), index=self.predictors, columns=['Importance'])

        for seed in self.seeds:
            print(f'{"-" * 30}\nRunning XGBoost model with seed: {seed}\n{"-" * 30}\n')
            self.model_parameters['seed'] = seed

            for fold in sorted(X_train['fold'].unique()):

                trn_idx, val_idx = X_train.loc[X_train['fold'] != fold].index, X_train.loc[X_train['fold'] == fold].index
                trn = xgb.DMatrix(X_train.loc[trn_idx, self.predictors], label=y_train.loc[trn_idx])
                val = xgb.DMatrix(X_train.loc[val_idx, self.predictors], label=y_train.loc[val_idx])

                model = xgb.train(params=self.model_parameters,
                                  dtrain=trn,
                                  evals=[(trn, 'train'), (val, 'val')],
                                  num_boost_round=self.boosting_rounds,
                                  early_stopping_rounds=self.early_stopping_rounds,
                                  verbose_eval=500)

                val_predictions = model.predict(xgb.DMatrix(X_train.loc[val_idx, self.predictors]))
                seed_avg_oof_predictions[val_idx] += (val_predictions / n_seeds)
                test_predictions = model.predict(xgb.DMatrix(X_test[self.predictors]))
                seed_avg_test_predictions += (test_predictions / n_folds / n_seeds)
                # get_score returns a {feature name: gain} mapping that leaves out unused
                # features, so the values are matched to the predictors by name.
                fold_importance = self._align_importance(model.get_score(importance_type='gain'), self.predictors)
                seed_avg_importance['Importance'] += (fold_importance / n_folds / n_seeds)

                fold_score = mean_squared_error(y_train.loc[val_idx], val_predictions, squared=False)
                print(f'\nXGB Fold {int(fold)} - X_trn: {X_train.loc[trn_idx, self.predictors].shape} X_val: {X_train.loc[val_idx, self.predictors].shape} - Score: {fold_score:.6} - Seed: {seed}\n')

        df_train_processed['XGBPredictions'] = seed_avg_oof_predictions
        df_test_processed['XGBPredictions'] = seed_avg_test_predictions
        oof_score = mean_squared_error(y_train, df_train_processed['XGBPredictions'], squared=False)
        print(f'{"-" * 30}\nXGB OOF RMSE: {oof_score:.6} ({n_seeds} Seed Average) \n{"-" * 30}')

        self._plot_importance(seed_avg_importance)
        self._plot_predictions(df_train_processed[self.target], df_train_processed['XGBPredictions'], df_test_processed['XGBPredictions'])

    def _train_and_predict_rf(self, X_train, y_train, X_test):
        """Train a cuML random forest for every seed and fold; store and plot the averaged predictions."""
        # Imported here so that cuML is only required when the 'RF' model is run.
        import cuml

        n_seeds = len(self.seeds)
        n_folds = X_train['fold'].nunique()
        seed_avg_oof_predictions = np.zeros(X_train.shape[0])
        seed_avg_test_predictions = np.zeros(X_test.shape[0])

        for seed in self.seeds:
            print(f'{"-" * 30}\nRunning RandomForest model with seed: {seed}\n{"-" * 30}\n')
            self.model_parameters['random_state'] = seed

            for fold in sorted(X_train['fold'].unique()):

                trn_idx, val_idx = X_train.loc[X_train['fold'] != fold].index, X_train.loc[X_train['fold'] == fold].index
                # Training and validation inputs are cast to float32 before being handed to cuML
                X_trn, y_trn = X_train.loc[trn_idx, self.predictors].astype(np.float32), y_train.loc[trn_idx].astype(np.float32)
                X_val = X_train.loc[val_idx, self.predictors].astype(np.float32)

                model = cuml.ensemble.RandomForestRegressor(**self.model_parameters)
                model.fit(X_trn, y_trn)

                val_predictions = model.predict(X_val)
                seed_avg_oof_predictions[val_idx] += (val_predictions / n_seeds)
                test_predictions = model.predict(X_test[self.predictors])
                seed_avg_test_predictions += (test_predictions / n_folds / n_seeds)

                fold_score = mean_squared_error(y_train.loc[val_idx], val_predictions, squared=False)
                print(f'RF Fold {int(fold)} - X_trn: {X_train.loc[trn_idx, self.predictors].shape} X_val: {X_train.loc[val_idx, self.predictors].shape} - Score: {fold_score:.6}')

        df_train_processed['RFPredictions'] = seed_avg_oof_predictions
        df_test_processed['RFPredictions'] = seed_avg_test_predictions
        oof_score = mean_squared_error(y_train, df_train_processed['RFPredictions'], squared=False)
        print(f'{"-" * 30}\nRF OOF RMSE: {oof_score:.6} ({n_seeds} Seed Average) \n{"-" * 30}')

        self._plot_predictions(df_train_processed[self.target], df_train_processed['RFPredictions'], df_test_processed['RFPredictions'])

    def _plot_importance(self, df_importance):
        """Show a horizontal bar chart of the `Importance` column, largest value first."""
        # Sort a copy so the caller's frame keeps its order.
        df_sorted = df_importance.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(25, 6))
        sns.barplot(x='Importance', y=df_sorted.index, data=df_sorted, palette='Blues_d')
        plt.xlabel('')
        plt.tick_params(axis='x', labelsize=20)
        plt.tick_params(axis='y', labelsize=20)
        plt.title(f'{self.model} Feature Importance (Gain)', size=20, pad=20)
        plt.show()

    def _plot_predictions(self, train_labels, train_predictions, test_predictions):
        """Plot labels against out-of-fold predictions and the train/test prediction distributions.

        Does not read any instance state, so it can also be called with `None` as `self`.
        """
        fig, axes = plt.subplots(ncols=2, figsize=(25, 6))
        # x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
        sns.scatterplot(x=train_labels, y=train_predictions, ax=axes[0])
        sns.distplot(train_predictions, label='Train Predictions', ax=axes[1])
        sns.distplot(test_predictions, label='Test Predictions', ax=axes[1])

        axes[0].set_xlabel('Train Labels', size=18)
        axes[0].set_ylabel('Train Predictions', size=18)
        axes[1].set_xlabel('')
        axes[1].legend(prop={'size': 18})
        for i in range(2):
            axes[i].tick_params(axis='x', labelsize=15)
            axes[i].tick_params(axis='y', labelsize=15)
        axes[0].set_title('Train Labels vs Train Predictions', size=20, pad=20)
        axes[1].set_title('Predictions Distributions', size=20, pad=20)

        plt.show()

    def run(self, X_train, y_train, X_test):
        """Run the trainer selected by `self.model`.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Training frame holding the predictors and a `fold` column.
        y_train : pandas.Series
            Training labels, indexed like `X_train`.
        X_test : pandas.DataFrame
            Test frame holding the predictors.

        Raises
        ------
        ValueError
            If `self.model` is not one of 'LGB', 'CB', 'XGB', 'RF'.
        """
        trainers = {
            'LGB': self._train_and_predict_lgb,
            'CB': self._train_and_predict_cb,
            'XGB': self._train_and_predict_xgb,
            'RF': self._train_and_predict_rf,
        }
        if self.model not in trainers:
            raise ValueError(f'Unknown model {self.model!r}; expected one of {sorted(trainers)}')
        trainers[self.model](X_train, y_train, X_test)


### 4.1 LightGBM

In [None]:
model = 'LGB'
# Dedicated preprocessing run for LightGBM: folds, argmin/argmax and mixture features, label encoding; no scaling.
lgb_preprocessor = Preprocessor(train=df_train, test=df_test,
                            n_splits=5, shuffle=True, random_state=cross_validation_seed, scaler=None,
                            create_features=True, discretize_features=True, encoder=True)
df_train_lgb, df_test_lgb = lgb_preprocessor.transform()

print(f'\n{model} Training Set Shape = {df_train_lgb.shape}')
print(f'{model} Training Set Memory Usage = {df_train_lgb.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'{model} Test Set Shape = {df_test_lgb.shape}')
print(f'{model} Test Set Memory Usage = {df_test_lgb.memory_usage().sum() / 1024 ** 2:.2f} MB\n')

X_train_lgb = df_train_lgb.copy(deep=True)
y_train_lgb = df_train_lgb[target].copy(deep=True)
X_test_lgb = df_test_lgb.copy(deep=True)

lgb_parameters = {
    # Only the continuous columns are used as model inputs.
    'predictors': continuous_features,
    'target': target,
    'model': model,
    'model_parameters': {
        # https://www.kaggle.com/hamzaghanmi/lgbm-hyperparameter-tuning-using-optuna
        'reg_alpha': 6.147694913504962,
        'reg_lambda': 0.002457826062076097,
        'colsample_bytree': 0.3,
        'subsample': 0.8,
        'learning_rate': 0.008,
        'max_depth': 20,
        'num_leaves': 111,
        'min_child_samples': 285,
        # NOTE(review): `random_state` looks like an alias of `seed`, which is
        # overwritten for every seed during training — confirm which one takes effect.
        'random_state': 48,
        # NOTE(review): `n_estimators` looks like an alias of the number of boosting
        # rounds, which is also passed as `boosting_rounds` below — confirm.
        'n_estimators': 20000,
        # `metric` is listed once; it used to be repeated further down with the same value.
        'metric': 'rmse',
        'cat_smooth': 39,
        'objective': 'regression',
        # The seed entries are placeholders that are filled in for every seed during training.
        'seed': None,
        'feature_fraction_seed': None,
        'bagging_seed': None,
        'drop_seed': None,
        'data_random_seed': None,
        'boosting_type': 'gbdt',
        'verbose': 1,
        'n_jobs': -1,
        # gpu specific
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    },
    'boosting_rounds': 20000,
    'early_stopping_rounds': 200,
    'seeds': [541992, 721991, 1337]
}

lgb_model = TreeModels(**lgb_parameters)
lgb_model.run(X_train_lgb, y_train_lgb, X_test_lgb)

# Release the model-specific frames and objects; the predictions live in the shared processed frames.
del df_train_lgb, df_test_lgb, X_train_lgb, y_train_lgb, X_test_lgb
del lgb_preprocessor, lgb_parameters, lgb_model

print('Saving LightGBM OOF and Test predictions to current working directory.')
df_train_processed[['id', 'LGBPredictions']].to_csv('lgb_oof_predictions.csv', index=False)
df_test_processed[['id', 'LGBPredictions']].to_csv('lgb_test_predictions.csv', index=False)

### 4.2 CatBoost

In [None]:
model = 'CB'
# Dedicated preprocessing run for CatBoost: folds and label encoding only (no extra features, no scaling).
cb_preprocessor = Preprocessor(train=df_train, test=df_test,
                               n_splits=5, shuffle=True, random_state=cross_validation_seed, scaler=None,
                               create_features=False, discretize_features=False, encoder=True)
df_train_cb, df_test_cb = cb_preprocessor.transform()

print(f'\n{model} Training Set Shape = {df_train_cb.shape}')
print(f'{model} Training Set Memory Usage = {df_train_cb.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'{model} Test Set Shape = {df_test_cb.shape}')
print(f'{model} Test Set Memory Usage = {df_test_cb.memory_usage().sum() / 1024 ** 2:.2f} MB\n')

# The training frame keeps the `fold` column next to the continuous predictors; the test frame needs the predictors only.
X_train_cb = df_train_cb[continuous_features + ['fold']].copy(deep=True)
y_train_cb = df_train_cb[target].copy(deep=True)
X_test_cb = df_test_cb[continuous_features].copy(deep=True)

# https://www.kaggle.com/ttahara/tps-feb-2021-3gbdts-ensemble-baseline
cb_parameters = {
    'predictors': continuous_features,
    'target': target,
    'model': model,
    'model_parameters': {
        'num_boost_round': 20000,
        'task_type': "GPU",
        'learning_rate': 0.05,
        'depth': 4,
        'subsample': 0.6,
        'use_best_model': True,
        'eval_metric': 'RMSE',
        'loss_function': 'RMSE',   
        # Placeholder; set to the current seed for every training run.
        'random_seed': None,
        'verbose': 0
    },
    # Not read by the CatBoost trainer; the number of rounds comes from `model_parameters`.
    'boosting_rounds': None,
    'early_stopping_rounds': None,
    'seeds': [541992, 721991, 1337, 42, 0]
}

cb_model = TreeModels(**cb_parameters)
cb_model.run(X_train_cb, y_train_cb, X_test_cb)

# Release the model-specific frames and objects; the predictions live in the shared processed frames.
del df_train_cb, df_test_cb, X_train_cb, y_train_cb, X_test_cb
del cb_preprocessor, cb_parameters, cb_model

print('Saving CatBoost OOF and Test predictions to current working directory.')
df_train_processed[['id', 'CBPredictions']].to_csv('cb_oof_predictions.csv', index=False)
df_test_processed[['id', 'CBPredictions']].to_csv('cb_test_predictions.csv', index=False)
# Draws the prediction plots once more; `None` is passed as `self` because the method reads no instance state.
TreeModels._plot_predictions(None, df_train_processed[target], df_train_processed['CBPredictions'], df_test_processed['CBPredictions'])


### 4.3 XGBoost

In [None]:
model = 'XGB'

# Build the XGBoost-specific train/test frames: 5 shuffled folds, no scaling,
# no engineered or discretized features.
xgb_preprocessor = Preprocessor(train=df_train, test=df_test,
                                n_splits=5, shuffle=True, random_state=cross_validation_seed, scaler=None,
                                create_features=False, discretize_features=False, encoder=True)
df_train_xgb, df_test_xgb = xgb_preprocessor.transform()

print(f'\n{model} Training Set Shape = {df_train_xgb.shape}')
print(f'{model} Training Set Memory Usage = {df_train_xgb.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'{model} Test Set Shape = {df_test_xgb.shape}')
print(f'{model} Test Set Memory Usage = {df_test_xgb.memory_usage().sum() / 1024 ** 2:.2f} MB\n')

# The training matrix carries the 'fold' column next to the features; the test matrix does not.
X_train_xgb = df_train_xgb[continuous_features + ['fold']].copy(deep=True)
y_train_xgb = df_train_xgb[target].copy(deep=True)
X_test_xgb = df_test_xgb[continuous_features].copy(deep=True)

xgb_parameters = {
    'predictors': continuous_features,
    'target': target,
    'model': model,
    # https://www.kaggle.com/tunguz/tps-02-21-feature-importance-with-xgboost-and-shap
    'model_parameters': {
        'learning_rate': 0.01,
        'colsample_bytree': 0.4,
        # Fixed key: this was misspelt 'sumbsample', which XGBoost does not recognise,
        # so the intended 0.6 row subsampling was never applied.
        'subsample': 0.6,
        'max_depth': 6,
        'min_child_weight': 100,
        'alpha': 6,
        'objective': 'reg:squarederror',
        'seed': None,
        # NOTE(review): 'boosting_type', 'silent' and 'verbose' are not XGBoost parameter
        # names (the documented ones are 'booster' and 'verbosity'). They are left as-is
        # in case TreeModels reads them -- confirm and clean up.
        'boosting_type': 'gbtree',
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'silent': True,
        'verbose': 1,
        'n_jobs': -1,
    },
    'boosting_rounds': 25000,
    'early_stopping_rounds': 200,
    'seeds': [541992, 721991, 1337]
}

xgb_model = TreeModels(**xgb_parameters)
xgb_model.run(X_train_xgb, y_train_xgb, X_test_xgb)

# Release the XGBoost-specific frames and helper objects.
del df_train_xgb, df_test_xgb, X_train_xgb, y_train_xgb, X_test_xgb
del xgb_preprocessor, xgb_parameters, xgb_model

print('Saving XGBoost OOF and Test predictions to current working directory.')
df_train_processed[['id', 'XGBPredictions']].to_csv('xgb_oof_predictions.csv', index=False)
df_test_processed[['id', 'XGBPredictions']].to_csv('xgb_test_predictions.csv', index=False)

### 4.4 Random Forest

In [None]:
model = 'RF'

# Build the Random-Forest-specific train/test frames: 5 shuffled folds, no scaling,
# no engineered or discretized features.
rf_preprocessor = Preprocessor(
    train=df_train,
    test=df_test,
    n_splits=5,
    shuffle=True,
    random_state=cross_validation_seed,
    scaler=None,
    create_features=False,
    discretize_features=False,
    encoder=True,
)
df_train_rf, df_test_rf = rf_preprocessor.transform()

# Report shape and memory footprint (in MB) of both frames in one call.
print(
    f'\n{model} Training Set Shape = {df_train_rf.shape}',
    f'{model} Training Set Memory Usage = {df_train_rf.memory_usage().sum() / 1024 ** 2:.2f} MB',
    f'{model} Test Set Shape = {df_test_rf.shape}',
    f'{model} Test Set Memory Usage = {df_test_rf.memory_usage().sum() / 1024 ** 2:.2f} MB\n',
    sep='\n',
)

# The training matrix carries the 'fold' column next to the features; the test matrix does not.
# DataFrame.copy() is deep by default, so these are independent copies.
X_train_rf = df_train_rf[continuous_features + ['fold']].copy()
y_train_rf = df_train_rf[target].copy()
X_test_rf = df_test_rf[continuous_features].copy()

# These hyperparameters were carried over unchanged from the previous competition.
rf_parameters = dict(
    predictors=continuous_features,
    target=target,
    model=model,
    model_parameters=dict(
        n_estimators=400,
        split_algo=0,
        split_criterion=2,
        bootstrap=True,
        bootstrap_features=False,
        max_depth=13,
        max_leaves=-1,
        max_features=0.5,
        n_bins=2 ** 6,
        random_state=None,
        verbose=True,
    ),
    boosting_rounds=None,
    early_stopping_rounds=None,
    seeds=[541992, 721991, 1337, 42, 0],
)

rf_model = TreeModels(**rf_parameters)
rf_model.run(X_train_rf, y_train_rf, X_test_rf)

# Release the Random-Forest-specific frames and helper objects.
del df_train_rf, df_test_rf
del X_train_rf, y_train_rf, X_test_rf
del rf_preprocessor, rf_parameters, rf_model

print('Saving RandomForest OOF and Test predictions to current working directory.')
df_train_processed[['id', 'RFPredictions']].to_csv('rf_oof_predictions.csv', index=False)
df_test_processed[['id', 'RFPredictions']].to_csv('rf_test_predictions.csv', index=False)

## 5. Submission

In [None]:
# Collect the per-model prediction columns. 'FinalPredictions' is excluded explicitly:
# the submission cell adds that column to df_train_processed, so without the filter a
# re-run of this cell would feed the blend back into the heatmap, the OOF report and the
# geometric blend.
prediction_columns = [col for col in df_train_processed.columns
                      if col.endswith('Predictions') and col != 'FinalPredictions']

# Pairwise correlation between each model's OOF predictions and the target.
fig = plt.figure(figsize=(12, 12), dpi=100)
sns.heatmap(df_train_processed[prediction_columns + [target]].corr(),
            annot=True,
            square=True,
            cmap='coolwarm',
            annot_kws={'size': 15},
            fmt='.4f')

plt.tick_params(axis='x', labelsize=18, rotation=90)
plt.tick_params(axis='y', labelsize=18, rotation=0)
plt.title('Prediction Correlations', size=20, pad=20)

plt.show()

In [None]:
class SubmissionPipeline:
    """Blend per-model predictions into a 'FinalPredictions' column and report OOF RMSE.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding an 'id' column and one column per model prediction. `train` must
        also hold the target column. Both frames are modified in place: the blending
        methods add (or overwrite) a 'FinalPredictions' column on them.
    blend : str
        Either 'weighted_average' or 'geometric_average'.
    prediction_columns : sequence of str
        Names of the per-model prediction columns to score and return. The geometric
        blend averages exactly these columns.
    add_public_best : bool
        Stored on the instance; not used by any method of this class.
    target_column : str, optional
        Name of the target column in `train`. When omitted, the notebook-level `target`
        variable is used, which keeps existing calls working unchanged.
    """

    def __init__(self, train, test, blend, prediction_columns, add_public_best, target_column=None):

        self.train = train
        self.test = test
        self.blend = blend
        self.prediction_columns = prediction_columns
        self.add_public_best = add_public_best
        # Fall back to the notebook global only when the caller did not name the column.
        self.target_column = target if target_column is None else target_column

    @staticmethod
    def _weighted_blend(frame):
        """Return the equal-weight (0.25 each) average of the LGB, CB, XGB and RF columns."""
        return (0.25 * frame['LGBPredictions']) +\
               (0.25 * frame['CBPredictions']) +\
               (0.25 * frame['XGBPredictions']) +\
               (0.25 * frame['RFPredictions'])

    @staticmethod
    def _rmse(y_true, y_pred):
        """Root mean squared error of two 1-D array-likes.

        Computed with numpy so it does not depend on the `squared=False` argument of
        scikit-learn's `mean_squared_error`, which newer releases deprecate.
        """
        errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return float(np.sqrt(np.mean(errors ** 2)))

    def weighted_average(self):
        """Write the fixed equal-weight blend of the four tree models to 'FinalPredictions'."""
        self.train['FinalPredictions'] = self._weighted_blend(self.train)
        self.test['FinalPredictions'] = self._weighted_blend(self.test)

    def geometric_average(self):
        """Write the row-wise geometric mean of `prediction_columns` to 'FinalPredictions'."""
        self.train['FinalPredictions'] = gmean(self.train[self.prediction_columns], axis=1)
        self.test['FinalPredictions'] = gmean(self.test[self.prediction_columns], axis=1)

    def transform(self):
        """Blend, print OOF RMSE per model and for the blend, and return the result frames.

        Returns
        -------
        tuple of pandas.DataFrame
            Deep copies of `train` and `test` restricted to 'id', the prediction columns
            and 'FinalPredictions'.

        Raises
        ------
        ValueError
            If `blend` is not one of the supported strategies. Previously this case was
            skipped silently and failed later on the missing 'FinalPredictions' column
            (or reused a stale one).
        """
        blenders = {
            'weighted_average': self.weighted_average,
            'geometric_average': self.geometric_average,
        }
        if self.blend not in blenders:
            raise ValueError(
                f'Unknown blend {self.blend!r}; expected one of {sorted(blenders)}'
            )
        blenders[self.blend]()

        # Score against the frames held by this instance rather than notebook globals.
        y_true = self.train[self.target_column]
        for prediction_column in self.prediction_columns:
            oof_score = self._rmse(y_true, self.train[prediction_column])
            print(f'{prediction_column.split("Predictions")[0]} OOF RMSE: {oof_score:.6}')
        final_oof_score = self._rmse(y_true, self.train['FinalPredictions'])
        print(f'{"-" * 30}\nFinal OOF RMSE: {final_oof_score:.6}\n{"-" * 30}')

        output_columns = ['id'] + list(self.prediction_columns) + ['FinalPredictions']
        return self.train[output_columns].copy(deep=True), self.test[output_columns].copy(deep=True)
            


# Blend the per-model predictions with the fixed equal-weight average and print OOF scores.
# The pipeline adds 'FinalPredictions' to df_train_processed / df_test_processed in place.
submission_pipeline = SubmissionPipeline(
    train=df_train_processed,
    test=df_test_processed,
    blend='weighted_average',
    prediction_columns=prediction_columns,
    add_public_best=True,
)
df_train_submission, df_test_submission = submission_pipeline.transform()

In [None]:
# Expose the blended test predictions under the 'target' column name and write them,
# together with 'id', to submission.csv.
submission_columns = ['id', 'target']
df_test_processed['target'] = df_test_submission['FinalPredictions']
df_test_processed[submission_columns].to_csv('submission.csv', index=False)

# Summary statistics of the submitted values, shown as the cell output.
df_test_processed[submission_columns].describe()

In [None]:
# Clean up: delete the LightGBM source checkout that was cloned and built during setup.
!rm -rf LightGBM