In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(root, file_name)
    for root, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install python-vivid

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from vivid.estimators.base import MetaBlock
from vivid.estimators.boosting.mixins import BoostingEarlyStoppingMixin

from catboost import CatBoostRegressor

In [None]:
from vivid.features.base import BinningCountBlock
from vivid.features.base import CountEncodingBlock
from vivid.features.base import FilterBlock

from vivid.estimators.boosting import XGBRegressorBlock
from vivid.estimators.boosting import LGBMRegressorBlock
from vivid.estimators.boosting.block import create_boosting_seed_blocks

from vivid.estimators.linear import TunedRidgeBlock
from vivid.estimators.svm import SVRBlock
from vivid.estimators.ensumble import RFRegressorBlock
from vivid.estimators.base import EnsembleBlock, BaseBlock

In [None]:
# Load the train / test / sample-submission tables in a single pass.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv',
    )
)

# Names of the model input columns: 'cont1' through 'cont14'.
feature_columns = [f'cont{i}' for i in range(1, 15)]

# Regression target as a plain NumPy array.
y = train_df['target'].to_numpy()

In [None]:
!pip install ptitprince

In [None]:
# List the installed packages and keep only the seaborn line, i.e. show the installed seaborn version.
!pip freeze | grep seaborn

In [None]:
!pip install -U seaborn

In [None]:
from ptitprince import RainCloud

In [None]:
# Draw a RainCloud plot of the target `y` with horizontal orientation.
RainCloud(y, orient='h')

In [None]:
# Distribution plot of the raw target values.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases — consider `histplot` / `displot`.
sns.distplot(y)

In [None]:
# Lower-tail percentiles (0.5%, 1%, 10%, 20%) of the target.
np.percentile(y, q=[0.5, 1, 10, 20])

In [None]:
# Distribution plot of 10 raised to the target, element-wise.
sns.distplot(np.power(10, y))

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a 2-component PCA on the feature columns of the training frame.
model = PCA(n_components=2)
model.fit(train_df.loc[:, feature_columns])

In [None]:
# Project the training features onto the two fitted principal components.
z = model.transform(train_df.loc[:, feature_columns])

In [None]:
# Scatter the two PCA coordinates against each other, coloured by the target.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(z[:, 0], z[:, 1], c=y, alpha=0.1)

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Fit a 3-component Gaussian mixture on the feature matrix and keep the
# per-row component probabilities.
# random_state is fixed so that repeated runs give the same mixture.
# NOTE: this rebinds `z`, which previously held the PCA projection.
x = train_df[feature_columns].values
clf = GaussianMixture(n_components=3, random_state=0)
clf.fit(x)
z = clf.predict_proba(x)

In [None]:
# Pair plot of the mixture probabilities together with the target (last column)
# on 400 sampled rows; the sample is seeded so the figure is reproducible.
sns.pairplot(pd.DataFrame(np.column_stack([z, y])).sample(400, random_state=0))

In [None]:
from vivid.core import BaseBlock

In [None]:
class PCABlock(BaseBlock):
    """Feature block that projects selected columns onto principal components.

    Args:
        n_components: number of principal components to keep.
        columns: columns of the source frame fed to PCA. Defaults to the
            notebook-level `feature_columns` list.
        *args, **kwrgs: forwarded to `BaseBlock.__init__`. A `name` keyword
            overrides the default block name `pca_n=<n_components>`.
    """

    def __init__(self, n_components=3, columns=None, *args, **kwrgs):
        self.n_components = n_components

        if columns is None:
            columns = feature_columns
        self.columns = columns

        # Only supply the default name when the caller gave none; a hard-coded
        # `name=` keyword would collide with a caller-provided name.
        if not args:
            kwrgs.setdefault('name', 'pca_n={}'.format(n_components))
        super().__init__(*args, **kwrgs)

    def fit(self, source_df, y, experiment=None) -> pd.DataFrame:
        """Fit PCA on `source_df[self.columns]` and return the projected rows.

        `y` and `experiment` are accepted but not used by this block.
        """
        clf = PCA(n_components=self.n_components)
        clf.fit(source_df[self.columns].values)
        self.clf_ = clf
        return self.transform(source_df)

    def transform(self, source_df):
        """Project `source_df[self.columns]` with the fitted PCA.

        Returns:
            DataFrame with one `PCA_<i>` column per component, indexed like
            `source_df`.
        """
        # Pass the raw ndarray, exactly like `fit`, so scikit-learn does not warn
        # about feature names that were absent at fit time.
        z = self.clf_.transform(source_df[self.columns].values)
        # Keep the source index so the features stay row-aligned with the input.
        out_df = pd.DataFrame(z, index=source_df.index)
        return out_df.add_prefix('PCA_')
    

class GaussianMixtureBlock(BaseBlock):
    """Feature block exposing Gaussian-mixture component probabilities.

    Args:
        n_components: number of mixture components.
        columns: columns of the source frame fed to the mixture model.
            Defaults to the notebook-level `feature_columns` list.
        *args, **kwrgs: forwarded to `BaseBlock.__init__`. A `name` keyword
            overrides the default block name `GMM_n=<n_components>`.
    """

    def __init__(self, n_components=3, columns=None, *args, **kwrgs):
        self.n_components = n_components

        if columns is None:
            columns = feature_columns
        self.columns = columns

        # Only supply the default name when the caller gave none; a hard-coded
        # `name=` keyword would collide with a caller-provided name.
        if not args:
            kwrgs.setdefault('name', 'GMM_n={}'.format(n_components))
        super().__init__(*args, **kwrgs)

    def fit(self, source_df, y, experiment=None) -> pd.DataFrame:
        """Fit the mixture on `source_df[self.columns]` and return its features.

        `y` and `experiment` are accepted but not used by this block.
        """
        clf = GaussianMixture(n_components=self.n_components)
        clf.fit(source_df[self.columns].values)
        self.clf_ = clf
        return self.transform(source_df)

    def transform(self, source_df):
        """Return the per-component probabilities of every row.

        Returns:
            DataFrame with one `GMM_<i>` column per component, indexed like
            `source_df`; values are clipped to [1e-6, 1 - 1e-6].
        """
        # Pass the raw ndarray, exactly like `fit`, so scikit-learn does not warn
        # about feature names that were absent at fit time.
        z = self.clf_.predict_proba(source_df[self.columns].values)
        # Keep probabilities strictly inside (0, 1).
        z = np.clip(z, 1e-6, 1 - 1e-6)
        # Keep the source index so the features stay row-aligned with the input.
        out_df = pd.DataFrame(z, index=source_df.index)
        return out_df.add_prefix('GMM_')

In [None]:
# Quick check: fit a default PCABlock on the training frame and display the features it returns.
PCABlock().fit(train_df, y)

In [None]:
# Quick check: fit a default GaussianMixtureBlock on the training frame and display the features it returns.
GaussianMixtureBlock().fit(train_df, y)

In [None]:
from vivid.estimators.boosting import LGBMRegressorBlock
from vivid.runner import create_runner

In [None]:
class CatRegressorBlock(BoostingEarlyStoppingMixin, MetaBlock):
    """Regressor block backed by CatBoost, using the boosting early-stopping mixin."""

    # estimator class used by this block
    model_class = CatBoostRegressor

    fit_verbose = 100
    early_stopping_rounds = 200

    # keyword arguments handed to the estimator's __init__
    initial_params = {
        'learning_rate': 0.05,
        'verbose': 100,
        'num_boost_round': 20000,
    }

    def get_fit_params_on_each_fold(self, *args, **kwrgs):
        """Build the keyword arguments passed to `model.fit` for one fold.

        Starts from the parameters produced by the parent implementation, sets
        `verbose`, and drops the entries meant for lightGBM / XGBoost.
        > see: https://github.com/nyk510/vivid/blob/master/vivid/estimators/boosting/mixins.py#L15
        """
        fit_params = super().get_fit_params_on_each_fold(*args, **kwrgs)
        fit_params['verbose'] = self.fit_verbose

        # keys intended for lightGBM / XGBoost are removed when present
        for unsupported_key in ('eval_metric', 'callbacks'):
            fit_params.pop(unsupported_key, None)
        return fit_params

In [None]:
# The column-wise blocks share one constructor shape, so build them from
# (name, class) pairs; the PCA / GMM blocks are appended afterwards.
feature_blocks = [
    block_class(name=block_name, column=feature_columns)
    for block_name, block_class in (
        ('BINS', BinningCountBlock),
        ('CE', CountEncodingBlock),
        ('F', FilterBlock),
    )
]
feature_blocks += [
    PCABlock(n_components=3),
    GaussianMixtureBlock(n_components=3),
]


# Baseline runner: a single CatBoost block on top of all feature blocks.
runner = create_runner(
    blocks=[CatRegressorBlock(name='cat', parent=feature_blocks)],
)

In [None]:
# Training target with a floor of 5 and no upper bound.
y_cutted = np.clip(y, a_min=5, a_max=None)

In [None]:
# Fit the CatBoost-only runner on the feature columns against the clipped target.
# `oof_results` presumably holds one out-of-fold result per block — confirm against vivid.
oof_results = runner.fit(train_df[feature_columns], y=y_cutted)

In [None]:
ls -lat

* cont13 / 3 / 4 are very important features.
* The ground truth and the out-of-fold predictions do not look similar ;(
    * `y` has a long-tailed distribution, but the objective and metric are both `RMSE`, which assumes normally distributed noise.

### Modeling

Create more complex models.

In [None]:
class FillnaBlock(BaseBlock):
    """Replace missing values with per-column medians learned at fit time."""

    def fit(self, source_df, y, experiment=None) -> pd.DataFrame:
        """Learn the fill values from `source_df` and return it with NaNs filled.

        `y` and `experiment` are accepted but not used by this block.
        """
        # `median()` already skips NaN column by column. Dropping whole rows
        # first would discard valid values and, when every row has a NaN
        # somewhere, leave nothing to compute the medians from.
        self.fill_values_ = source_df.median()
        return self.transform(source_df)

    def transform(self, source_df):
        """Return `source_df` with NaNs replaced by the medians stored in `fit`."""
        return source_df.fillna(self.fill_values_)

In [None]:
# Feature blocks feeding the first-stage models: binning-count, count-encoding
# and filter blocks over `feature_columns`, plus 3-component PCA and
# Gaussian-mixture features. New block instances are created here rather than
# reusing the ones built for the earlier CatBoost-only runner.
feature_blocks = [
    BinningCountBlock(name='BINS', column=feature_columns),
    CountEncodingBlock(name='CE', column=feature_columns),
    FilterBlock(name='F', column=feature_columns),
    PCABlock(n_components=3),
    GaussianMixtureBlock(n_components=3)
]


# Same features with NaNs replaced by column medians (see FillnaBlock).
filled_feature_block = FillnaBlock(name='FNA', parent=feature_blocks)

In [None]:
# First stage: models trained on the feature blocks. The boosting models read
# `feature_blocks` directly; random forest and ridge read the NaN-filled
# `filled_feature_block` instead.
# NOTE(review): `create_boosting_seed_blocks` presumably returns several blocks
# (one per seed), which would make this list nested — confirm vivid accepts
# that as `parent`.
single_models = [
    create_boosting_seed_blocks(feature_class=XGBRegressorBlock, 
                                prefix='xgb_', 
                                parent=feature_blocks),
    create_boosting_seed_blocks(feature_class=LGBMRegressorBlock, 
                                prefix='lgb_', 
                                parent=feature_blocks),
    CatRegressorBlock(name='cat', parent=feature_blocks),
    RFRegressorBlock(name='rf', parent=filled_feature_block),
    TunedRidgeBlock(name='ridge', 
                    add_init_param={ 'target_scaling': 'standard' },
                    n_trials=30, 
                    parent=filled_feature_block)
]

# Second stage: models stacked on the first-stage models. The ensemble and
# ridge blocks see only the first-stage models; the boosting stackers also
# receive the original feature blocks.
stacking_models = [
    EnsembleBlock(prefix='ens', parent=single_models),
    TunedRidgeBlock(name='stacking_ridge', n_trials=30, parent=single_models, 
                    add_init_param={ 'target_scaling': 'standard' }),
    LGBMRegressorBlock(name='stacked_lgb', parent=[*single_models, *feature_blocks]),
    CatRegressorBlock(name='stacked_cat', parent=[*single_models, *feature_blocks]),
    XGBRegressorBlock(name='stacked_xgb', parent=[*single_models, *feature_blocks])
]

# Third stage: a single tuned ridge on top of all second-stage models.
two_stage_stacking_models = [
    TunedRidgeBlock(name='stage-2_ridge', n_trials=30, parent=stacking_models, 
                    add_init_param={ 'target_scaling': 'standard' },)
]

In [None]:
from vivid.backends.experiments import LocalExperimentBackend

# Build the runner for the full stacking pipeline (this rebinds `runner`).
# The experiment backend is pointed at /kaggle/working/ — presumably where vivid
# stores its artifacts; confirm against the vivid docs.
runner = create_runner(two_stage_stacking_models, 
                       experiment=LocalExperimentBackend(to='/kaggle/working/'))

In [None]:
# train models
# Pass only the feature columns (as the earlier CatBoost-only run does) so the
# frame handed to the runner never contains the `target` column itself.
oof_results = runner.fit(train_df[feature_columns], y=y_cutted)

In [None]:
# predict: run the fitted runner on the test frame; `test_results` is iterated
# block by block when the submission files are written.
test_results = runner.predict(test_df)

In [None]:
# create out-of-fold overview: one column per block, holding the first column
# of that block's out-of-fold output
oof_df = pd.DataFrame({
    result.block.name: result.out_df.values[:, 0]
    for result in oof_results
})

### Visualize Models

* Model Output Correlation
* OOF Distribution
* Sort by RMSE

In [None]:
# Pairwise correlation between the out-of-fold predictions of all blocks.
oof_corr = oof_df.corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=oof_corr, ax=ax, annot=True, fmt='.2f', cmap='Blues')
ax.set_title('Out of Fold Correlation')

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Overlay the out-of-fold prediction distribution of every block on one axis.
for result in oof_results:
    block_label = str(result.block.name)
    oof_pred = result.out_df.values[:, 0]
    sns.distplot(oof_pred, ax=ax, label=block_label)

ax.legend()
fig.tight_layout()

In [None]:
from vivid.metrics import regression_metrics

# One row of regression metrics per block, indexed by block name.
# All rows are collected first and the frame is built once: growing a frame
# with `DataFrame.append` is quadratic, and the method no longer exists in
# pandas 2.x.
# NOTE: scores are computed against the original `y`, not the clipped
# `y_cutted` that the models were trained on.
score_df = pd.DataFrame([
    pd.Series(regression_metrics(y, pred), name=name)
    for name, pred in oof_df.items()
])

In [None]:
# Rank the blocks by out-of-fold RMSE, lowest first.
score_df.sort_values(by='rmse')

In [None]:
# Display the sample submission frame; the submission files written below are copies of it with `target` replaced.
sample_submission_df

In [None]:
# Write one submission file per block: a copy of the sample submission whose
# `target` column is replaced by the first column of the block's test output.
for result in test_results:
    sub_df = sample_submission_df.copy()
    sub_df['target'] = result.out_df.values[:, 0]

    to = f'/kaggle/working/{result.block.name!s}_submission.csv'
    print(f'save to {to}')
    sub_df.to_csv(to, index=False)