In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install python-vivid

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from vivid.estimators.base import MetaBlock
from vivid.estimators.boosting.mixins import BoostingEarlyStoppingMixin

from catboost import CatBoostRegressor

In [None]:
from vivid.features.base import BinningCountBlock
from vivid.features.base import CountEncodingBlock
from vivid.features.base import FilterBlock

from vivid.estimators.boosting import XGBRegressorBlock
from vivid.estimators.boosting import LGBMRegressorBlock
from vivid.estimators.boosting.block import create_boosting_seed_blocks

from vivid.estimators.linear import TunedRidgeBlock
from vivid.estimators.svm import SVRBlock
from vivid.estimators.ensumble import RFRegressorBlock
from vivid.estimators.base import EnsembleBlock, BaseBlock

In [None]:
# Read the three competition CSV files (train, test, sample submission) in that order.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv',
    )
)

# Names of the fourteen feature columns used throughout: 'cont1' through 'cont14'.
feature_columns = ['cont{}'.format(idx) for idx in range(1, 15)]

# Regression target: the 'target' column of the training frame as a NumPy array.
y = train_df['target'].values

In [None]:
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

from vivid.core import BaseBlock


class PCABlock(BaseBlock):
    """Feature block that projects the selected columns onto principal components.

    Args:
        n_components: number of principal components to keep.
        columns: names of the input columns fed to PCA. When ``None`` the
            module-level ``feature_columns`` list is used.
        random_state: keyword-only seed forwarded to ``sklearn.decomposition.PCA``.
            The default ``None`` keeps the previous, unseeded behaviour.
        *args, **kwargs: forwarded unchanged to ``BaseBlock.__init__``.
    """

    def __init__(self, n_components=3, columns=None, *args, random_state=None, **kwargs):
        self.n_components = n_components
        self.random_state = random_state

        if columns is None:
            columns = feature_columns
        self.columns = columns
        super().__init__(name='pca_n={}'.format(n_components), *args, **kwargs)

    def fit(self, source_df, y, experiment=None) -> pd.DataFrame:
        """Fit PCA on ``source_df[self.columns]`` and return the transformed training frame.

        ``y`` and ``experiment`` are accepted but not used by this block.
        """
        clf = PCA(n_components=self.n_components, random_state=self.random_state)
        clf.fit(source_df[self.columns].values)
        self.clf_ = clf
        return self.transform(source_df)

    def transform(self, source_df):
        """Project ``source_df[self.columns]`` with the fitted PCA.

        Returns:
            A new DataFrame with one column per component, named ``PCA_0``,
            ``PCA_1``, ... and carrying a fresh default index (the index of
            ``source_df`` is not propagated).
        """
        # Pass a plain ndarray, exactly as fit() does. Handing scikit-learn a DataFrame
        # here while it was fitted on an array triggers its feature-name mismatch warning.
        z = self.clf_.transform(source_df[self.columns].values)
        out_df = pd.DataFrame(z)
        return out_df.add_prefix('PCA_')
    

class GaussianMixtureBlock(BaseBlock):
    """Feature block that emits Gaussian-mixture component probabilities.

    Args:
        n_components: number of mixture components.
        columns: names of the input columns the mixture is fitted on. When
            ``None`` the module-level ``feature_columns`` list is used.
        random_state: keyword-only seed forwarded to
            ``sklearn.mixture.GaussianMixture``. The default ``None`` keeps
            the previous, unseeded behaviour.
        *args, **kwargs: forwarded unchanged to ``BaseBlock.__init__``.
    """

    def __init__(self, n_components=3, columns=None, *args, random_state=None, **kwargs):
        self.n_components = n_components
        self.random_state = random_state

        if columns is None:
            columns = feature_columns
        self.columns = columns
        super().__init__(name='GMM_n={}'.format(n_components), *args, **kwargs)

    def fit(self, source_df, y, experiment=None) -> pd.DataFrame:
        """Fit the mixture on ``source_df[self.columns]`` and return the transformed training frame.

        ``y`` and ``experiment`` are accepted but not used by this block.
        """
        clf = GaussianMixture(n_components=self.n_components, random_state=self.random_state)
        clf.fit(source_df[self.columns].values)
        self.clf_ = clf
        return self.transform(source_df)

    def transform(self, source_df):
        """Return the per-component probabilities for every row of ``source_df``.

        Returns:
            A new DataFrame with one column per component, named ``GMM_0``,
            ``GMM_1``, ... and carrying a fresh default index (the index of
            ``source_df`` is not propagated).
        """
        # Pass a plain ndarray, exactly as fit() does. Handing scikit-learn a DataFrame
        # here while it was fitted on an array triggers its feature-name mismatch warning.
        z = self.clf_.predict_proba(source_df[self.columns].values)
        # Keep every probability strictly inside the open interval (0, 1).
        z = np.clip(z, 1e-6, 1 - 1e-6)
        out_df = pd.DataFrame(z)
        return out_df.add_prefix('GMM_')

In [None]:
from vivid.estimators.boosting.mixins import TunedBoostingBlock
from vivid.estimators.boosting.helpers import get_boosting_parameter_suggestions
from vivid.estimators.boosting.lgbm import LGBMRegressorBlock
import lightgbm as lgbm
from vivid.runner import create_runner

In [None]:
class TunedLightGBMRegressorBlock(TunedBoostingBlock):
    """``TunedBoostingBlock`` configured for LightGBM's ``LGBMRegressor``.

    Uses 'rmse' as the default evaluation metric and starts from the same
    initial parameters as ``LGBMRegressorBlock``.
    """

    model_class = lgbm.LGBMRegressor
    default_eval_metric = 'rmse'
    initial_params = LGBMRegressorBlock.initial_params

    def generate_model_class_try_params(self, trial):
        """Return the parameters to try for ``trial``.

        The values come from ``get_boosting_parameter_suggestions``; ``n_jobs``
        is then forced to -1 on the returned mapping.
        """
        suggested = get_boosting_parameter_suggestions(trial)
        suggested.update(n_jobs=-1)
        return suggested

In [None]:
# Feature blocks shared by both models; each estimator below receives them as its parents.
feature_blocks = [
    BinningCountBlock(name='BINS', column=feature_columns),
    CountEncodingBlock(name='CE', column=feature_columns),
    FilterBlock(name='F', column=feature_columns),
    PCABlock(n_components=3),
    GaussianMixtureBlock(n_components=3),
]

# Plain LightGBM block, no tuning.
normal_lgbm_block = LGBMRegressorBlock(name='normal_lgbm', parent=feature_blocks)

# LightGBM block tuned with Optuna, 50 trials.
tuned_lgbm_block = TunedLightGBMRegressorBlock(
    name='tuned_lgbm',
    parent=feature_blocks,
    n_trials=50,
)

runner = create_runner(blocks=[normal_lgbm_block, tuned_lgbm_block])

### Run Tuning 

* only fit

In [None]:
# Fit every block in the runner on the training features; the returned results are
# read further down as out-of-fold predictions.
train_features = train_df[feature_columns]
oof_results = runner.fit(train_features, y=y)

In [None]:
# Predict on the test set. Select the same feature columns the runner was fitted on,
# so that any other column present in test_df can never reach a block at predict time.
test_results = runner.predict(test_df[feature_columns])

In [None]:
# Out-of-fold overview: one column per block (keyed by block name), holding the first
# column of that block's output frame.
oof_df = pd.DataFrame({
    result.block.name: result.out_df.values[:, 0]
    for result in oof_results
})

### Visualize Models

* Model Output Correlation
* OOF Distribution
* sort by RMSE

In [None]:
# Annotated heatmap of the pairwise correlation between the blocks' out-of-fold outputs.
fig, ax = plt.subplots(figsize=(10, 10))
oof_corr = oof_df.corr()
sns.heatmap(data=oof_corr, ax=ax, annot=True, fmt='.2f', cmap='Blues')
ax.set_title('Out of Fold Correlation')

In [None]:
# Overlay the distribution of every block's out-of-fold output on a single axis.
fig, ax = plt.subplots(figsize=(10, 6))

for result in oof_results:
    oof_values = result.out_df.values[:, 0]
    block_label = str(result.block.name)
    if hasattr(sns, 'histplot'):
        # sns.distplot is deprecated since seaborn 0.11; this histplot call is the
        # replacement seaborn recommends for the same histogram + KDE rendering.
        sns.histplot(oof_values, kde=True, stat='density', kde_kws=dict(cut=3),
                     alpha=.4, edgecolor=(1, 1, 1, .4), ax=ax, label=block_label)
    else:
        # Older seaborn releases only provide distplot.
        sns.distplot(oof_values, ax=ax, label=block_label)

# Label the figure so it can be read on its own.
ax.set_xlabel('out-of-fold prediction')
ax.set_title('Out of Fold Distribution')
ax.legend()
fig.tight_layout()

In [None]:
from vivid.metrics import regression_metrics

# Score every block's out-of-fold output against the training target.
# The rows are collected first and the frame is built once: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row re-copies it on every iteration.
score_rows = [
    pd.Series(regression_metrics(y, pred), name=name)
    for name, pred in oof_df.items()
]
# Each Series name (the block name) becomes the row label of the resulting frame.
score_df = pd.DataFrame(score_rows)

In [None]:
# Show the score table ordered by ascending RMSE.
score_df.sort_values(by='rmse')

In [None]:
# Display the sample submission frame; the submission cell copies it as its template.
sample_submission_df

In [None]:
# Write one submission CSV per block, using the sample submission as the template.
for result in test_results:
    out_df = result.out_df

    sub_df = sample_submission_df.copy()
    # The first column of the block's output frame is written as the 'target' column.
    sub_df['target'] = out_df.values[:, 0]
    to = f'/kaggle/working/{str(result.block.name)}_submission.csv'
    print('save to {}'.format(to))
    sub_df.to_csv(to, index=False)