In [None]:
import os
import gc
import sys
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
import datatable as dt
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

from sklearn.ensemble import RandomForestRegressor

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
import xgboost as xgb
from catboost import CatBoostRegressor, Pool, CatBoost

from colorama import Fore, Back, Style
# Short aliases for colorama's foreground colour codes.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
# Alias for colorama's "reset all styling" code.
sr_ = Style.RESET_ALL

import warnings
# Installs a blanket 'ignore' filter, so no warning category is shown afterwards.
# NOTE(review): this also hides deprecation warnings emitted by the ML libraries used below.
warnings.filterwarnings('ignore')

import lightgbm as lgb

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import RobustScaler, QuantileTransformer, StandardScaler,PowerTransformer
from sklearn.decomposition import PCA

In [None]:
# Directory holding the competition CSV files.
path = '../input/tabular-playground-series-jan-2021/'

# Read the training set, the test set and the submission template, in that order.
train_data, test_data, sample = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Display the first rows of the training frame as a quick sanity check.
train_data.head()

In [None]:
# Interaction features: the product of `cont13` with each of five other
# continuous columns. The identical set of columns is added to both frames,
# in the same order (cont4, cont11, cont7, cont2, cont10).
for frame in (train_data, test_data):
    for partner in ('cont4', 'cont11', 'cont7', 'cont2', 'cont10'):
        frame[f'cont13_{partner}_mul'] = frame['cont13'] * frame[partner]

## Preprocessing

In [None]:
## stratified k-fold for regression data
# Discretise the continuous target into 1 + log2(n_rows) equal-width bins so
# the bin label can be used as the stratification key for the CV splits.
num_bins = int(1 + np.log2(len(train_data)))
train_data['bins'] = pd.cut(
    train_data['target'].to_numpy(),
    bins=num_bins,
    labels=False,
)

# Model inputs: the 14 raw continuous columns followed by the cont13 products.
features = [f'cont{idx}' for idx in range(1, 15)]
features.extend([
    'cont13_cont4_mul',
    'cont13_cont11_mul',
    'cont13_cont7_mul',
    'cont13_cont2_mul',
    'cont13_cont10_mul',
])

target_feature = 'target'

# Keep only rows whose target is at least 5. The bin labels were assigned
# before this filter, so they are taken from the filtered frame here.
train_data = train_data.query('target >=5')
target = train_data[target_feature].to_numpy()
bins = train_data['bins'].to_numpy()

# From here on `train_data` / `test_data` are plain NumPy feature matrices.
train_data, test_data = train_data[features].to_numpy(), test_data[features].to_numpy()

# Fit the power transform on the training matrix only, then apply it to test.
scaler = PowerTransformer()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

## LightGBM model

In [None]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    The value is the square root of scikit-learn's `mean_squared_error`
    evaluated on the two inputs.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Shared settings: `seed` is used for the CV splitters and for the XGBoost /
# CatBoost `random_state`; `nfolds` is the number of cross-validation folds.
seed = 2021
nfolds = 5

# Parameters handed to `lgb.train`.
# The former 'min_child_samples': 20 entry was removed: it is an alias of
# 'min_data_in_leaf', and when both are given LightGBM keeps the primary name
# (100 here) and ignores the alias, so the entry was dead and misleading.
params = {
    'objective': 'regression',
    'metrics': 'rmse',
    'boosting': 'gbdt',
    'min_data_per_group': 5,
    'num_leaves': 256,
    'max_depth': -1,
    'learning_rate': 0.005,
    'subsample_for_bin': 200000,
    'lambda_l1': 1.074622455507616e-05,
    'lambda_l2': 2.0521330798729704e-06,
    'n_jobs': -1,
    'cat_smooth': 1.0,
    # NOTE(review): 'silent' and 'importance_type' look like scikit-learn
    # wrapper arguments rather than core LightGBM parameters — confirm they
    # have any effect when passed through `lgb.train`.
    'silent': True,
    'importance_type': 'gain',
    'feature_pre_filter': False,
    'bagging_fraction': 0.8206341150202605,
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 0.001,
    'bagging_freq': 6,
    'feature_fraction': 0.5,
    'min_gain_to_split': 0.0,
}

In [None]:
# Accumulator for the fold-averaged test-set predictions.
lgbm_preds = np.zeros(test_data.shape[0])

# shuffle=True is required for random_state to have any effect: without it the
# seed is ignored by older scikit-learn releases and rejected with a
# ValueError from 0.24 onwards.
kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
lgbm_scores = list()
for train_idx, valid_idx in kfold.split(X=train_data, y=bins):
    lgb_train = lgb.Dataset(train_data[train_idx], target[train_idx])
    lgb_valid = lgb.Dataset(train_data[valid_idx], target[valid_idx], reference=lgb_train)
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer ones expect callbacks instead —
    # confirm against the installed version.
    lgb_model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      num_boost_round=10000,
                      verbose_eval=200,
                      early_stopping_rounds=100,
                      )
    # Score the fold on its held-out rows, then add this fold's share of the
    # averaged test prediction.
    lgbm_scores.append(rmse_score(target[valid_idx], lgb_model.predict(train_data[valid_idx])))
    lgbm_preds += lgb_model.predict(test_data)/nfolds

print("mean rmse score", np.mean(lgbm_scores))

In [None]:
def plot_feature_importance(model):
    """Plot a model's feature importances as two stacked bar charts.

    The top panel ranks the features by 'gain' importance and the bottom panel
    by 'split' importance. Each panel is sorted in descending order, titled
    with its importance type, and annotated with the numeric value next to
    every bar.

    Parameters
    ----------
    model
        Object exposing ``feature_importance(importance_type=...)`` that
        returns one value per entry of the module-level ``features`` list.
    """
    fig, axes = plt.subplots(2, 1, figsize=(10, 10))

    for ax, importance_type in zip(axes, ('gain', 'split')):
        feature_importance = pd.DataFrame({
            "feature": features,
            "importance": model.feature_importance(importance_type=importance_type),
        }).sort_values(by='importance', ascending=False)

        sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
        # Without a title the two panels cannot be told apart.
        ax.set_title(f"Feature importance ({importance_type})")

        # Bars are drawn in sorted order, so the enumerate index is the bar's y position.
        for idx, v in enumerate(feature_importance.importance):
            ax.text(v, idx, "  {:.2e}".format(v))

    # Keep the panel titles and axis labels from overlapping.
    fig.tight_layout()

In [None]:
# `lgb_model` is whatever the LightGBM CV loop assigned last, i.e. the model
# trained on the final fold only.
plot_feature_importance(lgb_model)

## XGBoost

In [None]:
# XGBoost booster parameters. 'lambda' is a Python keyword, so it seeds the
# dict as a literal key; the remaining entries are added as keyword arguments
# in their original order.
params = {'lambda': 0.0030282073258141168}
params.update(
    alpha=0.01563845128469084,
    colsample_bytree=0.55,
    subsample=0.7,
    learning_rate=0.01,
    max_depth=15,
    random_state=2020,
    min_child_weight=257,
)

In [None]:
# Accumulator for the fold-averaged test-set predictions.
xgb_preds = np.zeros(test_data.shape[0])

# shuffle=True is required for random_state to have any effect: without it the
# seed is ignored by older scikit-learn releases and rejected with a
# ValueError from 0.24 onwards.
kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
# Replace the hard-coded random_state in the parameter dict with the shared seed.
params['random_state'] = seed
xgb_scores = list()
for train_idx, valid_idx in kfold.split(X=train_data, y=bins):
    xgb_train = xgb.DMatrix(train_data[train_idx], label=target[train_idx])
    xgb_valid = xgb.DMatrix(train_data[valid_idx], label=target[valid_idx])

    xgb_model = xgb.train(params,
                    xgb_train,
                    10000,
                    verbose_eval=200,
                    evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                    early_stopping_rounds=100)
    # NOTE(review): depending on the XGBoost version, Booster.predict may use
    # every trained tree rather than only those up to the best early-stopping
    # iteration — confirm for the installed release.
    xgb_scores.append(rmse_score(target[valid_idx], xgb_model.predict(xgb.DMatrix(train_data[valid_idx]))))
    xgb_preds += xgb_model.predict(xgb.DMatrix(test_data))/nfolds

print("mean rmse score", np.mean(xgb_scores))

## CatBoost

In [None]:
# CatBoost training parameters; `random_state` takes the shared `seed`.
params = dict(
    l2_leaf_reg=0.02247766515106271,
    max_bin=364,
    subsample=0.6708650091202213,
    learning_rate=0.010290546311954876,
    max_depth=10,
    verbose=200,
    random_state=seed,
    min_data_in_leaf=300,
    loss_function='RMSE',
    n_estimators=25000,
    rsm=0.5,
    early_stopping_rounds=100,
)

In [None]:
# Accumulator for the fold-averaged test-set predictions.
cat_preds = np.zeros(test_data.shape[0])
# shuffle=True is required for random_state to have any effect: without it the
# seed is ignored by older scikit-learn releases and rejected with a
# ValueError from 0.24 onwards.
kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
params['random_state'] = seed
cat_scores = list()
for train_idx, valid_idx in kfold.split(X=train_data, y=bins):
    cat_train = Pool(train_data[train_idx], target[train_idx])
    cat_valid = Pool(train_data[valid_idx], target[valid_idx])

    # A fresh model per fold, evaluated against that fold's held-out pool.
    cat_model = CatBoost(params)
    cat_model.fit(cat_train, eval_set=cat_valid)
    cat_scores.append(rmse_score(target[valid_idx], cat_model.predict(train_data[valid_idx])))
    cat_preds += cat_model.predict(test_data)/nfolds

print('mean rmse score:', np.mean(cat_scores))

## Stacking 

[Notebook](https://www.kaggle.com/maunish/tps-simple-stacking)

In [None]:
# Read the submission produced by the stacking notebook and keep only its
# `target` column as a NumPy array.
stacking_preds = pd.read_csv('../input/tps-simple-stacking/submission.csv')['target'].to_numpy()

## Correlation matrix of model predictions

In [None]:
# One column of test-set predictions per model, then an annotated heatmap of
# their pairwise correlations.
predictions = pd.DataFrame({
    "lgbm": lgbm_preds,
    "xgboost": xgb_preds,
    'catboost': cat_preds,
    'stacking': stacking_preds,
})
plt.figure(figsize=(7, 7))
correlations = predictions.corr()
sns.heatmap(correlations, annot=True);

## Submission

In [None]:
# Weighted blend of the four prediction sets (the weights sum to 1), written
# into the submission template and saved to disk.
blend = (
    0.6 * lgbm_preds.ravel()
    + 0.2 * xgb_preds.ravel()
    + 0.1 * cat_preds.ravel()
    + 0.1 * stacking_preds
)
sample.target = blend
sample.to_csv("submission.csv", index=False)
sample.head()

In [None]:
# Three side-by-side panels: blended test predictions, training targets, and
# both overlaid on the same axes.
fig, (ax_test, ax_train, ax_both) = plt.subplots(1, 3, figsize=(15, 7))

sns.distplot(sample.target, ax=ax_test)
ax_test.set_title("test-target distribution")

sns.distplot(target, ax=ax_train)
ax_train.set_title("train-target distribution")

sns.distplot(sample.target.to_numpy(), label='test', ax=ax_both)
sns.distplot(target, label='target', ax=ax_both)
ax_both.legend()
ax_both.set_title("train and test target distribution");