In [None]:
import os
import gc
import sys
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
import datatable as dt
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import optuna
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
import xgboost as xgb
from catboost import CatBoostRegressor, Pool, CatBoost

from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import RobustScaler, QuantileTransformer, StandardScaler,PowerTransformer
from sklearn.decomposition import PCA

In [None]:
# Read the three competition CSV files (training set, test set and the
# submission template) from the input directory.
path = '../input/tabular-playground-series-jan-2021/'
train_data, test_data, sample = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Add pairwise product features between `cont13` and a handful of other
# continuous columns. The same columns are created, in the same order, on
# both the training and the test frame.
for frame in (train_data, test_data):
    for partner in ('cont4', 'cont11', 'cont7', 'cont2', 'cont10'):
        frame[f'cont13_{partner}_mul'] = frame['cont13'] * frame[partner]

In [None]:
# Bucket the continuous target into equal-width bins and store the integer
# bin label per row; the labels are later used as the stratification key.
# The bin count is 1 + log2(number of rows), truncated to an integer.
num_bins = int(1 + np.log2(len(train_data)))
train_data.loc[:, 'bins'] = pd.cut(
    train_data['target'].to_numpy(),
    bins=num_bins,
    labels=False,
)

# Model inputs: the 14 raw continuous columns followed by the cont13 products.
interaction_features = [f'cont13_cont{k}_mul' for k in (4, 11, 7, 2, 10)]
features = [f'cont{x}' for x in range(1, 15)] + interaction_features
target_feature = 'target'

# Keep only rows whose target is at least 5. Binning above was computed on
# all rows, so bin edges are not affected by this filter.
train_data = train_data[train_data['target'] >= 5]
bins = train_data['bins'].to_numpy()
target = train_data[target_feature].to_numpy()

# From this point on `train_data` / `test_data` are plain NumPy matrices
# (the DataFrames are replaced), restricted to the selected feature columns.
train_data = train_data[features].to_numpy()
test_data = test_data[features].to_numpy()

# Fit the power transform on the training matrix only, then apply the fitted
# transform to the test matrix.
scaler = PowerTransformer()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

In [None]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Both arguments are passed unchanged to `mean_squared_error`; the square
    root of that result is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Cross-validation settings shared by the training loop.
nfolds = 5
seed = 42

# First-level LightGBM regressor.
# NOTE: the original dict also carried 'min_child_samples': 20, which is an
# alias of 'min_data_in_leaf'. LightGBM keeps the primary name when both are
# given (and warns that the alias is ignored), so the alias entry was dead
# and has been removed; the effective value is still min_data_in_leaf=100.
lgb_params = {
    'objective': 'regression',
    'metrics': 'rmse',
    'boosting': 'gbdt',
    'min_data_per_group': 5,
    'num_leaves': 256,
    'max_depth': -1,
    'learning_rate': 0.005,
    'subsample_for_bin': 200000,
    'lambda_l1': 1.074622455507616e-05,
    'lambda_l2': 2.0521330798729704e-06,
    'n_jobs': -1,
    'cat_smooth': 1.0,
    'verbose': -1,
    'feature_pre_filter': False,
    'bagging_fraction': 0.8206341150202605,
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 0.001,
    'bagging_freq': 6,
    'feature_fraction': 0.5,
    'min_gain_to_split': 0.0,
}

# First-level XGBoost regressor (passed to the native `xgb.train` API).
xgb_params = {
    'lambda': 0.0030282073258141168,
    'alpha': 0.01563845128469084,
    'colsample_bytree': 0.55,
    'subsample': 0.7,
    'learning_rate': 0.01,
    'max_depth': 15,
    'random_state': 2020,
    'min_child_weight': 257,
}

# First-level CatBoost regressor.
# NOTE(review): both 'od_type' and 'early_stopping_rounds' are set; CatBoost
# documents early_stopping_rounds as selecting the Iter detector, so the
# 'IncToDec' setting may not be in effect -- confirm which one is intended.
cat_params = {
    'iterations': 4000,
    'depth': 7,
    'verbose': 200,
    'learning_rate': 0.03618554870098452,
    'random_strength': 29,
    'bagging_temperature': 0.5528282676740153,
    'od_type': 'IncToDec',
    'early_stopping_rounds': 100,
    'random_state': seed,
}

# Second-level (stacking) LightGBM regressor trained on the three base
# models' predictions.
lgb_final_params = {
    'objective': 'regression',
    'metrics': 'rmse',
    'num_leaves': 287,
    'max_bin': 817,
    'learning_rate': 0.017005497730644668,
    'max_depth': 5,
    'min_child_weight': 5,
    'feature_fraction': 0.40159350277696726,
    'bagging_fraction': 0.780876382367899,
    'bagging_freq': 8,
    'min_child_samples': 56,
    'lambda_l1': 0.011468211453769755,
    'lambda_l2': 0.00026850241464538264,
    'boosting': 'gbdt',
    'verbose': -1,
}

In [None]:
# Accumulator for the stacked model's test predictions, averaged over folds.
final_preds = np.zeros(test_data.shape[0])

# Stratify the folds on the binned target. `shuffle=True` is required for
# `random_state` to have any effect; scikit-learn raises a ValueError when a
# random_state is supplied without shuffling (older releases ignored it).
kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)

for f, (train_idx, valid_idx) in enumerate(kfold.split(X=train_data, y=bins)):
    print(f"Fold: {f}")
    X_train, X_valid = train_data[train_idx], train_data[valid_idx]
    y_train, y_valid = target[train_idx], target[valid_idx]

    # ---- Level 1: LightGBM -------------------------------------------------
    print(f"{r_}LGBM TRAIN{sr_}")
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    lgb_model = lgb.train(lgb_params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_valid],
                          num_boost_round=10000,
                          verbose_eval=200,
                          early_stopping_rounds=100,
                          )

    # ---- Level 1: XGBoost --------------------------------------------------
    print(f"{g_}XGB TRAIN{sr_}")
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid)

    xgb_model = xgb.train(xgb_params,
                          xgb_train,
                          10000,
                          verbose_eval=200,
                          evals=[(xgb_train, 'train'), (xgb_valid, 'valid')],
                          early_stopping_rounds=100)

    # ---- Level 1: CatBoost -------------------------------------------------
    print(f"{y_}CAT TRAIN{sr_}")
    cat_train = Pool(X_train, y_train)
    cat_valid = Pool(X_valid, y_valid)

    cat_model = CatBoost(cat_params)
    cat_model.fit(cat_train, eval_set=cat_valid)

    # ---- Level 2 inputs: one column per base model -------------------------
    # NOTE(review): `final_train` holds predictions of the base models on the
    # very rows they were fitted on, so the meta-model learns from in-sample
    # predictions. Out-of-fold predictions would be the usual remedy.
    # NOTE(review): depending on the installed xgboost version,
    # Booster.predict may use every boosted round rather than the best
    # early-stopped one -- confirm against the version in use.
    print("STACKING")
    final_train = np.column_stack([lgb_model.predict(X_train),
                                   xgb_model.predict(xgb.DMatrix(X_train)),
                                   cat_model.predict(X_train)])
    final_valid = np.column_stack([lgb_model.predict(X_valid),
                                   xgb_model.predict(xgb.DMatrix(X_valid)),
                                   cat_model.predict(X_valid)])
    final_test = np.column_stack([lgb_model.predict(test_data),
                                  xgb_model.predict(xgb.DMatrix(test_data)),
                                  cat_model.predict(test_data)])

    # ---- Level 2: LightGBM meta-model on the stacked predictions -----------
    print(f"{b_}FINAL TRAINING{sr_}")
    lgb_train = lgb.Dataset(final_train, y_train)
    lgb_valid = lgb.Dataset(final_valid, y_valid)
    lgb_final = lgb.train(lgb_final_params,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_valid],
                          early_stopping_rounds=100,
                          verbose_eval=200,
                          num_boost_round=1000)

    # Each fold contributes an equal share to the final test prediction.
    final_preds += lgb_final.predict(final_test) / nfolds

In [None]:
# Store the fold-averaged predictions in the submission template and save it.
# Bracket assignment is used instead of `sample.target = ...`: attribute
# assignment only updates a column that already exists and would otherwise
# set a plain Python attribute, leaving the written CSV without predictions.
sample['target'] = final_preds.ravel()
sample.to_csv("submission.csv", index=False)
sample.head()

In [None]:
# Three side-by-side distribution plots: predicted test targets, training
# targets, and both concatenated into one sample.
fig, axes = plt.subplots(1, 3, figsize=(15, 7))
panels = [
    (sample.target, "test target distribution"),
    (target, "train target distribution"),
    (np.append(sample.target.to_numpy(), target), "train-test target distribution"),
]
for ax, (values, panel_title) in zip(axes, panels):
    sns.distplot(values, ax=ax)
    ax.set_title(panel_title)