In [None]:
#standard
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from tqdm.notebook import tqdm
from pathlib import Path

#sklearn data_preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
#sklearn categorical encoding
import category_encoders as ce

#sklearn modelling
from sklearn.model_selection import KFold

# boosting library
import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

# Read the competition CSVs from the Kaggle input directory (path is relative
# to the notebook's working directory).
# `data` holds the training table; `test` holds the rows to predict.
data = pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv")


In [None]:
# Directory layout, resolved relative to the parent of the working directory.
ROOT = Path.cwd().parent
INPUT = ROOT / "input"
WORK = ROOT / "working"
DATA = INPUT / "tabular-playground-series-feb-2021"

# List what the competition folder contains.
for entry in DATA.iterdir():
    print(entry.name)

In [None]:
# Load the three competition tables from DATA and report their shapes.
train = pd.read_csv(DATA / "train.csv")
test = pd.read_csv(DATA / "test.csv")
smpl_sub = pd.read_csv(DATA / "sample_submission.csv")

frame_shapes = (train.shape, test.shape, smpl_sub.shape)
print("train: {}, test: {}, sample sub: {}".format(*frame_shapes))

In [None]:
# Summary statistics for every column of `data`; include="all" covers
# non-numeric columns as well as numeric ones.
data.describe(include = "all")

In [None]:
# Set "id" as the index of both frames (in place, so existing references to
# `data` and `test` see the change).
# The guard makes this cell safe to re-run: once "id" has been moved into the
# index it is no longer a column, and an unguarded set_index would raise
# KeyError.
for frame in (data, test):
    if "id" in frame.columns:
        frame.set_index("id", inplace=True)

In [None]:
# Column layout once "id" is the index: 10 categorical columns, then the
# continuous columns, then the target in the last position.
cat_feats = data.columns[:10]
numeric_feats = data.columns[10:-1]

# Feature matrix = everything except the last column; target = last column
# as a plain array.
train = data.iloc[:, :-1]
target = data.iloc[:, -1].values

## **Function definitions**

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equally shaped inputs.

    Both arguments are converted to float NumPy arrays first, so plain lists
    are accepted and pandas Series are compared position by position rather
    than being aligned on their index labels.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    numpy.float64
        sqrt(mean((y_true - y_pred) ** 2)).
    """
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

In [None]:
def enc_scl_pipe(X_train, y_train, X_test, enc_method, scaler=None):
    """Encode object-dtype columns, then scale every column.

    The encoder and the scaler are fitted on ``X_train`` only and then applied
    to both ``X_train`` and ``X_test``. Neither input frame is modified.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame used to fit the encoder and the scaler.
    y_train : array-like
        Target for ``X_train``; only used when ``enc_method == 'glmm'``.
    X_test : pandas.DataFrame
        Frame that is transformed with the fitted encoder and scaler.
    enc_method : {'label', 'glmm'}
        ``'label'`` fits one ``LabelEncoder`` per object column;
        ``'glmm'`` fits a single ``GLMMEncoder`` on all object columns.
    scaler : object with ``fit`` / ``transform``, optional
        Scaler to use. When omitted a fresh ``StandardScaler()`` is created
        for this call, so no fitted state is shared between calls.

    Returns
    -------
    X_train_scaled : pandas.DataFrame
    X_test_scaled : pandas.DataFrame
        Same columns and index as the corresponding input.
    feature_to_encode : list of str
        Names of the object-dtype columns that were encoded.

    Raises
    ------
    ValueError
        If ``enc_method`` is neither ``'label'`` nor ``'glmm'``.
    """
    # Validate first so that a typo fails immediately with a clear message
    # (raising a bare string is itself a TypeError in Python 3).
    if enc_method not in ('label', 'glmm'):
        raise ValueError(
            f"Unknown enc_method {enc_method!r}; expected 'label' or 'glmm'"
        )
    if scaler is None:
        scaler = StandardScaler()

    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    # Columns stored with object dtype are the ones that get encoded.
    feature_to_encode = X_train.columns[X_train.dtypes == 'O'].tolist()

    if enc_method == 'label':
        for feat in feature_to_encode:
            label_encoder = LabelEncoder()
            label_encoder.fit(X_train[feat])
            X_train_encoded[feat] = label_encoder.transform(X_train[feat])
            # NOTE(review): presumably every category in X_test also occurs
            # in X_train; confirm how unseen categories should be handled.
            X_test_encoded[feat] = label_encoder.transform(X_test[feat])
    else:  # enc_method == 'glmm'
        glmm_encoder = ce.glmm.GLMMEncoder(verbose=0, binomial_target=False)
        glmm_encoder.fit(X_train[feature_to_encode], y_train)
        X_train_encoded[feature_to_encode] = glmm_encoder.transform(X_train[feature_to_encode])
        X_test_encoded[feature_to_encode] = glmm_encoder.transform(X_test[feature_to_encode])

    # Scale all columns (encoded and originally numeric) with train statistics.
    scaler.fit(X_train_encoded)
    X_train_scaled = pd.DataFrame(
        scaler.transform(X_train_encoded),
        columns=X_train_encoded.columns,
        index=X_train_encoded.index,
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test_encoded),
        columns=X_test_encoded.columns,
        index=X_test_encoded.index,
    )

    return X_train_scaled, X_test_scaled, feature_to_encode

In [None]:
def kfold_CV_pipe(x_train, y_train, test, model, columns,enc_method, n_splits = 10):
    """Run K-fold cross-validation and average the test predictions over folds.

    For each fold the encoder/scaler is re-fitted on that fold's training part
    (via ``enc_scl_pipe``), ``model`` is fitted with the validation part as
    its early-stopping eval set, and predictions are collected for the
    validation rows and for the whole test set.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target, positionally aligned with ``x_train``.
    test : pandas.DataFrame
        Test features.
    model : estimator
        Object whose ``fit`` accepts ``eval_set``, ``eval_metric``,
        ``early_stopping_rounds`` and ``verbose``, and that has ``predict``.
    columns : sequence of str
        Feature columns passed to the model.
    enc_method : str
        Forwarded to ``enc_scl_pipe``.
    n_splits : int, default 10
        Number of (unshuffled) folds.

    Returns
    -------
    train_oof_preds : numpy.ndarray
        Out-of-fold prediction for every row of ``x_train``.
    test_preds : numpy.ndarray
        Mean of the per-fold test predictions.
    all_scores : list of float
        RMSE on each fold's validation part.
    """
    all_scores = []
    kf = KFold(n_splits=n_splits)
    # Positional (fold-index) access below needs a plain array, not a Series.
    y_values = np.asarray(y_train)

    # The test set is encoded/scaled once, using a fit on the full training data.
    # NOTE(review): each fold's model is trained on features encoded from that
    # fold's own fit, so test features come from a slightly different fit;
    # confirm this is intended.
    _, test, _ = enc_scl_pipe(x_train, y_train, test, enc_method=enc_method)
    test_features = test[columns]

    # Size the out-of-fold buffer from the data instead of a fixed row count.
    train_oof_preds = np.zeros(len(x_train))
    test_preds = 0
    for trn_idx, val_idx in tqdm(kf.split(x_train, y_values), total=n_splits):
        fold_train = x_train.iloc[trn_idx][columns]
        fold_val = x_train.iloc[val_idx][columns]
        train_target, val_target = y_values[trn_idx], y_values[val_idx]

        # Fit encoder + scaler on this fold's training part only.
        fold_train, fold_val, _ = enc_scl_pipe(
            fold_train, train_target, fold_val, enc_method=enc_method
        )

        # NOTE(review): these fit() keyword arguments are tied to the installed
        # LightGBM / XGBoost versions; verify they are still accepted there.
        model.fit(fold_train, train_target, eval_set=[(fold_val, val_target)],
                  eval_metric=['rmse'],
                  early_stopping_rounds=2000,
                  verbose=0)

        temp_oof = model.predict(fold_val)
        temp_test = model.predict(test_features)

        # Each validation row is predicted exactly once across the folds;
        # test predictions are averaged with equal weight per fold.
        train_oof_preds[val_idx] = temp_oof
        test_preds += temp_test/n_splits

        fold_score = rmse(val_target, temp_oof)
        all_scores.append(fold_score)
        print(fold_score)
    return  train_oof_preds, test_preds, all_scores

In [None]:
# Hyper-parameter sets, unpacked into the estimator constructors
# (lgb.LGBMRegressor / xgb.XGBRegressor) further down.

# LightGBM base: few trees with a large learning rate, used for quick
# comparisons of encodings / features.
lgb_params_base = {'subsample': 0.8,
             'learning_rate': 0.5,
             'max_depth': 30,
             'num_leaves': 80,
             'min_child_samples': 100,
             'random_state': 22,
             'n_estimators': 100,
             'metric': 'rmse',
             'max_bin': 600, 
             'cat_l2': 4,
             'cat_smooth': 79}

# LightGBM params: small learning rate with a high n_estimators cap.
lgb_params = {'reg_alpha': 6.15,
             'reg_lambda': 0.0025,
             'colsample_bytree': 0.3,
             'subsample': 0.8,
             'learning_rate': 0.004,
             'max_depth': 15,
             'num_leaves': 80,
             'min_child_samples': 250,
             'random_state': 22,
             'n_estimators': 20000,
             'metric': 'rmse',
             'max_bin': 600, 
             'cat_smooth': 50}

# LightGBM params 2
# NOTE(review): unlike the two sets above, this one sets no 'random_state';
# confirm whether that is intentional.
lgb_params_2 = {'max_depth': 16, 
                'subsample': 0.8032697250789377, 
                'colsample_bytree': 0.21067140508531404, 
                'learning_rate': 0.003,
                'reg_lambda': 5.25, 
                'reg_alpha': 8.2914, 
                'min_child_samples': 31, 
                'num_leaves': 320, 
                'max_bin': 522, 
                'cat_smooth': 81, 
                'cat_l2': 0.029690334194270022, 
                'metric': 'rmse', 
                'n_jobs': -1, 
                'n_estimators': 20000}

# xgboost
# 'objective' used to appear twice in this literal with the same value; the
# redundant second entry is removed (the resulting dict is unchanged).
xgb_params = {
                'booster':'gbtree',
                'n_estimators':20000,
                'max_depth':11, 
                "learning_rate": 0.009,
                'gamma':3.5,
                'objective':'reg:squarederror',
                'verbosity':0,
                'subsample':0.65,
                'colsample_bytree':0.3,
                'reg_lambda':0.5,
                'reg_alpha':8,
                'scale_pos_weight':1,
                'eval_metric':'rmse',
                'seed': 22,
                'tree_method':'gpu_hist',
                'gpu_id':0
                }

In [None]:
def feature_engineering(data):
    """Return a copy of ``data`` with six pairwise interaction columns added.

    Three columns are the ``+`` combination of two ``cat*`` columns and three
    are the product of two ``cont*`` columns. The input frame is not modified.
    """
    # (new column, left operand, right operand)
    summed_pairs = (
        ('cat2p6', 'cat2', 'cat6'),
        ('cat6p1', 'cat6', 'cat1'),
        ('cat2p1', 'cat2', 'cat1'),
    )
    product_pairs = (
        ('cont0p8', 'cont0', 'cont8'),
        ('cont0p5', 'cont0', 'cont5'),
        ('cont11p8', 'cont11', 'cont8'),
    )

    enriched = data.copy()
    for new_col, left, right in summed_pairs:
        enriched[new_col] = enriched[left] + enriched[right]
    for new_col, left, right in product_pairs:
        enriched[new_col] = enriched[left] * enriched[right]
    return enriched

# Build the feature-engineered versions of both frames and show their shapes.
new_train, new_test = (feature_engineering(frame) for frame in (train, test))

for engineered in (new_train, new_test):
    print(engineered.shape)

In [None]:
# Baseline LightGBM comparison: the same base parameters are cross-validated
# with label encoding, GLMM encoding, and GLMM encoding on the engineered
# features. Each run overwrites the lgb_* result variables, so after this
# cell they hold the results of the last configuration.
base_runs = (
    ("lgb base CV scores label", train, test, 'label'),
    ("lgb base CV scores glmm", train, test, 'glmm'),
    ("lgb base new features CV scores", new_train, new_test, 'glmm'),
)

for run_no, (title, fit_frame, predict_frame, encoding) in enumerate(base_runs):
    if run_no:
        # Separator between consecutive runs (none after the final one).
        print("---------------------------------------------------------------")
    print(title)
    model = lgb.LGBMRegressor(**lgb_params_base)
    lgb_train_oof, lgb_test_preds, lgb_all_scores = kfold_CV_pipe(
        fit_frame, target, predict_frame, model,
        enc_method=encoding, columns=fit_frame.columns,
    )
    oof_score = rmse(target, lgb_train_oof)
    print(f"lgb oof score: {oof_score:.6f}")

In [None]:
# Cross-validate the three tuned models with GLMM encoding. Each run keeps
# its own out-of-fold / test predictions for the submissions and the blend.
banner = "\n".join((
    "---------------------------------------------------------------",
    "===============================================================",
    "---------------------------------------------------------------",
))
cv_options = dict(enc_method='glmm', columns=train.columns)

# LightGBM, first tuned parameter set
print(banner)
print("lgb1 CV scores")
model = lgb.LGBMRegressor(**lgb_params)
lgb1_train_oof, lgb1_test_preds, lgb1_all_scores = kfold_CV_pipe(
    train, target, test, model, **cv_options
)
oof_score = rmse(target, lgb1_train_oof)
print(f"lgb1 oof score: {oof_score:.6f}")

# LightGBM, second tuned parameter set
print(banner)
print("lgb2 CV scores")
model = lgb.LGBMRegressor(**lgb_params_2)
lgb2_train_oof, lgb2_test_preds, lgb2_all_scores = kfold_CV_pipe(
    train, target, test, model, **cv_options
)
oof_score = rmse(target, lgb2_train_oof)
print(f"lgb2 oof score: {oof_score:.6f}")

# XGBoost
print(banner)
print("xgb CV scores")
model = xgb.XGBRegressor(**xgb_params)
xgb_train_oof, xgb_test_preds, xgb_all_scores = kfold_CV_pipe(
    train, target, test, model, **cv_options
)
oof_score = rmse(target, xgb_train_oof)
print(f"xgb oof score: {oof_score:.6f}")

In [None]:
# Submission file from the lgb1 test predictions, in sample-submission layout.
sub = smpl_sub.assign(target=lgb1_test_preds)
sub.to_csv("lgb1_submission.csv", index=False)


In [None]:
# Submission file from the lgb2 test predictions, in sample-submission layout.
sub = smpl_sub.assign(target=lgb2_test_preds)
sub.to_csv("lgb2_submission.csv", index=False)

In [None]:
# Submission file from the xgb test predictions, in sample-submission layout.
sub = smpl_sub.assign(target=xgb_test_preds)
sub.to_csv("xgb_submission.csv", index=False)

In [None]:
# Blend weights, in the order lgb1, lgb2, xgb.
weights = [0.2, 0.4, 0.4]
w_lgb1, w_lgb2, w_xgb = weights

# Score the blend on the out-of-fold predictions.
oof_pred_wavg = w_lgb1*lgb1_train_oof + w_lgb2*lgb2_train_oof + w_xgb*xgb_train_oof
oof_score_wavg = rmse(target, oof_pred_wavg)
print(f"oof score weighted avg: {oof_score_wavg:.6f}")

# Apply the same weights to the test predictions and write the submission.
test_pred_wavg = w_lgb1*lgb1_test_preds + w_lgb2*lgb2_test_preds + w_xgb*xgb_test_preds

sub = smpl_sub.assign(target=test_pred_wavg)
sub.to_csv("wavg_submission.csv", index=False)