# Tabular Playground (Feb 2021)

In [None]:
# Import the required tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from mlxtend.regressor import StackingRegressor 
import optuna
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import the data
# Both files are read from one input directory so the notebook does not mix
# an absolute path for one file with a relative path for the other.
DATA_DIR = "../input/tabular-playground-series-feb-2021"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

## Data Visualization

In [None]:
# Count plot of the categorical features (cat0..cat9), one panel per feature.
cat_columns = [f"cat{i}" for i in range(10)]
fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(14, 16))
# ax.flat walks the 5x2 grid row by row: cat0 top-left, cat1 top-right, ...
for column, axis in zip(cat_columns, ax.flat):
    # The column is passed as `x=` explicitly instead of positionally, so the
    # call does not depend on what the first positional parameter means in the
    # installed seaborn release.
    sns.countplot(x=train[column], ax=axis)

### Distribution of the continuous features

In [None]:
# PDF (kernel density estimate) of the continuous features cont0..cont13,
# one panel per feature.
cont_columns = [f"cont{i}" for i in range(14)]
fig, ax = plt.subplots(nrows=7, ncols=2, figsize=(14, 24))
# ax.flat walks the 7x2 grid row by row: cont0 top-left, cont1 top-right, ...
for column, axis in zip(cont_columns, ax.flat):
    sns.kdeplot(x=train[column], ax=axis)

In [None]:
# CDF (empirical cumulative distribution) of the continuous features
# cont0..cont13, one panel per feature.
cont_columns = [f"cont{i}" for i in range(14)]
fig, ax = plt.subplots(nrows=7, ncols=2, figsize=(14, 24))
# ax.flat walks the 7x2 grid row by row: cont0 top-left, cont1 top-right, ...
for column, axis in zip(cont_columns, ax.flat):
    sns.ecdfplot(x=train[column], ax=axis)

In [None]:
# Violin plot of the continuous features cont0..cont13, one panel per feature.
cont_columns = [f"cont{i}" for i in range(14)]
fig, ax = plt.subplots(nrows=7, ncols=2, figsize=(14, 24))
# ax.flat walks the 7x2 grid row by row: cont0 top-left, cont1 top-right, ...
for column, axis in zip(cont_columns, ax.flat):
    # The values go on the y axis so the violin is vertical, in agreement with
    # orient="v"; a lone vector supplied as `x` describes a horizontal plot.
    sns.violinplot(y=train[column], ax=axis, orient="v")

### Visualize the target variable with other features

In [None]:
# Box plot of the target against each categorical feature (cat0..cat9),
# one panel per feature.
cat_columns = [f"cat{i}" for i in range(10)]
fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(12, 16))
# ax.flat walks the 5x2 grid row by row: cat0 top-left, cat1 top-right, ...
for column, axis in zip(cat_columns, ax.flat):
    sns.boxplot(x=train[column], y=train["target"], ax=axis)

In [None]:
# Distribution of the target variable
# (histogram; the trailing semicolon keeps the notebook from echoing the
# returned Axes object)
sns.histplot(train["target"]);

Most of the target values lie between 6 and 10.


### Pearson correlation coefficient

In [None]:
# Select the continuous features by name (every column starting with "cont")
# rather than by position, so that all of cont0..cont13 are included.
cont_feature = train[[col for col in train.columns if col.startswith("cont")]]
# Pearson (linear) correlation matrix for this section; the Spearman matrix is
# computed separately in its own section.
cont_corr_p = cont_feature.corr(method="pearson")

In [None]:
# Heatmap of the correlation matrix stored in cont_corr_p.
sns.heatmap(cont_corr_p);

### Spearman correlation coefficient

In [None]:
# Spearman (rank) correlation matrix of the continuous features, then its heatmap.
cont_corr_sp = cont_feature.corr(method="spearman")
sns.heatmap(cont_corr_sp);

From the two heatmaps above, features cont5 to cont12 show some correlation with one another.

## Preprocessing

In [None]:
# Names of the ten categorical columns, cat0 through cat9.
categorical_features = [f"cat{idx}" for idx in range(10)]

## Mean Encoding
In mean (target) encoding, each category of a feature is replaced with the mean value of the target variable for that category, computed on the training data. This encoding brings out the relation between similar categories, but the relations it captures are limited to those between the categories and the target itself. Smoothing is one variation of mean encoding.

In [None]:
# def smoothing(train, test):
    
#     # compute the mean
#     mean = train["target"].mean()
#     for i in categorical_features:
#         agg = train.groupby(i)["target"].agg(["count", "mean"])
#         count = agg["count"]
#         mean = agg["mean"]
#         weight = 10
        
#         # smoothed mean
#         smooth = (count * mean + weight * mean) / (count * weight)
        
#         train[i] = train[i].map(smooth)
#         test[i] = test[i].map(smooth)
    
#     return train, test

In [None]:
# Integer-encode every categorical column.
# The encoder is fitted once per column on the values of train and test
# together and then applied to both frames, so a given category always maps
# to the same integer in train and in test. Fitting separately on each frame
# can assign different codes to the same category whenever the two frames do
# not contain exactly the same set of categories.
le = LabelEncoder()
for cat in categorical_features:
    le.fit(pd.concat([train[cat], test[cat]], ignore_index=True))
    train[cat] = le.transform(train[cat])
    test[cat] = le.transform(test[cat])

## Building the model

In [None]:
# Keep only the rows whose target is strictly greater than 4.0.
# NOTE(review): presumably this removes low-target outliers — confirm.
above_floor = train["target"].gt(4.0)
train = train.loc[above_floor]

In [None]:
# Target vector first, then the feature matrix: every column of the training
# frame except the target itself and the row identifier.
y = train["target"]
X = train.drop(columns=["target", "id"])

In [None]:
def K_fold_CV(X, y, model, params, folds=5):
    """Cross-validate a regressor with shuffled K-fold splits and report RMSE.

    For every fold a fresh ``model(**params)`` is fitted on the training part,
    with the held-out part passed as the early-stopping evaluation set. The
    RMSE on that same held-out part is printed and recorded.

    Note that the held-out fold is used both to stop training and to score the
    model, so the reported RMSE is not an independent estimate.

    Args:
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``; also indexed with ``.iloc``.
        model: Estimator class (not an instance) whose ``fit`` accepts
            ``eval_set``, ``early_stopping_rounds`` and ``verbose``.
        params: Keyword arguments used to construct each fold's estimator.
        folds: Number of K-fold splits.

    Returns:
        A ``(reg, mean_rmse)`` tuple: the estimator fitted on the LAST fold
        only (models from earlier folds are discarded) and the mean of the
        per-fold RMSE values.
    """
    rmse_error = []
    # Plain (non-stratified) K-fold; the fixed seed makes the splits repeatable.
    kfold = KFold(n_splits=folds, shuffle=True, random_state=42)
    for fold, (tr_idx, ts_idx) in enumerate(kfold.split(X, y)):
        print(f"Fold: {fold}")
        x_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        x_ts, y_ts = X.iloc[ts_idx], y.iloc[ts_idx]

        reg = model(**params)
        reg.fit(x_tr, y_tr,
                eval_set=[(x_ts, y_ts)],
                early_stopping_rounds=100,
                verbose=False)

        # RMSE computed as sqrt(MSE) so the code does not depend on the
        # `squared=` keyword, which newer scikit-learn releases no longer accept.
        error = float(np.sqrt(mean_squared_error(y_ts, reg.predict(x_ts))))
        rmse_error.append(error)
        print(f"RMSE Error: {error}")
        print("-"*25)

    return reg, np.mean(rmse_error)

## LightGBM

In [None]:
# Hyper-parameters for the first LightGBM run.
# NOTE(review): "early_stopping_round" is set here while K_fold_CV also passes
# early_stopping_rounds=100 to fit() — confirm which of the two LightGBM applies.
param_lgb = dict(
    # reproducibility / runtime
    random_state=2021,
    metric="rmse",
    n_jobs=-1,
    early_stopping_round=200,
    # regularisation and column sampling
    reg_alpha=9.03513073170552,
    reg_lambda=0.024555737897445917,
    colsample_bytree=0.2185112060137363,
    # tree shape and boosting schedule
    learning_rate=0.003049106861273527,
    max_depth=65,
    num_leaves=51,
    min_child_samples=177,
    n_estimators=1600000,  # very large cap; training relies on early stopping
    # categorical handling and binning
    cat_smooth=93.60968300634175,
    max_bin=537,
    min_data_per_group=117,
    # row sampling
    bagging_freq=1,
    bagging_fraction=0.6709049555262285,
    cat_l2=7.5586732660804445,
)

In [None]:
%%time
# 7-fold CV with LightGBM; keeps the model returned by K_fold_CV and the mean RMSE.
model_lgb, lgb_error = K_fold_CV(X, y, LGBMRegressor, params=param_lgb, folds=7)

In [None]:
# Mean cross-validated RMSE of the LightGBM run.
lgb_error

In [None]:
# Plot the feature importances of the LightGBM model returned by the CV run
lgb.plot_importance(model_lgb)

### CatBoostRegressor

In [None]:
# Hyper-parameters for the CatBoost run (configured for GPU device "0").
param_cbr = dict(
    # objective and tree shape
    loss_function="RMSE",
    max_depth=4,
    learning_rate=0.03,
    # sampling
    bootstrap_type="Poisson",
    subsample=0.8,
    # binning and regularisation
    border_count=512,
    l2_leaf_reg=200,
    # reproducibility / runtime
    random_state=42,
    thread_count=2,
    num_boost_round=50000,
    task_type="GPU",
    devices="0",
)

In [None]:
%%time
# 7-fold CV with CatBoost; keeps the model returned by K_fold_CV and the mean RMSE.
cbr, cbr_error = K_fold_CV(X, y, CatBoostRegressor, params=param_cbr, folds=7)

In [None]:
# Mean cross-validated RMSE of the CatBoost run.
cbr_error

In [None]:
# Bar chart of the CatBoost feature importances.
# Bars are labelled with the training column names instead of bare integer
# positions, so each bar can be traced back to its feature.
plt.bar(X.columns, cbr.feature_importances_)
plt.xticks(rotation=90)  # column names overlap when drawn horizontally
plt.show()

### XGBRegressor

In [None]:
# Hyper-parameters for the XGBoost run (GPU histogram tree method, device 0).
# NOTE(review): both "learning_rate" and "eta" are given with different values;
# XGBoost documents "eta" as an alias of "learning_rate" — confirm which one
# takes effect and drop the other.
# NOTE(review): "scale_pos_weight" looks unrelated to a squared-error
# regression objective — confirm it is needed.
param_xgb = dict(
    booster="gbtree",
    n_estimators=2328,
    learning_rate=0.014,
    max_depth=7,
    eta=0.008,
    gamma=3.5,
    objective="reg:squarederror",
    verbosity=0,
    # row / column sampling
    subsample=0.7,
    colsample_bytree=0.3,
    # regularisation
    reg_lambda=0.012137972665213192,
    reg_alpha=0.627569414718695,
    scale_pos_weight=1,
    min_child_weight=4.635026563569194,
    # evaluation and hardware
    eval_metric="rmse",
    tree_method="gpu_hist",
    gpu_id=0,
)

In [None]:
%%time
# 7-fold CV with XGBoost; keeps the model returned by K_fold_CV and the mean RMSE.
model_xgb, xgb_error = K_fold_CV(X, y, XGBRegressor, params=param_xgb, folds=7)

In [None]:
# Mean cross-validated RMSE of the XGBoost run.
xgb_error

In [None]:
# Plot the feature importances of the XGBoost model returned by the CV run.
xgb.plot_importance(model_xgb)

## Hyperparameter tuning using Optuna

In [None]:
# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# def objective(trial):
    
#     param = {
#         'random_state': 42,
#           'metric': 'rmse',
#           'n_estimators': trial.suggest_int("n_estimators", 100, 3000),
#           'n_jobs': -1,
#           'cat_feature': [x for x in range(len(categorical_features))],
#           'bagging_seed': 1024,
#           'feature_fraction_seed': 2024,
#           'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2),
#           'max_depth': trial.suggest_int('max_depth', 6, 127),
#           'num_leaves': trial.suggest_int('num_leaves', 31, 128),
#           'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
#           'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
#           'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
#           'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
#           'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
#           'subsample': trial.suggest_float('subsample', 0.3, 0.9),
#           'max_bin': trial.suggest_int('max_bin', 128, 1024),
#           'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
#           'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
#           'cat_l2': trial.suggest_int('cat_l2', 1, 20)
#     }
#     model = LGBMRegressor(**param)  
#     model.fit(x_train,y_train,eval_set=[(x_test,y_test)],early_stopping_rounds=100,verbose=False)
#     preds = model.predict(x_test, num_iteration=model.best_iteration_)
#     rmse = mean_squared_error(y_test, preds,squared=False)
    
#     return rmse

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=40)
# param_xgb = study.best_params

In [None]:
# Hyper-parameters for the second LightGBM run.
# NOTE(review): presumably the best values found by the Optuna study that is
# commented out above — confirm.
params = dict(
    # reproducibility / runtime
    random_state=2021,
    metric="rmse",
    n_jobs=-1,
    # boosting schedule and tree shape
    n_estimators=1544,
    learning_rate=0.009664135308408987,
    max_depth=15,
    num_leaves=119,
    # regularisation and sampling
    reg_alpha=3.966388879825433,
    reg_lambda=0.7124404813240152,
    colsample_bytree=0.2941761921420618,
    min_child_samples=177,
    subsample_freq=5,
    subsample=0.8282629920254261,
    # binning and categorical handling
    max_bin=322,
    min_data_per_group=110,
    cat_smooth=23,
    cat_l2=4,
)

In [None]:
%%time
# 7-fold CV with LightGBM using the second parameter set (`params`).
model_lgb_new, error = K_fold_CV(X, y, LGBMRegressor, params=params, folds=7)

In [None]:
# Mean cross-validated RMSE of the second LightGBM run.
print(error)

## Weighted Average Ensemble

In [None]:
# Load the sample submission; its "target" column is overwritten below for
# every output file.
submission = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")

In [None]:
# Predict on the test set with each base model.
# The test frame is restricted to the training feature columns, in the same
# order, once up front and reused for all three models. Each model here is the
# one K_fold_CV returned, i.e. the estimator fitted on the last CV fold.
test_features = test[X.columns]
pred_lgb = model_lgb.predict(test_features)
pred_cbr = cbr.predict(test_features)
pred_xgb = model_xgb.predict(test_features)

In [None]:
# Average ensemble: unweighted mean of the three base-model predictions.
pred_total = pred_lgb + pred_cbr + pred_xgb
pred_avg = pred_total / 3

In [None]:
# Weighted average ensembles; the weights of each blend sum to 1.
#   pred_1: LightGBM 0.5, CatBoost 0.1, XGBoost 0.4
#   pred_2: LightGBM 0.6, CatBoost 0.2, XGBoost 0.2
#   pred_3: LightGBM 0.4, XGBoost 0.6 (CatBoost left out)
pred_1 = pred_lgb * 0.5 + pred_cbr * 0.1 + pred_xgb * 0.4
pred_2 = pred_lgb * 0.6 + pred_cbr * 0.2 + pred_xgb * 0.2
pred_3 = pred_lgb * 0.4 + pred_xgb * 0.6

In [None]:
# Create submission files
# Each pair is (output file name, prediction vector). The files are written in
# this order, so `submission["target"]` ends up holding the CatBoost
# predictions after the loop.
submission_outputs = [
    ("Average_ensemble.csv", pred_avg),    # average ensemble
    ("weighted_ensemble_1.csv", pred_1),   # weighted ensembles
    ("weighted_ensemble_2.csv", pred_2),
    ("weighted_ensemble_3.csv", pred_3),
    ("XGB.csv", pred_xgb),                 # individual base models
    ("LGB.csv", pred_lgb),
    ("CBR.csv", pred_cbr),
]
for file_name, prediction in submission_outputs:
    submission["target"] = prediction
    submission.to_csv(file_name, index=False)