# Tabular Playground Series (March 2021)

In [None]:
# Import the required tools
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import plot_confusion_matrix, roc_auc_score, plot_roc_curve, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

import lightgbm
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

import optuna
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition train and test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/test.csv")

In [None]:
# Preview the training frame (the notebook renders the cell's last expression).
train

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

## Data Visualization

In [None]:
# List the column names; the positional slices used for the feature lists depend on this order.
train.columns

In [None]:
# Split the columns by position: columns 1..19 are treated as categorical,
# columns 20 up to (but excluding) the last one as continuous.
cat_features = train.columns[1:20].tolist()
cont_features = train.columns[20:-1].tolist()

In [None]:
def visualize_features(seaborn_plot, features:list, num_rows, num_cols, fig_size:tuple, df=None, **kwargs):
    """Draw one seaborn plot per feature on a ``num_rows`` x ``num_cols`` grid.

    Args:
        seaborn_plot: Callable invoked as ``seaborn_plot(x=<column>, **kwargs)``;
            it is expected to draw on the current matplotlib axes.
        features: Column names to plot, one subplot each, filled row by row.
        num_rows: Number of subplot rows.
        num_cols: Number of subplot columns.
        fig_size: Figure size forwarded to ``plt.subplots(figsize=...)``.
        df: Frame to read the columns from. Defaults to the module-level
            ``train`` frame when omitted.
        **kwargs: Extra keyword arguments forwarded to ``seaborn_plot``.

    Raises:
        ValueError: If there are more features than grid cells.
    """
    source = train if df is None else df

    # squeeze=False always yields a 2-D axes array, so grids with a single
    # row or column work the same way as larger ones.
    fig, ax = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=fig_size, squeeze=False)
    axes = ax.ravel()

    if len(features) > len(axes):
        raise ValueError(
            f"{len(features)} features do not fit in a {num_rows}x{num_cols} grid"
        )

    for axis, name in zip(axes, features):
        # Make the target axes current so the plotting function draws on it.
        plt.sca(axis)
        seaborn_plot(x=source[name], **kwargs)

    # Remove only the cells that received no plot (none when the grid is full).
    for axis in axes[len(features):]:
        fig.delaxes(axis)
    plt.show()

### Continuous Features

In [None]:
# Histogram with a KDE overlay for every continuous feature (6x2 grid).
visualize_features(sns.histplot, cont_features, 6, 2, (14, 25), kde=True)

In [None]:
# Empirical CDF plot for every continuous feature (6x2 grid).
visualize_features(sns.ecdfplot, cont_features, 6, 2, (14, 25))

In [None]:
# Violin plot for every continuous feature (6x2 grid).
visualize_features(sns.violinplot, cont_features, 6, 2, (14, 25))

### Categorical features

## Check for correlation

In [None]:
# Pairwise correlation between the continuous columns (same positional slice
# as cont_features). NOTE: despite the "_p" suffix, the method used here is
# Spearman rank correlation, not Pearson.
cont_feat = train.iloc[:, 20:-1]
cont_corr_p = cont_feat.corr(method="spearman")
# Trailing semicolon suppresses the echoed return value in the notebook.
sns.heatmap(cont_corr_p);

## Preprocessing

### Mean Encoding

In mean (target) encoding, each category of a feature is replaced by the mean value of the target variable for that category, computed on the training data. This encoding brings out the relationship between similar categories, but the connection it captures is limited to the category and the target itself. Smoothing is one variation of mean encoding: it pulls each category's mean toward the overall target mean, more strongly when the category has few samples.

In [None]:
def smoothing(train, test, features=None, weight=10):
    """Replace categorical columns with smoothed target-mean encodings.

    For every category the encoded value is

        (count * category_mean + weight * global_mean) / (count + weight)

    so categories with few rows are pulled toward the global target mean.
    The encoding is learned on ``train`` only and applied to both frames.

    Args:
        train: Training frame containing the feature columns and ``"target"``.
        test: Frame containing the same feature columns.
        features: Columns to encode. Defaults to the module-level
            ``cat_features`` list when omitted.
        weight: Smoothing strength, i.e. the number of pseudo-observations
            of the global mean blended into each category.

    Returns:
        The ``(train, test)`` pair. Both frames are modified in place, as
        before, and returned for convenience.
    """
    columns = cat_features if features is None else features

    # Overall target mean; kept under its own name so the per-category mean
    # computed in the loop cannot overwrite it.
    global_mean = train["target"].mean()

    for col in columns:
        stats = train.groupby(col)["target"].agg(["count", "mean"])
        count = stats["count"]
        category_mean = stats["mean"]

        # Smoothed mean: weighted average of the category and global means.
        smooth = (count * category_mean + weight * global_mean) / (count + weight)

        # Categories without an encoding (unseen in train, or missing)
        # fall back to the global mean instead of becoming NaN.
        train[col] = train[col].map(smooth).fillna(global_mean)
        test[col] = test[col].map(smooth).fillna(global_mean)

    return train, test

In [None]:
# Encode the categorical columns of both frames using statistics from train.
train, test = smoothing(train, test)

## Modelling

In [None]:
# Check for class imbalance in the target.
# The vector is passed as `x=` explicitly: seaborn deprecated positional
# vectors and newer releases read the first positional argument as `data`.
sns.countplot(x=train["target"])

In [None]:
# Target vector and feature matrix (everything except the id and the target).
y = train["target"]
X = train.drop(columns=["id", "target"])

In [None]:
# Hold-out split: stratified on the target so both parts keep the class
# ratio, and seeded so reruns of the notebook produce the same split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
def K_fold_CV(X, y, model, params, folds=5):
    """Cross-validate a classifier with stratified folds and report ROC AUC.

    Args:
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        model: Estimator class, instantiated as ``model(**params)`` per fold.
            Its ``fit`` must accept ``eval_set``, ``early_stopping_rounds``
            and ``verbose``.
        params: Keyword arguments for the estimator constructor.
        folds: Number of stratified folds.

    Returns:
        Tuple ``(clf, mean_score)``: the estimator fitted on the last fold
        and the mean ROC AUC over all folds.
    """
    # Stratified splitting keeps the class proportions in every fold.
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    fold_scores = []
    separator = "-" * 25

    for fold_no, (fit_rows, val_rows) in enumerate(splitter.split(X, y)):
        print(f"Fold: {fold_no}")
        x_fit, x_val = X.iloc[fit_rows], X.iloc[val_rows]
        y_fit, y_val = y.iloc[fit_rows], y.iloc[val_rows]

        clf = model(**params)
        # The validation fold doubles as the early-stopping evaluation set.
        clf.fit(
            x_fit,
            y_fit,
            eval_set=[(x_val, y_val)],
            early_stopping_rounds=100,
            verbose=False,
        )

        positive_proba = clf.predict_proba(x_val)[:, 1]
        score = roc_auc_score(y_val, positive_proba)
        fold_scores.append(score)
        print(f"ROC AUC Score: {score}")
        print(separator)

    return clf, np.mean(fold_scores)

### LGBMClassifier

In [None]:
# Constructor keyword arguments for LGBMClassifier (unpacked inside K_fold_CV).
# NOTE(review): the long decimal values look like tuner output, presumably
# from Optuna, which is imported above; confirm.
# NOTE(review): 'device': 'gpu' presumably needs a GPU-enabled LightGBM build; confirm.
lgb_params = {
    'reg_alpha': 4.203457823159052, 
    'reg_lambda': 6.34173530304477, 
    'num_leaves': 148,
    'min_child_samples': 55, 
    'max_depth': 16, 
    'learning_rate': 0.01, 
    'colsample_bytree': 0.22290988791359692,
    'n_estimators': 2703, 
    'cat_smooth': 37, 
    'cat_l2': 10, 
    'min_data_per_group': 97, 
    'device': 'gpu',
    'random_state': 26, 
    'n_jobs': -1, 
    'boosting_type': 'gbdt', 
    'metric': 'AUC'}

In [None]:
# 7-fold CV for LightGBM; `clf` is the model from the last fold, `score` the mean ROC AUC.
clf, score = K_fold_CV(X, y, LGBMClassifier, lgb_params, 7)

In [None]:
# Mean ROC AUC across the LightGBM folds.
print(score)

### CatBoost Classifier

In [None]:
# Constructor keyword arguments for CatBoostClassifier (unpacked inside K_fold_CV).
# NOTE(review): "task_type": "GPU" with "devices": "0" presumably requires a
# GPU runtime; confirm before running on CPU.
cb_params = {
    "verbose":0,
    "eval_metric":"AUC",
    "loss_function":"Logloss",
    "random_state":2021,
    "num_boost_round":20000,
    "od_type":"Iter",
    "od_wait":200,
    "task_type":"GPU",
    "devices":"0",
    "bagging_temperature":1.288692494969795,
    "grow_policy":"Depthwise",
    "l2_leaf_reg":9.847870133539244,
    "learning_rate":0.01877982653902465,
    "max_depth":8,
    "min_data_in_leaf":1,
    "penalties_coefficient":2.1176668909602734,
}

In [None]:
# 7-fold CV for CatBoost; `clf_cb` is the model from the last fold, `cb_score` the mean ROC AUC.
clf_cb, cb_score = K_fold_CV(X, y, CatBoostClassifier, cb_params, 7)

In [None]:
# Mean ROC AUC across the CatBoost folds.
print(cb_score)

### XGBClassifier

In [None]:
# Constructor keyword arguments for XGBClassifier (unpacked inside K_fold_CV).
# NOTE(review): "tree_method": "gpu_hist" with "gpu_id": 0 presumably requires
# a GPU runtime; confirm before running on CPU.
xgb_param = {
    "seed":42,
    "n_estimators":10000,
    "verbosity":1,
    "eval_metric":"auc",
    "tree_method":"gpu_hist",
    "gpu_id":0,
    "alpha":7.105038963844129,
    "colsample_bytree":0.25505629740052566,
    "gamma":0.4999381950212869,
    "reg_lambda":1.7256912198205319,
    "learning_rate":0.011823142071967673,
    "max_bin":338,
    "max_depth":8,
    "min_child_weight":2.286836198630466,
    "subsample":0.618417952155855
}

In [None]:
# 7-fold CV for XGBoost; `clf_xgb` is the model from the last fold, `xgb_score` the mean ROC AUC.
clf_xgb, xgb_score = K_fold_CV(X, y, XGBClassifier, xgb_param, 7)

In [None]:
# Mean ROC AUC across the XGBoost folds.
print(xgb_score)

In [None]:
# # train predictions
# train_pred_lgbm = clf.predict_proba(X)
# train_pred_cb = clf_cb.predict_proba(X)
# train_pred_xgb = clf_xgb.predict_proba(X)

# # test predictions
# test_pred_lgbm = clf.predict_proba(test[X.columns])
# test_pred_cb = clf_cb.predict_proba(test[X.columns])
# test_pred_xgb = clf_xgb.predict_proba(test[X.columns])

## Pseudo Labelling

In [None]:
# Work on a copy so the pseudo-label column is not added to `test` itself.
test_lgb = test.copy()

In [None]:
# Positive-class probability for every test row, from the last-fold LightGBM
# model (`clf`), stored as a provisional "target" column.
pred_lgb = clf.predict_proba(test_lgb[X.columns])[:, 1]
test_lgb["target"] = pred_lgb

In [None]:
# Keep only the test rows the model is very sure about (probability <= 0.01
# or >= 0.99), then round those probabilities to hard 0/1 pseudo-labels.
test2_lgb = test_lgb[ (test_lgb['target']<=0.01) | (test_lgb['target']>=0.99) ].copy()
test2_lgb.loc[ test2_lgb['target']>=0.5, 'target' ] = 1
test2_lgb.loc[ test2_lgb['target']<0.5, 'target' ] = 0 

In [None]:
# Inspect the pseudo-labelled test rows.
test2_lgb

In [None]:
# Cast the pseudo-labels to integers.
test2_lgb["target"] = test2_lgb["target"].astype('int')

In [None]:
# Stack the pseudo-labelled test rows underneath the training rows.
# Index labels are not reset here; K_fold_CV selects rows positionally.
train_lgb = pd.concat([train, test2_lgb])

In [None]:
# Target and feature matrix for the augmented (train + pseudo-labelled) data.
y_lgb = train_lgb["target"]
x_lgb = train_lgb.drop(columns=["id", "target"])

In [None]:
# Check the feature columns of the augmented data.
x_lgb.columns

In [None]:
# Re-run the 7-fold LightGBM CV on the augmented data.
# NOTE: `clf2_error` holds the mean ROC AUC returned by K_fold_CV (higher is
# better), not an error value. The validation folds also contain
# pseudo-labelled rows, so this score is not directly comparable to `score`.
clf2_lgbm, clf2_error  = K_fold_CV(x_lgb, y_lgb, LGBMClassifier, lgb_params, 7)

## Make submission

In [None]:
# Positive-class probabilities on the test set from each model. Every model
# here is the one K_fold_CV returned, i.e. the estimator fitted on the last
# fold only.
pred_lgb = clf.predict_proba(test[X.columns])[:, 1]
pred_lgb2 = clf2_lgbm.predict_proba(test[X.columns])[:, 1]
pred_xgb = clf_xgb.predict_proba(test[X.columns])[:, 1]
pred_cb = clf_cb.predict_proba(test[X.columns])[:, 1]

In [None]:
# Blending: equal-weight averages of the base LightGBM predictions with the
# pseudo-labelled LightGBM, XGBoost and CatBoost predictions respectively.
pred_blend1 = 0.5 * pred_lgb + 0.5 * pred_lgb2
pred_blend2 = 0.5 * pred_lgb + 0.5 * pred_xgb
pred_blend3 = 0.5 * pred_lgb + 0.5 * pred_cb

In [None]:
# Read the submission template from the same absolute Kaggle input directory
# as the train/test files, so the path does not depend on the working directory.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv")

In [None]:
# Write one submission file per model / blend, reusing the same template
# frame and overwriting its "target" column each time.
outputs = (
    ("LGB.csv", pred_lgb),         # LGB
    ("LGB2.csv", pred_lgb2),       # LGB2 (trained with pseudo-labels)
    ("XGB.csv", pred_xgb),         # XGB
    ("CB.csv", pred_cb),           # CB
    ("Blend1.csv", pred_blend1),   # Blend 1
    ("Blend2.csv", pred_blend2),   # Blend 2
    ("Blend3.csv", pred_blend3),   # Blend 3
)
for csv_name, predictions in outputs:
    submission["target"] = predictions
    submission.to_csv(csv_name, index=False)