# Meta model building 
The following models are used as base learners for building the meta model:
1. XGBoost
2. CatBoost
3. LGBM

### I have created a dataset with a 5-fold split.
### Check it out here -> https://www.kaggle.com/tharunreddy/tpsnovember-5-fold-data-split

In [None]:
# imports
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Folder holding the competition CSV files; the read_csv calls build their paths from it
input_path = Path('/kaggle/input/tabular-playground-series-nov-2021/')
# Show the entries found in that folder
list(input_path.iterdir())

In [None]:
# Pre-built copy of the training rows that carries a fold assignment (fold_no)
train_5fold_df = pd.read_csv('/kaggle/input/tpsnovember-5-fold-data-split/train_df_5fold.csv')

# Competition files, all read from input_path
train_df, test_df, submission_df = (
    pd.read_csv(input_path/csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

train_df.shape, train_5fold_df.shape, test_df.shape

## Data sanity check

In [None]:
# total number of missing cells in train and in test
tuple(frame.isnull().sum().sum() for frame in (train_df, test_df))

In [None]:
# number of rows whose feature values repeat an earlier row (id and target ignored)
int(train_df.drop(columns=['id', 'target']).duplicated().sum())

## EDA

1. All the variables are said to be continuous.
2. The target variable is binary.

In [None]:
# keep the test ids and the training labels at hand as standalone Series
test_id = test_df['id']
train_target = train_df['target']
# 'id' and 'target' deliberately stay in train_df: later cells drop them on the fly
# and get_valid_test_preds reads both columns

In [None]:
# Bar chart of how many training rows fall into each target class
train_target_counts = train_target.value_counts()
labels, counts = train_target_counts.index, train_target_counts.values

fig, ax = plt.subplots()
ax.bar(labels, counts, width=0.4)
ax.set_xticks(labels)  # one tick per class label
plt.show()

# XGBoost model build

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, auc, roc_auc_score

In [None]:
# Stratified 75/25 split of the training data; the 25% part is the hold-out
# that the optuna objectives score against
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['id', 'target']),
    train_target,
    test_size=0.25,
    random_state=13,
    stratify=train_target,
)
X_train.shape, X_test.shape

In [None]:
def objective(trial):
    """
    Optuna objective for the XGBoost classifier.

    Draws one hyper-parameter set from `trial`, fits one model per stratified
    fold of the global X_train / y_train (the fold's left-out part is the
    early-stopping eval set), averages the fold models' class-1 probabilities
    on the global X_test and returns the ROC AUC of that average against y_test.
    """
    params = {
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 1e-8, 1., log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9, step=2),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'alpha': trial.suggest_float('alpha', 1e-8, 1., log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.),
        'use_label_encoder': False,
    }

    splitter = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
    holdout_probs = []  # one vector of X_test probabilities per fold model

    for fit_rows, stop_rows in splitter.split(X_train, y_train):
        fold_model = XGBClassifier(**params)
        fold_model.fit(
            X_train.iloc[fit_rows],
            y_train.iloc[fit_rows],
            eval_metric='auc',
            eval_set=[(X_train.iloc[stop_rows], y_train.iloc[stop_rows])],
            verbose=0,
            early_stopping_rounds=100,
        )
        holdout_probs.append(fold_model.predict_proba(X_test)[:, 1])

    # score the averaged predictions of the five fold models on the hold-out set
    return roc_auc_score(y_test, np.mean(holdout_probs, axis=0))

In [None]:
# Maximise the objective's return value; the run is capped at 50 trials and a 600 s timeout
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    timeout=600,
    n_trials=50,
)

In [None]:
# keep the winning XGBoost parameters before `study` is reused for the next model
xgboost_best_params = study.best_params
print(f'Best score: {study.best_value}')
print(f'Best params: {study.best_params}')

In [None]:
def get_valid_test_preds(model, best_params):
    """
    Build one model per fold, predict on that fold's hold-out rows and on the test set,
    using parameters studied from optuna.

    Reads the global frames train_df, train_5fold_df, test_df and the global Series test_id.

    params:
    model(str): which booster to train - 'xgboost', 'lightgbm' or 'catboost'
    best_params(dict): keyword arguments forwarded to the booster's constructor
    return:
    valid_preds(pd dataframe): All the hold-out dataset predictions, concatenated (columns 'id', 'target')
    test_preds(pd dataframe): Mean of Test predictions by all the models (columns 'id', 'target')
    raises:
    ValueError: if `model` is not one of the supported names
    """
    supported_models = ('xgboost', 'lightgbm', 'catboost')
    # reject unknown names before any work is done; otherwise no branch below would
    # create `booster` and the failure would surface as a NameError
    if model not in supported_models:
        raise ValueError(f'Unknown model {model!r}; expected one of {supported_models}')

    # save X_test predictions by 'id'
    X_test_preds_final = dict()
    # save actual test_preds
    test_preds_final = []
    n_folds = 5
    # the test features are the same for every fold, so build them once
    test_features = test_df.drop('id', axis=1)

    for fold_no in range(n_folds):
        # NOTE(review): rows are matched through the index, which assumes train_5fold_df
        # has the same row index as train_df - confirm against the fold dataset

        # X_train data prep
        X_train_ix = train_5fold_df[train_5fold_df.fold_no != fold_no].index
        fold_train = train_df.loc[X_train_ix, :]
        y_train = fold_train.target
        # drop id & target into a new frame instead of editing a slice of train_df in place
        X_train = fold_train.drop(['id', 'target'], axis=1)

        # X_test data prep
        X_test_ix = train_5fold_df[train_5fold_df.fold_no == fold_no].index
        fold_holdout = train_df.loc[X_test_ix, :]
        y_test = fold_holdout.target
        X_test_ids = fold_holdout.id  # save ids of each test id in each fold
        X_test = fold_holdout.drop(['id', 'target'], axis=1)

        # train model
        if model == 'xgboost':
            booster = XGBClassifier(**best_params, use_label_encoder=False, tree_method='gpu_hist',
                                    objective='binary:logistic', eval_metric='auc')
            booster.fit(X_train, y_train, eval_metric='auc', eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=100)
        elif model == 'lightgbm':
            # 'metric' is the model parameter; 'eval_metric' belongs to fit().
            # Values supplied in best_params take precedence over these defaults.
            # NOTE(review): the LGBM tuning objective also set device='gpu'; it is not set here - confirm
            lgbm_params = {'objective': 'binary', 'metric': 'auc', **best_params}
            booster = LGBMClassifier(**lgbm_params)
            booster.fit(X_train, y_train, eval_metric='auc', eval_set=[(X_test, y_test)], verbose=0,
                        early_stopping_rounds=100)
        else:  # 'catboost'
            booster = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric='AUC')
            booster.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

        # predict on X_test (class-1 probability)
        X_test_preds = booster.predict_proba(X_test)[:, 1]
        X_test_preds_final.update(zip(X_test_ids, X_test_preds))
        print(f'Fold no: {fold_no}: roc_score: {roc_auc_score(y_test, X_test_preds)}')

        # predict on test_df
        test_preds_final.append(booster.predict_proba(test_features)[:, 1])

    # average the per-fold test predictions
    test_preds_final = np.mean(test_preds_final, axis=0)
    X_test_preds_df = pd.DataFrame.from_dict(X_test_preds_final, orient='index').reset_index()
    X_test_preds_df.columns = ['id', 'target']
    test_preds_df = pd.DataFrame()
    test_preds_df['id'] = test_id
    test_preds_df['target'] = test_preds_final
    return X_test_preds_df, test_preds_df

In [None]:
# out-of-fold and averaged test predictions of the tuned XGBoost model
xgboost_valid_df, xgboost_test_df = get_valid_test_preds(
    model='xgboost',
    best_params=xgboost_best_params,
)
xgboost_valid_df.shape, xgboost_test_df.shape

# LGBM model build

In [None]:
def objective(trial):
    """
    Optuna objective for the LGBM classifier.

    Draws one hyper-parameter set from `trial`, fits one model per stratified
    fold of the global X_train / y_train (the fold's left-out part is the
    early-stopping eval set), averages the fold models' class-1 probabilities
    on the global X_test and returns the ROC AUC of that average against y_test.
    """
    params = {
        'objective': 'binary',
        'device': 'gpu',
        'metric': 'auc',
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'subsample': trial.suggest_float('subsample', 0.2, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.95),
    }

    splitter = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
    holdout_probs = []  # one vector of X_test probabilities per fold model

    for fit_rows, stop_rows in splitter.split(X_train, y_train):
        fold_model = LGBMClassifier(**params)
        fold_model.fit(
            X_train.iloc[fit_rows],
            y_train.iloc[fit_rows],
            eval_metric='auc',
            early_stopping_rounds=30,
            eval_set=[(X_train.iloc[stop_rows], y_train.iloc[stop_rows])],
            verbose=0,
        )
        holdout_probs.append(fold_model.predict_proba(X_test)[:, 1])

    # score the averaged predictions of the five fold models on the hold-out set
    return roc_auc_score(y_test, np.mean(holdout_probs, axis=0))

In [None]:
# Maximise the objective's return value; the run is capped at 50 trials and a 600 s timeout
study = optuna.create_study(study_name='lgbm', direction='maximize')
study.optimize(
    objective,
    timeout=600,
    n_trials=50,
)

In [None]:
# keep the winning LGBM parameters before `study` is reused for the next model
lgbm_best_params = study.best_params
print(f'Best study score: {study.best_value}')
print(f'Best params: {study.best_params}')

In [None]:
# out-of-fold and averaged test predictions of the tuned LGBM model
lgbm_valid_df, lgbm_test_df = get_valid_test_preds(
    model='lightgbm',
    best_params=lgbm_best_params,
)
lgbm_valid_df.shape, lgbm_test_df.shape

# CatBoost model build

In [None]:
def objective(trial):
    """
    Optuna objective for the CatBoost classifier.

    Draws one hyper-parameter set from `trial`, fits one model per stratified
    fold of the global X_train / y_train (the fold's left-out part is passed as
    eval set), averages the fold models' class-1 probabilities on the global
    X_test and returns the ROC AUC of that average against y_test.
    """
    params = {
        'objective': 'Logloss',
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, .1),
        'depth': trial.suggest_int('depth', 1, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'iterations': trial.suggest_int('iterations', 50, 300),
    }

    splitter = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
    holdout_probs = []  # one vector of X_test probabilities per fold model

    for fit_rows, eval_rows in splitter.split(X_train, y_train):
        fold_model = CatBoostClassifier(**params, eval_metric='AUC')
        fold_model.fit(
            X_train.iloc[fit_rows],
            y_train.iloc[fit_rows],
            eval_set=[(X_train.iloc[eval_rows], y_train.iloc[eval_rows])],
            verbose=0,
        )
        holdout_probs.append(fold_model.predict_proba(X_test)[:, 1])

    # score the averaged predictions of the five fold models on the hold-out set
    return roc_auc_score(y_test, np.mean(holdout_probs, axis=0))

In [None]:
# Maximise the objective's return value; the run is capped at 50 trials and a 600 s timeout
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    timeout=600,
    n_trials=50,
)

In [None]:
# keep the winning CatBoost parameters before `study` is reused for the meta model
catboost_best_params = study.best_params
print(f'Best score: {study.best_value}')
print(f'Best params: {study.best_params}')

In [None]:
# out-of-fold and averaged test predictions of the tuned CatBoost model
cb_valid_df, cb_test_df = get_valid_test_preds(
    model='catboost',
    best_params=catboost_best_params,
)
cb_valid_df.shape, cb_test_df.shape

# Concatenate the predictions from xgboost, lgbm, catboost

In [None]:
# peek at the layout of one prediction frame (columns 'id', 'target') before merging
xgboost_valid_df.head()

In [None]:
# concatenate valid features: one row per id, one column per base model.
# Each 'target' column is renamed up front so the result does not rely on the order
# of pandas' automatic suffixes; sort=True (a bool, as merge expects) orders rows by id.
valid_features_df = (
    xgboost_valid_df.rename(columns={'target': 'xgb'})
    .merge(lgbm_valid_df.rename(columns={'target': 'lgbm'}), on='id', how='inner')
    .merge(cb_valid_df.rename(columns={'target': 'cb'}), on='id', how='inner', sort=True)
)
valid_features_df.head()

In [None]:
# concatenate test features: one row per id, one column per base model.
# Each 'target' column is renamed up front so the result does not rely on the order
# of pandas' automatic suffixes; sort=True (a bool, as merge expects) orders rows by id.
test_features_df = (
    xgboost_test_df.rename(columns={'target': 'xgb'})
    .merge(lgbm_test_df.rename(columns={'target': 'lgbm'}), on='id', how='inner')
    .merge(cb_test_df.rename(columns={'target': 'cb'}), on='id', how='inner', sort=True)
)
test_features_df.head()

## Meta model building

In [None]:
def objective(trial):
    """
    Objective function to tune the LGBM meta model.

    The meta model is trained on the base models' hold-out predictions
    (valid_features_df columns other than 'id'). The stacked rows are split 75/25;
    one model is fitted per stratified fold of the 75% part, the fold models'
    class-1 probabilities on the 25% part are averaged and the ROC AUC of that
    average is returned.
    """
    params = {
        'objective': 'binary',
        'device': 'gpu',
        'metric': 'auc',
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'subsample': trial.suggest_float('subsample', 0.2, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.95)
    }

    # KFold split
    skf = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
    fold_preds = []

    # Pair every stacked row with its label through 'id' rather than by row position:
    # valid_features_df is ordered by the merge, train_df by the CSV, and the two
    # orders only coincide when train.csv happens to be sorted by id.
    meta_features = valid_features_df.drop(['id'], axis=1)
    meta_target = (
        train_df.set_index('id')['target']
        .loc[valid_features_df['id'].to_numpy()]
        .reset_index(drop=True)
    )

    X_train, X_test, y_train, y_test = train_test_split(meta_features, meta_target, test_size=0.25,
                                                        stratify=meta_target, random_state=13)

    for train_ix, test_ix in skf.split(X_train, y_train):
        X_train_k, X_test_k = X_train.iloc[train_ix], X_train.iloc[test_ix]
        y_train_k, y_test_k = y_train.iloc[train_ix], y_train.iloc[test_ix]
        booster = LGBMClassifier(**params)
        # the fold's left-out part drives early stopping
        booster.fit(X_train_k, y_train_k, eval_metric='auc', early_stopping_rounds=30, eval_set=[(X_test_k, y_test_k)], verbose=0)
        fold_preds.append(booster.predict_proba(X_test)[:, 1])

    # score the averaged fold predictions on the 25% hold-out part
    return roc_auc_score(y_test, np.mean(fold_preds, axis=0))

In [None]:
# Maximise the objective's return value; the run is capped at 50 trials and a 600 s timeout
study = optuna.create_study(study_name='lgbm_meta_model', direction='maximize')
study.optimize(
    objective,
    timeout=600,
    n_trials=50,
)

In [None]:
# best hyper-parameters found for the meta model; the bare name below displays them
best_meta_params = study.best_params
best_meta_params

In [None]:
# Fit the tuned meta model on each stratified fold of the stacked features and
# collect its class-1 probabilities for the stacked test features.
test_df_preds = []
skf = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)

# built once instead of on every iteration
meta_features = valid_features_df.drop('id', axis=1)
meta_test_features = test_features_df.drop('id', axis=1)
# Pair every stacked row with its label through 'id' rather than by row position:
# the two frames only share a row order when train.csv happens to be sorted by id.
meta_target = (
    train_df.set_index('id')['target']
    .loc[valid_features_df['id'].to_numpy()]
    .reset_index(drop=True)
)

# meta_* names keep the global X_train / X_test / y_train / y_test (read by the
# base-model objective functions) untouched
for train_ix, test_ix in skf.split(meta_features, meta_target):
    meta_X_train, meta_X_valid = meta_features.iloc[train_ix], meta_features.iloc[test_ix]
    meta_y_train, meta_y_valid = meta_target.iloc[train_ix], meta_target.iloc[test_ix]
    # meta model build
    booster = LGBMClassifier(**best_meta_params, device='gpu')
    booster.fit(meta_X_train, meta_y_train, verbose=0, eval_set=[(meta_X_valid, meta_y_valid)], eval_metric='auc')
    print(booster.best_score_)
    test_preds = booster.predict_proba(meta_test_features)[:, 1]
    test_df_preds.append(test_preds)

In [None]:
# Average the meta model's per-fold test predictions and write the submission file.
# Item assignment is used because attribute-style assignment only sets a Python
# attribute (not a column) if 'target' is ever missing from the frame.
# NOTE(review): predictions follow test_features_df's row order (sorted by id by the
# merge); this assumes sample_submission.csv lists ids in the same order - confirm
submission_df['target'] = np.mean(test_df_preds, axis=0)
submission_df.to_csv('submission.csv', index=False)