## TPS Nov. 2021 - Baseline blend XGBM/LGBM/CB/HGB

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import time
from datetime import timedelta

import warnings
warnings.simplefilter("ignore")

import gc
gc.enable()

import numpy as np
import pandas as pd

import plotly.figure_factory as ff
import plotly.express as px

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

## Load datasets

In [None]:
%%time

# Directory that holds train.csv, test.csv and sample_submission.csv.
data_dir = "../input/tabular-playground-series-nov-2021/"

# data_dir already ends with a separator, so the file name is appended as-is.
train = pd.read_csv(f"{data_dir}train.csv")
test = pd.read_csv(f"{data_dir}test.csv")
submission = pd.read_csv(f"{data_dir}sample_submission.csv")

In [None]:
%%time

# Report the (rows, columns) of both frames; the doubled space mirrors
# print()'s default separator after the label.
print(f"Train shape:  {train.shape}")
print(f"Test shape:  {test.shape}", end="\n\n")

## Merge predictions to main data-frame

In [None]:
def merge_preds_df(df, preds_files):
    """Left-join every predictions CSV onto ``df`` using the ``id`` column.

    Args:
        df: DataFrame that contains an ``id`` column.
        preds_files: Iterable of CSV file names, resolved relative to the
            module-level ``preds_dir``. Each file must contain an ``id``
            column to join on.

    Returns:
        DataFrame with the columns of all prediction files appended. Because
        the join is a left join, rows of ``df`` without a matching ``id`` in a
        file get NaN in that file's columns. With no files, ``df`` itself is
        returned.
    """
    for preds_file in preds_files:
        # os.path.join works whether or not preds_dir ends with a separator,
        # unlike plain string concatenation.
        preds = pd.read_csv(os.path.join(preds_dir, preds_file))
        df = df.merge(preds, on="id", how="left")
    return df

def build_preds_file_names(preds):
    """Derive the test/train prediction file names for each model name.

    Args:
        preds: Iterable of model names (e.g. ``"cb1"``).

    Returns:
        Tuple ``(test_files, train_files)`` of two lists holding
        ``"<name>_test.csv"`` and ``"<name>_train.csv"`` in input order.
    """
    # Walk the input exactly once so one-shot iterables work too.
    name_pairs = [(f"{name}_test.csv", f"{name}_train.csv") for name in preds]
    test_files = [test_name for test_name, _ in name_pairs]
    train_files = [train_name for _, train_name in name_pairs]
    return (test_files, train_files)

In [None]:
%%time

# Directory with the per-model prediction CSVs.
preds_dir = "../input/tps-november-2021-predictions/"

# Name of the label column.
TARGET = "target"

# Base-model names; they double as the prediction file prefixes and as the
# column names selected from the merged frames.
features = ["cb1", "hgb1", "lgb1", "xgb1"]

preds_test_files, preds_train_files = build_preds_file_names(features)

# Attach each model's predictions to the matching frame by id.
train = merge_preds_df(train, preds_train_files)
test = merge_preds_df(test, preds_test_files)

In [None]:
# Preview the merged base-model prediction columns of the training frame.
train[features].head()

In [None]:
# Preview the merged base-model prediction columns of the test frame.
test[features].head()

## Visualize correlations

In [None]:
%%time

def load_vis_data(preds_files):
    """Load the prediction column of each predictions file for plotting.

    Args:
        preds_files: Iterable of CSV file names, resolved relative to the
            module-level ``preds_dir``. Each file must have exactly two
            columns, which are renamed to ``["id", TARGET]``.

    Returns:
        List with one Series (named ``TARGET``) per file, in input order.
    """
    hist_data = []
    for preds_file in preds_files:
        # os.path.join avoids the doubled separator that "dir/" + "/" + name
        # produced and also works when preds_dir has no trailing separator.
        df_tmp = pd.read_csv(os.path.join(preds_dir, preds_file))
        # Give the prediction column the same name in every file.
        df_tmp.columns = ["id", TARGET]
        hist_data.append(df_tmp[TARGET])
    return hist_data

In [None]:
%%time

# One prediction Series per test predictions file, in the order of preds_test_files.
vis_data = load_vis_data(preds_test_files)

In [None]:
%%time

# Distribution plot of each model's test predictions, with the histogram
# bars and rug marks switched off.
fig1 = ff.create_distplot(
    vis_data,
    preds_test_files,
    bin_size=0.3,
    show_hist=False,
    show_rug=False,
)
fig1.show()

In [None]:
%%time

# Heatmap of the correlation coefficients between the models' test
# predictions; both axes are labelled with the prediction file names.
fig2 = px.imshow(
    np.corrcoef(vis_data),
    x=preds_test_files,
    y=preds_test_files,
)
fig2.show()

## Prepare

In [None]:
%%time

# Split the label off the training frame; pop() also removes the column.
y = train.pop(TARGET)

# "id" stays in X so the out-of-fold predictions can be keyed by row id.
X = train[["id", *features]].copy()
X_test = test[features].copy()

# Only the copies above are used from here on.
del train, test

## Predict

In [None]:
%%time

def predict_with_model(model, simple_fit=False, splits=5):
    """Cross-validate ``model`` and collect out-of-fold and test predictions.

    Reads the module-level ``X``, ``y``, ``X_test`` and ``features``.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``. The same
            instance is fitted again on every fold.
        simple_fit: When true, call ``fit(X, y)`` only; otherwise also pass
            the validation fold as ``eval_set`` together with
            ``early_stopping_rounds`` and ``verbose``.
        splits: Number of stratified folds.

    Returns:
        Tuple ``(test_preds, valid_preds, scores)``: the test-set probability
        of class index 1 averaged over the folds, a DataFrame built from the
        ``{id: out-of-fold prediction}`` mapping with its index reset, and
        the list of per-fold ROC AUC values.
    """
    fold_test_preds = []
    oof_by_id = {}
    fold_scores = []

    cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        fold_start = time.monotonic()

        train_part = X.iloc[train_idx]
        valid_part = X.iloc[valid_idx]
        y_fit = y.iloc[train_idx]
        y_holdout = y.iloc[valid_idx]

        # Remember the row ids before the "id" column is dropped from the
        # model inputs.
        holdout_ids = valid_part.id.values.tolist()
        X_fit = train_part[features]
        X_holdout = valid_part[features]

        if not simple_fit:
            model.fit(
                X_fit, y_fit,
                eval_set=[(X_holdout, y_holdout)],
                early_stopping_rounds=180,
                verbose=1000,
            )
        else:
            model.fit(X_fit, y_fit)

        # Column 1 of predict_proba is kept as the score for both sets.
        holdout_pred = model.predict_proba(X_holdout)[:, 1]
        fold_test_preds.append(model.predict_proba(X_test)[:, 1])
        oof_by_id.update(zip(holdout_ids, holdout_pred))

        auc = roc_auc_score(y_holdout, holdout_pred)
        fold_scores.append(auc)

        elapsed = timedelta(seconds=time.monotonic() - fold_start)
        print(f"Fold {fold} | AUC: {auc} | Took: {elapsed}")

    # Average the per-fold test predictions row by row.
    averaged_test = np.column_stack(fold_test_preds).mean(axis=1)
    oof_frame = pd.DataFrame.from_dict(oof_by_id, orient="index").reset_index()

    return averaged_test, oof_frame, fold_scores

In [None]:
%%time

def predict_with_models(models):
    """Run cross-validated prediction for every model and save the outputs.

    For each ``(model_name, model)`` pair three CSV files are written to the
    working directory: ``<name>_train.csv`` (out-of-fold predictions),
    ``<name>_test.csv`` (averaged test predictions under the model name) and
    ``<name>_submission.csv`` (the same predictions under ``TARGET``).
    Test rows are labelled with the ids of the module-level ``submission``.

    Args:
        models: Sequence of ``(model_name, model)`` pairs; the name is used
            for the output file names and prediction column.
    """
    print(f"Predicting with {len(models)} models...", end="\n\n")
    separator = "-" * 50

    for model_name, model in models:
        started = time.monotonic()

        print(separator)
        print(f"Using {model_name} model...")
        # Every model goes through the plain fit(X, y) path
        # (simple fit for sklearn models).
        test_preds, valid_preds, scores = predict_with_model(model, simple_fit=True)
        print(f"Score: {np.mean(scores)}, Std: {np.std(scores)}", end="\n\n")

        print("Saving predictions...")
        valid_preds.columns = ["id", model_name]
        valid_preds.to_csv(f"{model_name}_train.csv", index=False)

        # Same test predictions, written once under the model name and once
        # under the target name for submission.
        for column, suffix in ((model_name, "test"), (TARGET, "submission")):
            frame = pd.DataFrame({"id": submission.id, column: test_preds})
            frame.to_csv(f"{model_name}_{suffix}.csv", index=False)

        elapsed = timedelta(seconds=time.monotonic() - started)
        print(f"Took: {elapsed}")

In [None]:
%%time

# Seed passed to the logistic-regression models below.
SEED = 42

# Logistic regression with the "saga" solver.
lr1_params = dict(random_state=SEED, solver="saga")

# Logistic regression with the "sag" solver and an explicit intercept.
lr2_params = dict(random_state=SEED, fit_intercept=True, solver="sag")

# Gaussian naive Bayes with no overrides.
gnb1_params = dict()

# Model name must be unique: it names the output files and prediction column.
models = [
    ("lr1", LogisticRegression(**lr1_params)),
    ("lr2", LogisticRegression(**lr2_params)),
    ("gnb1", GaussianNB(**gnb1_params)),
]

In [None]:
%%time

# Fit every configured model with cross-validation and write its prediction files.
predict_with_models(models)