# TPS September 2021 - Blend LGB/XGB/CB/HGB/Voting

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import matplotlib as plt

import plotly.figure_factory as ff
import plotly.express as px

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

## Load datasets

In [None]:
%%time

# Name of the label column in the training data.
TARGET = "claim"

# Input locations: fold-annotated training data and the raw competition files.
folds_dir = "../input/tps-september-2021-skfolds/"
data_dir = "../input/tabular-playground-series-sep-2021/"

df_train = pd.read_csv(f"{folds_dir}train_folds.csv")
df_test = pd.read_csv(f"{data_dir}test.csv")
submission = pd.read_csv(f"{data_dir}sample_solution.csv")

## Merge base-model predictions into the main data frames

In [None]:
%%time

preds_dir = "../input/tps-september-2021-preds/"

# Single source of truth for the base models being blended.
# Each entry is (model tag, run suffix). The train/test file names and the
# prediction column names are all derived from it, so the three lists below
# cannot drift out of sync or out of order with one another.
_pred_sources = [
    ("lgb", "3"),
    ("xgb", "3"),
    ("cb", "3"),
    # ("hgb", "3"),  # left out of this blend

    ("lgb", "4"),
    ("xgb", "4"),
    ("cb", "4"),
    ("hgb", "4"),

    ("vote", "1"),
    ("vote", "1_1"),
    ("vote", "2"),

    ("lgb1", "5"),
    ("lgb2", "5"),
    ("lgb3", "5"),
    ("lgb4", "5"),
]

# Out-of-fold prediction files, merged into the training frame.
preds_train_files = [f"{tag}_train_{run}.csv" for tag, run in _pred_sources]

# Test-set prediction files, merged into the test frame.
preds_test_files = [f"{tag}_test_{run}.csv" for tag, run in _pred_sources]

# Prediction column names used as inputs to the second-level model.
features = [f"{tag}_pred_{run}" for tag, run in _pred_sources]


def merge_preds_df(df, preds_files):
    """Left-join every prediction file onto ``df`` by its ``id`` column.

    Args:
        df: Frame containing an ``id`` column.
        preds_files: File names, appended directly to ``preds_dir``, of the
            CSVs to join. Each must contain an ``id`` column.

    Returns:
        A new frame holding the columns of ``df`` followed by the columns
        contributed by each file, in the order the files were given.
    """
    merged = df
    for file_name in preds_files:
        preds = pd.read_csv(preds_dir + file_name)
        # A left join keeps every row of the main frame even when a
        # prediction file has no entry for that id.
        merged = merged.merge(preds, how="left", on="id")
    return merged

# Attach the base-model predictions to the train and test frames.
df_train = merge_preds_df(df=df_train, preds_files=preds_train_files)
df_test = merge_preds_df(df=df_test, preds_files=preds_test_files)

# Only the prediction columns are fed to the second-level model at test time.
df_test = df_test.loc[:, features]

## Check prediction distributions and correlations

In [None]:
def load_vis_data(preds_files):
    """Load the prediction column of each file for plotting.

    Args:
        preds_files: File names, relative to ``preds_dir``, of CSVs that have
            exactly two columns (presumably id then prediction -- confirm
            against the files; the column rename below fails otherwise).

    Returns:
        A list with one ``pandas.Series`` per file, in the order given, each
        named ``TARGET``.
    """
    hist_data = []
    for preds_file in preds_files:
        df_tmp = pd.read_csv(f"{preds_dir}/{preds_file}")
        # Every file names its prediction column differently; rename to a
        # common name. Use TARGET rather than a hard-coded "claim" so the
        # rename and the lookup below cannot disagree.
        df_tmp.columns = ["id", TARGET]
        hist_data.append(df_tmp[TARGET])

    return hist_data

In [None]:
%%time

# One prediction series per test-prediction file, in list order.
vis_data = load_vis_data(preds_files=preds_test_files)

In [None]:
%%time

# Overlay one density curve per prediction file; histogram bars and rug
# marks are switched off so only the curves are drawn.
fig1 = ff.create_distplot(
    hist_data=vis_data,
    group_labels=preds_test_files,
    bin_size=0.3,
    show_hist=False,
    show_rug=False,
)
fig1.show()

In [None]:
%%time

# Correlation coefficients between every pair of test-prediction series,
# shown as a heatmap labelled by file name on both axes.
corr_matrix = np.corrcoef(vis_data)
fig2 = px.imshow(corr_matrix, x=preds_test_files, y=preds_test_files)
fig2.show()

## Predict

In [None]:
def predict(model, df_train, df_test, folds=5):
    """Fit ``model`` once per fold and collect validation and test predictions.

    For each fold number in ``range(folds)`` the model is fitted on the rows
    whose ``kfold`` differs from the fold, then used to score the held-out
    rows and the full test set. Inputs are only read, never modified.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``;
            column 1 of ``predict_proba`` is taken as the prediction.
        df_train: Frame with ``id``, ``kfold``, ``TARGET`` and every column
            listed in ``features``.
        df_test: Frame containing every column listed in ``features``.
        folds: Number of folds; fold labels are expected to be
            ``0 .. folds - 1``.

    Returns:
        A tuple ``(test_preds, valid_preds, scores)``:
        ``test_preds`` is the per-row mean of the test predictions over all
        folds; ``valid_preds`` is a two-column frame (id, then held-out
        prediction) whose columns the caller is expected to rename;
        ``scores`` is the list of per-fold validation ROC AUC values.
    """
    # The test matrix is the same for every fold, so build it once instead of
    # copying the whole test frame on each iteration. Selecting by `features`
    # also guarantees the same columns, in the same order, as the training
    # matrix.
    x_test = df_test[features]

    test_preds = []
    valid_preds = {}
    scores = []

    for fold in range(folds):
        is_valid = df_train.kfold == fold

        # Pull out only the columns that are needed, rather than copying the
        # entire training frame twice per fold.
        x_train = df_train.loc[~is_valid, features]
        y_train = df_train.loc[~is_valid, TARGET]
        x_valid = df_train.loc[is_valid, features]
        y_valid = df_train.loc[is_valid, TARGET]
        valid_ids = df_train.loc[is_valid, "id"].tolist()

        model.fit(x_train, y_train)

        valid_pred = model.predict_proba(x_valid)[:, 1]
        test_pred = model.predict_proba(x_test)[:, 1]

        test_preds.append(test_pred)
        # Keyed by id so the held-out predictions of all folds combine into
        # one id -> prediction mapping.
        valid_preds.update(dict(zip(valid_ids, valid_pred)))

        score = roc_auc_score(y_valid, valid_pred)
        print(f"Fold {fold} | AUC: {score}")
        scores.append(score)

    # Average the per-fold test predictions row by row.
    test_preds = np.mean(np.column_stack(test_preds), axis=1)
    valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()

    return test_preds, valid_preds, scores

In [None]:
# Hyper-parameter sets for the five logistic-regression blenders.
lr1_params = dict(
    C=2.1434537945516228,
    random_state=0,
    solver="saga",
)

lr2_params = dict(
    C=2.4622964608689104,
    fit_intercept=True,
    random_state=42,
    solver="sag",
    tol=0.009995426310967661,
)

lr3_params = dict(
    C=2.106528532902824,
    fit_intercept=True,
    random_state=88,
    solver="sag",
    tol=0.00809989768012382,
)

lr4_params = dict(
    C=2.752343403147011,
    fit_intercept=True,
    random_state=1,
    solver="sag",
    tol=0.009277692817264553,
)

lr5_params = dict(
    C=2.577491520675309,
    fit_intercept=True,
    random_state=100,
    solver="sag",
    tol=0.0076982708195162605,
)

# (name, estimator) pairs; the name becomes the prefix of the output files.
models = [
    (label, LogisticRegression(**params))
    for label, params in [
        ("lr1", lr1_params),
        ("lr2", lr2_params),
        ("lr3", lr3_params),
        ("lr4", lr4_params),
        ("lr5", lr5_params),
    ]
]

In [None]:
%%time

for name, model in models:
    print(f"Using {name}...")
    # predict() only reads its inputs (it builds fresh per-fold frames), so
    # the frames are passed directly instead of duplicating them per model.
    test_preds, valid_preds, scores = predict(model, df_train, df_test)
    print(np.mean(scores), np.std(scores))

    # Write three files per model: held-out train predictions, test
    # predictions under the model's own column name, and a submission file.
    print("Saving submission files...")
    pred_col = f"{name}_pred_2"
    valid_preds.columns = ["id", pred_col]
    valid_preds.to_csv(f"{name}_train_2.csv", index=False)

    test_preds_df = pd.DataFrame({"id": submission.id, pred_col: test_preds})
    test_preds_df.to_csv(f"{name}_test_2.csv", index=False)

    sub = pd.DataFrame({"id": submission.id, "claim": test_preds})
    sub.to_csv(f"{name}_submission_2.csv", index=False)
    print("Done.")

    print("-" * 50)