## 🇵🇭 TPS Oct. 2021 - Pinoy blends 🔥

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import time
from datetime import timedelta

import warnings
warnings.simplefilter("ignore")

import gc
gc.enable()

import numpy as np
import pandas as pd

import plotly.figure_factory as ff
import plotly.express as px

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

## Load datasets

In [None]:
%%time

# Refer to https://www.kaggle.com/bextuychiev/how-to-work-w-million-row-datasets-like-a-pro

def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest fitting dtype.

    Integer columns are tried against int8, int16 and int32; float columns
    against float16 and float32. Range checks are inclusive, so a column whose
    minimum or maximum sits exactly on a dtype limit (e.g. -128 or 127 for
    int8) still gets the narrower dtype. Columns whose dtype is not one of the
    listed signed-int / float dtypes are left untouched.

    NOTE: only the value *range* is checked. Float columns downcast to
    float16/float32 keep their range but lose precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ("int8", "int16", "int32", "int64", "float16", "float32", "float64")
    start_mem = df.memory_usage().sum() / 1024 ** 2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Integers that do not fit int32 simply keep their current dtype.
            candidates, limits_of, fallback = (np.int8, np.int16, np.int32), np.iinfo, None
        else:
            # Floats that fit neither candidate (including when min/max is
            # NaN or infinite) are stored as float64.
            candidates, limits_of, fallback = (np.float16, np.float32), np.finfo, np.float64

        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            if fallback is not None:
                df[col] = df[col].astype(fallback)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)". 
              format(end_mem, reduction))
    return df

In [None]:
%%time

# Directory holding the competition CSVs. The trailing slash matters because
# file names are appended by plain string concatenation.
data_dir = "../input/tabular-playground-series-oct-2021/"

# Read and downcast each file in turn: train, test, then sample submission.
train, test, submission = (
    reduce_memory_usage(pd.read_csv(data_dir + csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
%%time

# Report the dimensions of both frames; the last line is followed by a blank line.
for label, frame, line_end in (
    ("Train shape: ", train, "\n"),
    ("Test shape: ", test, "\n\n"),
):
    print(label, frame.shape, end=line_end)

## Merge predictions to main data frames

In [None]:
def merge_preds_df(df, preds_files):
    """Left-join every prediction file onto ``df`` using the ``id`` column.

    Each file name is appended to the module-level ``preds_dir`` string and
    read with ``pd.read_csv``. The joins are applied in the order given.

    Args:
        df: Frame containing an ``id`` column.
        preds_files: Iterable of CSV file names located under ``preds_dir``.

    Returns:
        The merged frame; ``df`` itself when ``preds_files`` is empty.
    """
    merged = df
    for file_name in preds_files:
        preds = pd.read_csv(preds_dir + file_name)
        merged = merged.merge(preds, how="left", on="id")
    return merged

def build_preds_file_names(preds):
    """Derive the test and train CSV file names for each prediction name.

    Args:
        preds: Iterable of prediction names, e.g. ``"cb2"``.

    Returns:
        A ``(test_files, train_files)`` tuple of lists, where each name maps
        to ``"<name>_test.csv"`` and ``"<name>_train.csv"`` respectively, in
        input order.
    """
    # Single pass over ``preds`` so one-shot iterables are consumed only once.
    name_pairs = [(f"{pred}_test.csv", f"{pred}_train.csv") for pred in preds]
    test_files = [test_name for test_name, _ in name_pairs]
    train_files = [train_name for _, train_name in name_pairs]
    return (test_files, train_files)

In [None]:
%%time

# Directory with the per-model prediction CSVs (one "<name>_train.csv" and one
# "<name>_test.csv" per feature name).
preds_dir = "../input/tps-october-2021-predictions/"

TARGET = "target"

# Blend inputs: the cb / hgb / lgb predictions of runs 2, 6 and 8, ordered run
# by run. To experiment with other runs (1, 3, 4, 5, 7; run 1 previously also
# listed a "vote1" entry), extend the run tuple below.
features = [
    f"{family}{run}"
    for run in (2, 6, 8)
    for family in ("cb", "hgb", "lgb")
]

preds_test_files, preds_train_files = build_preds_file_names(features)

# Attach every selected prediction column to the matching frame by id.
test = merge_preds_df(test, preds_test_files)
train = merge_preds_df(train, preds_train_files)

In [None]:
# Preview the first rows of the merged prediction columns in the train frame.
train[features].head()

In [None]:
# Preview the first rows of the merged prediction columns in the test frame.
test[features].head()

## Visualize correlations

In [None]:
%%time

def load_vis_data(preds_files):
    """Load the prediction column of each file for plotting.

    Every file is read from ``{preds_dir}/{file}`` and its two columns are
    renamed to ``"id"`` and ``TARGET``.

    Args:
        preds_files: Iterable of prediction CSV file names.

    Returns:
        A list with one ``TARGET`` Series per file, in input order.
    """
    hist_data = []
    for file_name in preds_files:
        preds = pd.read_csv(f"{preds_dir}/{file_name}")
        # Give the prediction column a common name regardless of the model.
        preds.columns = ["id", TARGET]
        hist_data.append(preds[TARGET])
    return hist_data

In [None]:
%%time

# One prediction Series per test-prediction file, in the order of preds_test_files.
vis_data = load_vis_data(preds_test_files)

In [None]:
%%time

# Distribution curves of every model's test predictions, labelled by file name
# (histogram bars and rug marks are switched off).
fig1 = ff.create_distplot(
    hist_data=vis_data,
    group_labels=preds_test_files,
    bin_size=0.3,
    show_hist=False,
    show_rug=False,
)
fig1.show()

In [None]:
%%time

# Heatmap of the pairwise correlation coefficients between the models' test
# predictions; both axes are labelled with the prediction file names.
fig2 = px.imshow(
    img=np.corrcoef(vis_data),
    x=preds_test_files,
    y=preds_test_files,
)
fig2.show()

In [None]:
%%time

# Split off the label, then keep only the id (needed to key out-of-fold
# predictions) and the blend features for training.
y = train.pop(TARGET)
X = train.loc[:, ["id", *features]].copy()
X_test = test.loc[:, features].copy()

# The full frames are no longer needed once the copies exist.
del train, test

## Predict

In [None]:
%%time

# Keyword arguments for each meta-model.
lr1_params = dict(random_state=42, solver="saga")
lr2_params = dict(fit_intercept=True, random_state=42, solver="sag")
gnb1_params = dict()

# (name, estimator) pairs. Model name must be unique: it is used as the
# prediction column name and as the prefix of the output files.
models = [
    ("lr1", LogisticRegression(**lr1_params)),
    ("lr2", LogisticRegression(**lr2_params)),
    ("gnb1", GaussianNB(**gnb1_params)),
]

In [None]:
%%time

def predict_with_model(model, simple_fit=False, splits=5):
    """Cross-validate ``model`` and collect out-of-fold and test predictions.

    Reads the module-level ``X`` (features plus ``id``), ``y``, ``X_test`` and
    ``features``. The same ``model`` instance is re-fitted on every fold of a
    shuffled ``StratifiedKFold`` (``random_state=42``).

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``.
        simple_fit: When true, call ``model.fit(X_train, y_train)``. Otherwise
            also pass ``eval_set``, ``early_stopping_rounds=180`` and
            ``verbose=1000``, which the estimator's ``fit`` must accept.
        splits: Number of cross-validation folds.

    Returns:
        A ``(test_preds, valid_preds, scores)`` tuple:
        ``test_preds`` is the positive-class probability on ``X_test`` averaged
        over the folds; ``valid_preds`` is a two-column DataFrame holding each
        validation row's id and its out-of-fold probability (columns are the
        ``reset_index`` defaults, to be renamed by the caller); ``scores`` is
        the list of per-fold ROC AUC values.
    """
    test_preds = []
    valid_preds = {}
    scores = []

    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    for fold, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
        start_time = time.monotonic()

        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        # Remember which rows are validated before the id column is dropped.
        valid_ids = X_valid["id"].values.tolist()

        X_train = X_train[features]
        X_valid = X_valid[features]

        if simple_fit:
            model.fit(X_train, y_train)
        else:
            model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=180,
                verbose=1000
            )

        # Column 1 of predict_proba is the positive-class probability.
        valid_pred = model.predict_proba(X_valid)[:, 1]
        test_pred = model.predict_proba(X_test)[:, 1]

        test_preds.append(test_pred)
        valid_preds.update(zip(valid_ids, valid_pred))

        score = roc_auc_score(y_valid, valid_pred)

        end_time = time.monotonic()
        dur = timedelta(seconds=end_time - start_time)
        print(f"Fold {fold} | AUC: {score} | Took: {dur}")
        scores.append(score)

    # Average the per-fold test predictions row by row.
    test_preds = np.mean(np.column_stack(test_preds), axis=1)
    valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()

    return test_preds, valid_preds, scores

In [None]:
%%time

def predict_with_models(models):
    """Run every model through cross-validation and save its predictions.

    For each ``(model_name, model)`` pair this calls ``predict_with_model``
    with ``simple_fit=True``, prints the mean and standard deviation of the
    fold scores, and writes three CSVs to the working directory:
    ``<name>_train.csv`` (out-of-fold predictions), ``<name>_test.csv`` (test
    predictions in a column named after the model) and
    ``<name>_submission.csv`` (the same test predictions in the ``TARGET``
    column). Test ids come from the module-level ``submission`` frame.

    Args:
        models: Sequence of ``(model_name, model)`` tuples.
    """
    print(f"Predicting with {len(models)} models...", end="\n\n")
    for model_name, model in models:
        started = time.monotonic()

        print("-" * 50)
        print(f"Using {model_name} model...")
        test_preds, oof_preds, fold_scores = predict_with_model(model, simple_fit=True)
        print(f"Score: {np.mean(fold_scores)}, Std: {np.std(fold_scores)}", end="\n\n")

        print("Saving predictions...")
        oof_preds.columns = ["id", model_name]
        oof_preds.to_csv(f"{model_name}_train.csv", index=False)

        # The same test predictions are stored twice: once under the model's
        # own column name and once as a ready-to-submit file.
        outputs = (
            (model_name, f"{model_name}_test.csv"),
            (TARGET, f"{model_name}_submission.csv"),
        )
        for column, path in outputs:
            pd.DataFrame({"id": submission.id, column: test_preds}).to_csv(path, index=False)

        dur = timedelta(seconds=time.monotonic() - started)
        print(f"Took: {dur}")

In [None]:
%%time

# Fit every configured meta-model and write its prediction CSVs.
predict_with_models(models)