In [None]:
class CFG:
    """Experiment configuration shared by every step of the notebook.

    The class object itself is passed around (never instantiated); `setup()`
    later attaches paths and the raw data frames to it as extra attributes.
    """

    # --- cross-validation -------------------------------------------------
    n_fold = 5                    # number of CV splits
    target_col = "Transported"    # label column
    cv_strategy = "kfold"         # "group" -> StratifiedGroupKFold in fold(); anything else -> KFold
    seed = 2022                   # random_state handed to the CV splitter

    # --- model --------------------------------------------------------------
    model = "LGB"                 # key resolved by get_model(): "LGB" or "XGB"
    model_params = dict(
        n_estimators=10000,
        objective='binary',
        boosting_type='gbdt',
        max_depth=-1,
        num_leaves=31,
        min_child_weight=1,
        learning_rate=0.01,
        random_state=42,
        colsample_bytree=0.8,
        importance_type='gain',
    )

    # Extra keyword arguments forwarded to `model.fit(...)` in train_cv().
    fit_params = {
        "early_stopping_rounds": 100,
        "verbose": 100,
        "eval_metric": "auc",
    }

    # When True, train_cv() persists each fitted fold model to disk.
    save_model = False

In [None]:
import os
import random
import joblib
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedGroupKFold, KFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
import category_encoders as ce

from matplotlib_venn import venn2
from xgboost import XGBModel
from lightgbm import LGBMModel

In [None]:
def setup(CFG):
    """Prepare the experiment: paths, output folders, raw data, warning filter.

    Attaches the input/output locations to `CFG` as attributes, creates the
    models/preds/figs output directories if they are missing, loads the raw
    CSV files through `load_raw_data` and finally silences all warnings.
    """
    # Locations (stored on CFG so every later step can reach them).
    CFG.INPUT = "../input/spaceship-titanic"
    CFG.OUTPUT = "."
    CFG.SUBMISSION = f"{CFG.OUTPUT}"
    CFG.OUTPUT_MODELS = f"{CFG.OUTPUT}/models"
    CFG.OUTPUT_PREDS = f"{CFG.OUTPUT}/preds"
    CFG.OUTPUT_FIGS = f"{CFG.OUTPUT}/figs"

    # Make sure every output folder exists before anything is written.
    for out_dir in (CFG.OUTPUT_MODELS, CFG.OUTPUT_PREDS, CFG.OUTPUT_FIGS):
        os.makedirs(out_dir, exist_ok=True)

    # Raw train / test / sample-submission frames.
    load_raw_data(CFG)

    # Suppress library warnings for the remainder of the session.
    warnings.filterwarnings("ignore")
        
        
def load_raw_data(CFG):
    """Read the competition CSV files from `CFG.INPUT` and store them on `CFG`.

    Sets `CFG.train_df`, `CFG.test_df` and `CFG.sample_submission_df`.
    """
    train_path = f"{CFG.INPUT}/train.csv"
    test_path = f"{CFG.INPUT}/test.csv"
    sample_path = f"{CFG.INPUT}/sample_submission.csv"

    CFG.train_df = pd.read_csv(train_path)
    CFG.test_df = pd.read_csv(test_path)
    CFG.sample_submission_df = pd.read_csv(sample_path)

# Attach paths and the raw train/test/sample-submission frames to CFG,
# then preview the training data.
setup(CFG)
display(CFG.train_df)

## Utils

In [None]:
def seed_everything(seed=2021):
    """Seed every random number generator the notebook relies on.

    Sets the `PYTHONHASHSEED` environment variable and seeds Python's
    `random`, NumPy's global RNG and TensorFlow with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


class Util:
    """Small joblib-based helpers for persisting Python objects to disk."""

    @classmethod
    def dump(cls, value, path):
        """Serialize `value` to `path` with joblib (compressed).

        The parent directory of `path` is created when it does not exist.

        Args:
            value: object to persist.
            path: destination file path; may be a bare file name.
        """
        parent = os.path.dirname(path)
        # A bare file name has an empty dirname and os.makedirs("") raises,
        # so only create the directory when there is one to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path):
        """Load and return the object stored at `path` by `dump`.

        NOTE(review): joblib.load unpickles the file, which can execute
        arbitrary code; only load files produced by a trusted source.
        """
        return joblib.load(path)

    
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of `df` in place to the narrowest fitting dtype.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float16 or float32 when the value range fits, else float64.
    Columns whose dtype is not one of int16/int32/int64/float16/float32/float64
    are left untouched.

    NOTE: the float downcast only checks the value *range*; casting to
    float16/float32 can reduce precision.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: when True, print each column name and a memory summary.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if verbose:
            print(col)
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduction))
    return df



def plot_intersection(left, right, column, set_labels, ax=None):
    """Draw a two-set Venn diagram of the distinct values of `column`.

    Args:
        left, right: frames that both contain `column`.
        column: column whose value sets are compared.
        set_labels: pair of labels for the two circles.
        ax: axes handed to `venn2`; may be None.

    Returns:
        The `ax` argument exactly as it was passed in (None when omitted).
    """
    value_sets = tuple(set(frame[column]) for frame in (left, right))
    venn2(subsets=value_sets, set_labels=set_labels, ax=ax)
    return ax


def plot_right_left_intersection(train_df, test_df, columns='__all__'):
    """Plot one train/test Venn diagram per column on a 6-wide grid.

    Args:
        train_df, test_df: frames to compare.
        columns: iterable of column names, a single column name, or
            '__all__' (default) for every column present in both frames,
            in `train_df` column order.

    Returns:
        The matplotlib figure holding the grid.
    """
    if isinstance(columns, str):
        if columns == '__all__':
            # Keep train_df's column order: iterating a set intersection
            # would make the panel order vary from run to run.
            shared = set(test_df.columns)
            columns = [c for c in train_df.columns if c in shared]
        else:
            # A single column name, not an iterable of characters.
            columns = [columns]

    columns = list(columns)
    nfigs = len(columns)
    ncols = 6
    # Ceiling division, with at least one row so plt.subplots never gets nrows=0.
    nrows = max(1, -(-nfigs // ncols))
    fig, axes = plt.subplots(figsize=(3 * ncols, 3 * nrows), ncols=ncols, nrows=nrows)
    axes = np.ravel(axes)
    for c, ax in zip(columns, axes):
        plot_intersection(train_df, test_df, column=c, set_labels=('Train', 'Test'), ax=ax)
        ax.set_title(c)
    # Hide the grid cells that received no diagram.
    for ax in axes[nfigs:]:
        ax.set_axis_off()
    return fig

# plot_intersection (object)
def plot_intersection_object(CFG):
    """Venn-plot the train/test value overlap of every object-dtype train column.

    Returns:
        The figure produced by `plot_right_left_intersection`.
    """
    object_cols = [
        name
        for name, dtype in CFG.train_df.dtypes.items()
        if dtype == "object"
    ]
    return plot_right_left_intersection(CFG.train_df, CFG.test_df, columns=object_cols)

In [None]:
# =========================================
#  EDA
# =========================================
# Venn diagrams of the train/test value overlap for every object-dtype
# column of the training frame.
fig = plot_intersection_object(CFG)

In [None]:
# =========================================
#  CV
# =========================================
def fold(CFG):
    """Assign a validation fold id to every training row.

    Adds a "fold" column to `CFG.train_df` (0..n_fold-1, the fold in which
    the row is held out) and to `CFG.test_df` (always -1), in place.

    `CFG.cv_strategy == "group"` uses StratifiedGroupKFold with the groups
    taken from the column named by `CFG.groups` (not set by the CFG class
    visible in this notebook, so it must be provided first); any other value
    uses a shuffled KFold. Both are seeded with `CFG.seed`.
    """
    train_df = CFG.train_df
    target = train_df[CFG.target_col]

    if CFG.cv_strategy == "group":
        cv = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
        splits = cv.split(train_df, target, groups=train_df[CFG.groups])
    else:
        cv = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
        splits = cv.split(train_df, target)

    # cv.split yields *positional* indices, so collect the ids in a plain
    # array instead of writing them through label-based `.loc`, which is
    # only correct when the frame has a default 0..n-1 index.
    fold_ids = np.full(len(train_df), -1, dtype=np.int64)
    for i_fold, (_, va_idx) in enumerate(splits):
        fold_ids[va_idx] = i_fold

    CFG.train_df["fold"] = fold_ids
    CFG.test_df["fold"] = -1
    

In [None]:
# =========================================
#  FE
# =========================================
def symmetric_difference2nan(train_df, test_df, cols):
    """Replace category values that appear in only one of train/test with NaN.

    For each column in `cols`, the values present in exactly one of the two
    frames are set to NaN in that column only; all other columns are left
    untouched. Both frames are modified in place and also returned.

    Args:
        train_df, test_df: frames that both contain every column in `cols`.
        cols: column names to process.

    Returns:
        (train_df, test_df)
    """
    for col in cols:
        # Missing values are already null; keep them out of the difference so
        # the reported count only covers real values.
        train_vals = set(train_df[col].dropna().unique())
        test_vals = set(test_df[col].dropna().unique())
        vals = list(train_vals ^ test_vals)
        print(f"{col}... #{len(vals)} to Null")
        # Null out only this column (a row mask used as `df[mask] = nan`
        # would blank every column of the matching rows).
        train_df[col] = train_df[col].where(~train_df[col].isin(vals))
        test_df[col] = test_df[col].where(~test_df[col].isin(vals))

    return train_df, test_df


def target_encoding_cv(train_df, test_df, target_col, fold_col, cat_cols, cat_encoder=None):
    """Out-of-fold target encoding of `cat_cols`.

    For every fold id `0..n_fold-1` found in `train_df[fold_col]`, the encoder
    is fitted on the other folds and applied to the held-out rows, so no
    training row is encoded with its own target. The test frame is encoded
    once per fold and the results are averaged.

    Args:
        train_df: training frame containing `cat_cols`, `target_col`, `fold_col`.
        test_df: test frame containing `cat_cols`.
        target_col: name of the target column.
        fold_col: name of the column holding the fold id of each row.
        cat_cols: categorical columns to encode.
        cat_encoder: object with `fit(X=, y=)` / `transform(X=)`; a fresh
            `ce.TargetEncoder()` is created when omitted. Other
            category_encoders classes can be passed instead, e.g.
            CatBoostEncoder, QuantileEncoder, JamesSteinEncoder, etc.

    Returns:
        (out_train_df, out_test_df): encoded columns named "TE=<col>";
        the train part is indexed like `train_df`, the test part has a
        default integer index.
    """
    if cat_encoder is None:
        # Built per call: a default created in the signature would be a
        # single instance shared (and re-fitted) across all calls.
        cat_encoder = ce.TargetEncoder()

    n_fold = train_df[fold_col].nunique()

    val_parts = []
    test_parts = []
    for i_fold in range(n_fold):
        val_mask = train_df[fold_col] == i_fold
        trn_df = train_df[~val_mask].reset_index(drop=True)
        val_df = train_df[val_mask]

        cat_encoder.fit(X=trn_df[cat_cols], y=trn_df[target_col])

        te_val_df = cat_encoder.transform(X=val_df[cat_cols]).add_prefix("TE=")
        te_val_df.index = val_df.index  # keep the original row labels
        val_parts.append(te_val_df)

        test_parts.append(cat_encoder.transform(X=test_df[cat_cols]).values)

    # Single concat instead of growing a frame inside the loop.
    out_train_df = pd.concat(val_parts).sort_index()
    out_test_df = pd.DataFrame(np.mean(test_parts, axis=0), columns=cat_cols).add_prefix("TE=")
    return out_train_df, out_test_df


def concat_out(input_df, output_df):
    """Return `input_df` and `output_df` joined column-wise, `input_df` first."""
    frames = [input_df, output_df]
    return pd.concat(frames, axis=1)


def aggregation(input_df, group_key, group_values, agg_methods):
    """Group-wise aggregate features broadcast back to every row.

    ref:https://github.com/pfnet-research/xfeat/blob/master/xfeat/helper.py

    For each aggregation method and each value column, the statistic of the
    value column within each `group_key` group is computed and then joined
    back onto the rows of `input_df` by `group_key`.

    Args:
        input_df: source frame.
        group_key: name of the column to group by.
        group_values: value columns to aggregate.
        agg_methods: aggregation names (e.g. "mean") or callables.

    Returns:
        A frame with one row per row of `input_df` and one column per
        (method, value column) pair, named
        "agg_<method>_<column>_grpby_<group_key>"; the key column is dropped.
    """
    def _label(method):
        # Callables are labelled by their function name, strings as-is.
        return method.__name__ if callable(method) else method

    per_group_stats = []
    for method in agg_methods:
        method_label = _label(method)
        for value_col in group_values:
            subset = input_df[[value_col, group_key]]
            stats = subset.groupby(group_key)[[value_col]].agg(method)
            stats.columns = [f"agg_{method_label}_{value_col}_grpby_{group_key}"]
            per_group_stats.append(stats)

    lookup = pd.concat(per_group_stats, axis=1).reset_index()
    broadcast = input_df[[group_key]].merge(lookup, on=group_key, how="left")
    return broadcast.drop(columns=group_key)

    
def fe(CFG):
    """Build the model-ready feature tables for train and test.

    Train and test are stacked so every transformation sees both, features
    are added block by block (each new block is placed in front of the
    existing columns), and the result is split back by row count.

    Returns:
        (out_train_df, out_test_df): the first `len(CFG.train_df)` rows and
        the remaining rows (re-indexed from 0). Both keep the "fold" and
        target columns next to the features.
    """
    categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
    numeric_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    n_train = len(CFG.train_df)

    features = pd.DataFrame()
    stacked = pd.concat([CFG.train_df, CFG.test_df]).reset_index(drop=True)

    print("# ----------- # change type # ----------- #")
    stacked[CFG.target_col] = stacked[CFG.target_col].astype(float)

    print("# ----------- # not features # ----------- #")
    bookkeeping_cols = ["fold", CFG.target_col]
    features = concat_out(stacked[bookkeeping_cols], features)

    print("# ----------- # raw features # ----------- #")
    features = concat_out(stacked[numeric_cols], features)

    print("# ----------- # ordinal encode # ----------- #")
    ordinal_df = ce.OrdinalEncoder().fit_transform(stacked[categorical_cols])
    features = concat_out(ordinal_df, features)

    print("# ----------- # agg features # ----------- #")
    # min/mean/max of every numeric column within each categorical group.
    agg_blocks = [
        aggregation(stacked, key, numeric_cols, ["min", "mean", "max"])
        for key in categorical_cols
    ]
    features = concat_out(pd.concat(agg_blocks, axis=1), features)

    print("# ----------- # target encoding # ----------- #")
    # Out-of-fold encoding, computed from the un-stacked train/test frames.
    te_train_df, te_test_df = target_encoding_cv(
        CFG.train_df, CFG.test_df, CFG.target_col, "fold", categorical_cols, ce.TargetEncoder()
    )
    te_df = pd.concat([te_train_df, te_test_df]).reset_index(drop=True)
    features = concat_out(te_df, features)
    display(features)

    print("# ----------- # finish fe # ----------- #")
    train_part = features.iloc[:n_train]
    test_part = features.iloc[n_train:].reset_index(drop=True)
    return train_part, test_part

In [None]:
# Assign CV fold ids on CFG.train_df / CFG.test_df, then build the
# feature tables used for training and prediction.
fold(CFG)
train_df, test_df = fe(CFG)

## Metrics

In [None]:
def get_score(y_true, y_pred):
    """Score predictions with ROC AUC (thin wrapper around `roc_auc_score`)."""
    auc = roc_auc_score(y_true, y_pred)
    return auc

## Train & Predict

In [None]:
def get_model(CFG):
    """Return the model class selected by `CFG.model`.

    Returns:
        `LGBMModel` for "LGB", `XGBModel` for "XGB" (the class itself, not an
        instance).

    Raises:
        NotImplementedError: for any other value of `CFG.model`.
    """
    if CFG.model == "LGB":
        return LGBMModel
    if CFG.model == "XGB":
        return XGBModel
    raise NotImplementedError


def train_cv(CFG, df, no_feature_cols=["fold", CFG.target_col]):
    """Train one model per CV fold and collect out-of-fold predictions.

    For each fold id `0..CFG.n_fold-1` the rows with that id in `df["fold"]`
    are held out for validation and the model is fitted on the rest. If a
    model file for the fold already exists under `CFG.OUTPUT_MODELS` it is
    loaded instead of re-training; a freshly trained model is written there
    only when `CFG.save_model` is true.

    Args:
        CFG: configuration (n_fold, target_col, model_params, fit_params,
            save_model, OUTPUT_MODELS, OUTPUT_PREDS).
        df: training table containing the features, "fold" and the target.
        no_feature_cols: columns of `df` that are not model inputs.

    Returns:
        (oof_df, models): a one-column frame (named after the target) with
        the out-of-fold predictions in `df` row order, and the list of fold
        models. The predictions are also written to
        `<CFG.OUTPUT_PREDS>/raw_oof_df.csv`.
    """
    feature_cols = [x for x in df.columns if x not in no_feature_cols]
    X = df[feature_cols]
    y = df[CFG.target_col]

    oof = np.zeros(len(df))
    models = []
    for i_fold in range(CFG.n_fold):
        # NOTE: the file name says "XGB" whatever CFG.model is; kept as-is so
        # previously saved files and predict_cv keep resolving the same path.
        filepath = f"{CFG.OUTPUT_MODELS}/XGB-Fold{i_fold}.pkl"
        val_mask = (df["fold"] == i_fold).astype(bool)
        tr_x, tr_y = X[~val_mask].reset_index(drop=True), y[~val_mask].reset_index(drop=True)
        va_x, va_y = X[val_mask].reset_index(drop=True), y[val_mask].reset_index(drop=True)

        if os.path.isfile(filepath):
            # A cached model exists: always load it. Loading only when
            # save_model is set would leave `model` undefined (or pointing at
            # the previous fold's model) whenever the flag is off.
            model = Util.load(filepath)
        else:
            model = get_model(CFG)(**CFG.model_params)
            model.fit(tr_x, tr_y, eval_set=[(va_x, va_y)], **CFG.fit_params)
            if CFG.save_model:
                Util.dump(model, filepath)

        preds = model.predict(va_x)
        models.append(model)

        # per-fold score
        score = get_score(va_y, preds)
        print(f"target:{CFG.target_col}_fold{i_fold}={score:.5f}")
        oof[val_mask] = preds

    oof_df = pd.DataFrame()
    oof_df[CFG.target_col] = oof
    # save out-of-fold preds
    oof_df.to_csv(CFG.OUTPUT_PREDS + "/raw_oof_df.csv", index=False)

    # overall out-of-fold score
    score = get_score(y, oof)
    print(f"target:{CFG.target_col}={score:.5f}")

    return oof_df, models


def predict_cv(CFG, df, models=None, no_feature_cols=["fold", CFG.target_col]):
    """Predict with every fold model and average the results.

    Args:
        CFG: configuration (n_fold, target_col, OUTPUT_MODELS, OUTPUT_PREDS).
        df: table to predict on; columns listed in `no_feature_cols` are
            ignored when present and are not required.
        models: list of fitted fold models. When None, the models are loaded
            from `<CFG.OUTPUT_MODELS>/XGB-Fold<i>.pkl`, the path train_cv
            uses for its fold models.
        no_feature_cols: columns of `df` that are not model inputs.

    Returns:
        A one-column frame (named after the target) holding the mean of the
        fold predictions. Each fold's raw predictions are dumped to
        `CFG.OUTPUT_PREDS` and the averaged frame is written to
        `<CFG.OUTPUT_PREDS>/raw_preds_df.csv`.
    """
    feature_cols = [x for x in df.columns if x not in no_feature_cols]
    X = df[feature_cols]

    fold_preds = []
    for i_fold in range(CFG.n_fold):
        if models is None:
            # CFG.OUTPUT_MODELS is where train_cv stores fold models
            # (CFG.EXP_MODEL is never defined in this notebook).
            filepath = f"{CFG.OUTPUT_MODELS}/XGB-Fold{i_fold}.pkl"
            model = Util.load(filepath)
        else:
            model = models[i_fold]

        preds = model.predict(X)
        fold_preds.append(preds)

        # save fold preds
        Util.dump(preds, CFG.OUTPUT_PREDS + f"/raw_preds_target{CFG.target_col}_fold{i_fold}.pkl")

    preds_df = pd.DataFrame()
    preds_df[CFG.target_col] = np.mean(fold_preds, axis=0)
    preds_df.to_csv(CFG.OUTPUT_PREDS + "/raw_preds_df.csv", index=False)
    return preds_df


In [None]:
# Fit one model per fold (collecting out-of-fold predictions), then average
# the fold models' predictions on the test features.
oof_df, models = train_cv(CFG, df=train_df, no_feature_cols=["fold", CFG.target_col])
preds_df = predict_cv(CFG, df=test_df, models=models, no_feature_cols=["fold", CFG.target_col])

## Importance

In [None]:
def visualize_importance(models, feat_train_df, no_feature_cols=["fold", CFG.target_col]):
    """Plot the feature importances of a list of LightGBM models.

    The variation across CV folds is shown as a boxen plot; only the 50
    features with the largest summed importance are drawn.

    args:
        models:
            List of lightGBM models
        feat_train_df:
            DataFrame that was used for training
        no_feature_cols:
            columns of `feat_train_df` that were not model inputs

    returns:
        (fig, ax) of the plot
    """
    feature_cols = [c for c in feat_train_df.columns if c not in no_feature_cols]

    # One long table: a row per (fold, feature). The leading empty frame keeps
    # the result an empty DataFrame when `models` is empty.
    per_fold = [pd.DataFrame()]
    for fold_no, model in enumerate(models, start=1):
        per_fold.append(pd.DataFrame({
            "feature_importance": model.feature_importances_,
            "column": feature_cols,
            "fold": fold_no,
        }))
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Rank features by importance summed over folds; keep the top 50.
    totals = feature_importance_df.groupby("column").sum()[["feature_importance"]]
    order = totals.sort_values("feature_importance", ascending=False).index[:50]

    fig_height = max(6, len(order) * .25)
    fig, ax = plt.subplots(figsize=(8, fig_height))
    sns.boxenplot(data=feature_importance_df,
                  x="feature_importance",
                  y="column",
                  order=order,
                  ax=ax,
                  palette="viridis",
                  orient="h")
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax

# Feature importance of the fold models trained above, by training feature.
fig, ax = visualize_importance(models, train_df)

## Submission

In [None]:
# Start from the sample submission so ids and column layout match it.
sub = CFG.sample_submission_df.copy()

# Threshold the averaged predictions at 0.5 and keep the target in the dtype
# the sample file uses instead of forcing 0/1 integers.
# NOTE(review): the sample column is presumably boolean (True/False) — confirm
# against sample_submission.csv.
target_dtype = CFG.sample_submission_df[CFG.target_col].dtype
labels = (preds_df[CFG.target_col] >= 0.5).astype(target_dtype)
# Assign by position: the rows of preds_df follow the test file order.
sub[CFG.target_col] = labels.to_numpy()

display(sub)
sub.to_csv("submission.csv", index=False)