## Config

In [None]:
class Config:
    """Experiment settings held as plain class attributes.

    The class itself (not an instance) is passed to ``setup`` and the other
    helpers, which read these values and attach further attributes to it.
    """

    # initial config
    author: str = "mst8823"
    name: str = "exp001"
    competition: str = "spaceship-titanic"
    debug: bool = False

    # experiment config
    target_col = "Transported"
    seed: int = 33
    num_fold: int = 5
    train_fold: list = [0, 1, 2, 3, 4]  # folds that are actually trained / predicted
    overwrite: bool = False  # retrain even if a saved model file exists

    # model config
    model_name: str = "LGB"
    model_params: dict = {
        "objective": "binary",
        "n_estimators": 10000,
        "num_leaves": 31,
        "colsample_bytree": 0.1,
        "learning_rate": 0.01,
        "importance_type": "gain",
        # reuse the experiment seed defined above
        "random_state": seed,
    }
    fit_params: dict = {
        "early_stopping_rounds": 300,
        "verbose": 100,
        "eval_metric": "auc",
    }

    # colab config
    HOME: str = "/content/drive/MyDrive/spaceship-titanic"
    API: str = "/content/drive/MyDrive/competition/kaggle.json"
    upload_from_colab = False


## Library

In [None]:
import os
import sys
import json
import random
import joblib
import shutil
import logging
import warnings
import datetime
import requests

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMModel
from xgboost import XGBModel
import torch

## Setup

In [None]:
def setup(config):
    config.COLAB = "google.colab" in sys.modules
    if config.COLAB:
        print("This environment is Google Colab")

        # mount
        from google.colab import drive
        if not os.path.isdir("/content/drive"):
            drive.mount("/content/drive") 

        # use kaggle api (need kaggle token)
        f = open(config.API, 'r')
        json_data = json.load(f) 
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        os.environ["KAGGLE_KEY"] = json_data["key"]

        # set dirs
        config.EXP = (config.name if config.name is not None 
            else requests.get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6]
        )
        config.INPUT = os.path.join(config.HOME, "input")
        config.OUTPUT = os.path.join(config.HOME, "output")
        config.SUBMISSION = os.path.join(config.HOME, "submission")

        config.OUTPUT_EXP = os.path.join(config.OUTPUT, config.EXP) 
        config.EXP_MODEL = os.path.join(config.OUTPUT_EXP, "model")
        config.EXP_FIG = os.path.join(config.OUTPUT_EXP, "fig")
        config.EXP_PREDS = os.path.join(config.OUTPUT_EXP, "preds")

        # make dirs
        for d in [
                config.HOME,
                config.INPUT, 
                config.SUBMISSION, 
                config.EXP_MODEL, 
                config.EXP_FIG, 
                config.EXP_PREDS
                ]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(config.INPUT, 'sample_submission.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $config.competition -p $config.INPUT
            filepath = os.path.join(config.INPUT, config.competition+'.zip')
            ! unzip -d $config.INPUT $filepath

    else:
        print("This environment is Kaggle Kernel")

        # set dirs
        config.INPUT = f"../input/{config.competition}"
        config.EXP = config.name
        config.OUTPUT_EXP = config.name
        config.SUBMISSION = "./"
        config.DATASET = "../input/"
        config.EXP_MODEL = os.path.join(config.EXP, "model")
        config.EXP_FIG = os.path.join(config.EXP, "fig")
        config.EXP_PREDS = os.path.join(config.EXP, "preds")

        # make dirs
        for d in [config.EXP_MODEL, config.EXP_FIG, config.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)

    config.logger = Logger(config.OUTPUT_EXP)
    seed_everything(config.seed)
    warnings.filterwarnings("ignore")
    return config


def seed_everything(seed=2022):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every random number generator used by the notebook.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        apply_seed(seed)

    # Turn cuDNN auto-tuning off and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def dataset_create_new(dataset_name, upload_dir):
    """Write ``dataset-metadata.json`` into ``upload_dir`` and upload it as a new Kaggle dataset.

    Args:
        dataset_name: used both as the dataset title and, prefixed with the
            ``KAGGLE_USERNAME`` environment variable, as the dataset id.
        upload_dir: folder whose content is uploaded.
    """
    from kaggle.api.kaggle_api_extended import KaggleApi

    owner = os.environ["KAGGLE_USERNAME"]
    metadata = {
        'id': f'{owner}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }

    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w') as fp:
        json.dump(metadata, fp, indent=4)

    client = KaggleApi()
    client.authenticate()
    client.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')


class Logger:
    """Timestamped logger that writes to the console and to ``<path>/Experiment.log``.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """
    def __init__(self, path):
        """Get (or reuse) the logger named ``path`` and attach handlers once.

        Args:
            path: existing directory that receives ``Experiment.log``; it is
                also used as the logger name.
        """
        self.general_logger = logging.getLogger(path)
        # Build handlers only when the logger has none yet. Constructing a
        # FileHandler opens the log file, so creating one on every
        # instantiation (and then discarding it) would leak a file handle.
        if not self.general_logger.handlers:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        # display time
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


class Util:
    """joblib-based helpers for saving and loading Python objects."""

    @classmethod
    def dump(cls, value, path):
        """Serialize ``value`` to ``path`` (compressed), creating parent directories.

        Args:
            value: object to serialize.
            path: destination file path.
        """
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the parent when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path):
        """Load and return the object stored at ``path``.

        NOTE(review): joblib.load unpickles the file; only load files you trust.
        """
        return joblib.load(path)

## CV & Metrics

In [None]:
def add_fold(config, train_df):
    """Add a stratified CV ``fold`` column to ``train_df``.

    Rows are split with ``StratifiedKFold`` on ``config.target_col``; each row
    receives the number of the fold in which it belongs to the validation part.

    Args:
        config: provides ``num_fold``, ``seed`` and ``target_col``.
        train_df: training frame; it is modified in place.

    Returns:
        The same ``train_df`` object with an integer ``fold`` column.
    """
    cv = StratifiedKFold(
        n_splits=config.num_fold,
        shuffle=True,
        random_state=config.seed
        )
    # cv.split yields positional indices, so collect the fold ids in a
    # positional array instead of writing through label-based ``.loc``,
    # which is only correct when the frame has a default RangeIndex.
    fold_ids = np.full(len(train_df), -1, dtype=int)
    for i_fold, (_, val_idx) in enumerate(cv.split(train_df, y=train_df[config.target_col])):
        fold_ids[val_idx] = i_fold
    train_df["fold"] = fold_ids
    return train_df

In [None]:
def get_score(config, y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    ``config`` is accepted for a uniform call signature and is not used here.
    """
    return roc_auc_score(y_true=y_true, y_score=y_pred)

## Preprocess

In [None]:
def get_raw_features(input_df):
    """Return the columns of ``input_df`` that are used as features unchanged."""
    passthrough_cols = "Age RoomService FoodCourt ShoppingMall Spa VRDeck".split()
    return input_df[passthrough_cols]


def preprocess(config, raw_train_df, raw_test_df):
    """Build the model-ready train and test frames.

    Train and test are stacked, every feature function is applied to the
    stacked frame, and the result is split back by row position.

    Args:
        config: provides ``target_col``; ``config.no_feature_cols`` is set
            here to the list of non-feature columns (target and ``fold``).
        raw_train_df: raw training frame; must contain the target column and
            a ``fold`` column. It is not modified.
        raw_test_df: raw test frame.

    Returns:
        ``(train_df, test_df)`` holding the non-feature columns followed by
        the generated features. ``test_df`` has a fresh 0-based index.
    """
    n_train = len(raw_train_df)

    # Cast the target on a copy so the caller's frame is left untouched.
    train_part = raw_train_df.copy()
    train_part[config.target_col] = train_part[config.target_col].astype(int)

    # concat train & test
    input_df = pd.concat([train_part, raw_test_df]).reset_index(drop=True)

    # columns that are carried along but never used as features
    config.no_feature_cols = [config.target_col, "fold"]

    # select fe funcs
    funcs = [
        get_raw_features
    ]

    # fe: collect every block first and concatenate once at the end
    blocks = [input_df[config.no_feature_cols]]
    for func in funcs:
        print(func.__name__)
        blocks.append(func(input_df))
    output_df = pd.concat(blocks, axis=1)

    # The first n_train rows of the stacked frame are the training rows.
    train_df = output_df.iloc[:n_train]
    test_df = output_df.iloc[n_train:].reset_index(drop=True)
    return train_df, test_df

## Train

In [None]:
def get_model(config):
    """Return the model class selected by ``config.model_name``.

    Raises:
        NotImplementedError: for any name other than ``"XGB"`` or ``"LGB"``.
    """
    name = config.model_name

    if name == "LGB":
        return LGBMModel

    if name == "XGB":
        return XGBModel

    raise NotImplementedError


def train_cv(config, train_df):
    """Train (or reload) one model per selected fold and collect OOF predictions.

    For every fold listed in ``config.train_fold`` the model is fitted on the
    other folds unless a saved model file exists and ``config.overwrite`` is
    false, in which case it is loaded instead. Models, per-fold predictions
    and the full out-of-fold array are saved under ``config.EXP_MODEL`` and
    ``config.EXP_PREDS``; scores are written to ``config.logger``.

    Args:
        config: experiment configuration.
        train_df: frame with feature columns, the target column and ``fold``.

    Returns:
        ``(oof, models)``: the out-of-fold prediction array (zeros for rows of
        folds that were skipped) and the models in the order they were handled.
    """
    # split X and y
    feature_cols = [col for col in train_df.columns if col not in config.no_feature_cols]
    X = train_df[feature_cols]
    y = train_df[config.target_col]

    models = []
    oof = np.zeros(len(train_df))

    selected_folds = (k for k in range(config.num_fold) if k in config.train_fold)
    for i_fold in selected_folds:
        prefix = f"fold{i_fold}"

        # rows of this fold validate, all remaining rows train
        is_val = (train_df["fold"] == i_fold).astype(bool)
        is_trn = ~is_val
        tr_x = X[is_trn].reset_index(drop=True)
        tr_y = y[is_trn].reset_index(drop=True)
        va_x = X[is_val].reset_index(drop=True)
        va_y = y[is_val].reset_index(drop=True)

        # work on a copy so the shared fit params never get an eval_set
        fit_params = config.fit_params.copy()
        if config.model_name in ["LGB", "XGB", "CAT"]:
            fit_params["eval_set"] = [(va_x, va_y)]

        # reuse a saved model unless overwriting is requested
        model_path = f"{config.EXP_MODEL}/{prefix}.pkl"
        if os.path.isfile(model_path) and not config.overwrite:
            model = Util.load(model_path)
        else:
            model_cls = get_model(config)
            model = model_cls(**config.model_params)
            model.fit(tr_x, tr_y, **fit_params)
            Util.dump(model, model_path)

        # predict the validation part
        preds = np.array(model.predict(va_x), dtype=np.float64)
        models.append(model)

        # fold score
        fold_score = get_score(config, va_y, preds)
        config.logger.info(f"seed{config.seed}_fold{i_fold}={fold_score:.5f}")
        oof[is_val] = preds

        # save fold predictions
        Util.dump(preds, f"{config.EXP_PREDS}/oof__{prefix}.pkl")

    # save the assembled out-of-fold predictions
    Util.dump(oof, f"{config.EXP_PREDS}/oof.pkl")

    # overall score
    score = get_score(config, y, oof)
    config.logger.info(f"target:{config.target_col}={score:.5f}")

    return oof, models

## Predict

In [None]:
def predict_cv(config, test_df, models=None):
    """Predict ``test_df`` with every selected fold model and average the results.

    Args:
        config: experiment configuration (``num_fold``, ``train_fold``,
            ``no_feature_cols``, ``EXP_MODEL``, ``EXP_PREDS``).
        test_df: frame holding the feature columns.
        models: optional list of fitted models. Either one model per fold in
            ``config.train_fold`` (what ``train_cv`` returns) or one model per
            fold number (length ``config.num_fold``). When ``None`` the
            models are loaded from ``config.EXP_MODEL``.

    Returns:
        The element-wise mean of the per-fold predictions. Per-fold and
        averaged predictions are also saved under ``config.EXP_PREDS``.
    """
    feature_cols = [x for x in test_df.columns if x not in config.no_feature_cols]
    X = test_df[feature_cols]

    selected_folds = [i for i in range(config.num_fold) if i in config.train_fold]
    # A list with one entry per fold number is indexed by fold number;
    # otherwise the list is taken to hold only the selected folds, in order,
    # so it has to be indexed by position. Indexing such a list by fold number
    # would pick the wrong model or raise IndexError when train_fold is a
    # subset of the folds.
    index_by_fold = models is not None and len(models) == config.num_fold

    fold_preds = []
    for position, i_fold in enumerate(selected_folds):
        prefix = f"fold{i_fold}"
        if models is None:
            filepath = f"{config.EXP_MODEL}/{prefix}.pkl"
            model = Util.load(filepath)
        else:
            model = models[i_fold if index_by_fold else position]

        preds = model.predict(X)
        fold_preds.append(preds)

        # save fold preds
        filepath = f"{config.EXP_PREDS}/preds__{prefix}.pkl"
        Util.dump(preds, filepath)

    predictions = np.mean(fold_preds, axis=0)
    filepath = f"{config.EXP_PREDS}/preds.pkl"
    Util.dump(predictions, filepath)

    return predictions

In [None]:
def visualize_importance(models, feat_train_df, no_feature_cols):
    """Plot the feature importance of a list of models as a boxen plot.

    The spread of each feature's importance across the CV models is shown;
    at most the 50 features with the largest summed importance are drawn.

    args:
        models:
            List of fitted models exposing ``feature_importances_``
        feat_train_df:
            DataFrame that was used for training
        no_feature_cols:
            Columns of ``feat_train_df`` that are not features

    returns:
        The matplotlib ``(fig, ax)`` pair.
    """
    feature_cols = [col for col in feat_train_df.columns if col not in no_feature_cols]

    # One long-format frame per model, numbered from 1.
    per_model_frames = [
        pd.DataFrame({
            "feature_importance": model.feature_importances_,
            "column": feature_cols,
            "fold": fold_no,
        })
        for fold_no, model in enumerate(models, start=1)
    ]
    # Start from an empty frame so an empty model list still yields a DataFrame.
    feature_importance_df = pd.concat(
        [pd.DataFrame(), *per_model_frames], axis=0, ignore_index=True)

    # Rank features by their importance summed over all models.
    summed = feature_importance_df.groupby("column").sum()[["feature_importance"]]
    ranked = summed.sort_values("feature_importance", ascending=False)
    order = ranked.index[:50]

    # Let the figure grow with the number of plotted features.
    fig_height = max(6, len(order) * .25)
    fig, ax = plt.subplots(figsize=(12, fig_height))
    sns.boxenplot(
        data=feature_importance_df,
        x="feature_importance",
        y="column",
        order=order,
        ax=ax,
        palette="viridis",
        orient="h",
    )
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax

## Main

In [None]:
# setup: resolve paths, logger and seeds (returns the same Config class)
Config = setup(Config)

# get data
raw_train_df = pd.read_csv(f"{Config.INPUT}/train.csv")
raw_test_df = pd.read_csv(f"{Config.INPUT}/test.csv")
submission_df = pd.read_csv(f"{Config.INPUT}/sample_submission.csv")

# debug mode works on a 1000-row sample of the training data
if Config.debug:
    raw_train_df = raw_train_df.sample(1000).reset_index(drop=True)

# cv split (adds the "fold" column to the raw frame) & preprocess
raw_train_df = add_fold(Config, raw_train_df)
train_df, test_df = preprocess(Config, raw_train_df, raw_test_df)

# train
oof, models = train_cv(Config, train_df)

# importance
fig, ax = visualize_importance(
    models,
    train_df,
    no_feature_cols=Config.no_feature_cols,
)
fig.savefig(f"{Config.EXP_FIG}/tree_importance.png", dpi=300)

# predict
preds = predict_cv(Config, test_df, models=models)

# make submission: threshold the averaged predictions at 0.5 into 0/1 labels
submission_df[Config.target_col] = (preds >= 0.5).astype(int)
display(submission_df)  # display() is expected to come from the notebook environment
submission_path = f"{Config.SUBMISSION}/{Config.name}.csv"
submission_df.to_csv(submission_path, index=False)

# upload to kaggle dataset from colab
if Config.upload_from_colab:
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)