## Import Libraries

In [None]:
# Detect the runtime environment so that later cells can branch on it.
# A platform counts as active when its marker module is already loaded.
import sys
import os
IN_COLAB = 'google.colab' in sys.modules
IN_KAGGLE = 'kaggle_web_client' in sys.modules
# Neither hosted platform detected -> treat the run as local.
LOCAL = not IN_KAGGLE and not IN_COLAB
print(f'IN_COLAB:{IN_COLAB}, IN_KAGGLE:{IN_KAGGLE}, LOCAL:{LOCAL}')

In [None]:
# Hide warnings: silence deprecation, future and user warnings globally.
# Note: ignoring UserWarning wholesale also hides warnings that may flag
# real usage problems in later cells.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Python Libraries
import os
import math
import random
import glob
import pickle
import gc
from pathlib import Path

# Third party
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn import preprocessing
from sklearn.model_selection import GroupKFold,StratifiedKFold
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error,roc_auc_score,accuracy_score

import category_encoders as ce
import optuna

# GBDT models
import xgboost as xgb
import lightgbm as lgb
import catboost
from catboost import CatBoost
from catboost import Pool

# Log the versions of the modelling libraries used by this run.
print(f'xgb:{xgb.__version__}')
print(f'lgb:{lgb.__version__}')
# Label fixed: this line prints the catboost version (it was tagged 'lgb:').
print(f'catboost:{catboost.__version__}')
print(f'optuna:{optuna.__version__}')

def set_seed(seed: int=29):
    """Seed the random number generators used by the notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    print(f'set_seed{seed}')
    # NOTE(review): hash randomization is configured at interpreter start-up,
    # so this presumably only affects subprocesses — confirm if it matters.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


# Apply the default seed as soon as the helper exists.
set_seed()

In [None]:
class CFG:
    """Notebook-wide configuration shared by all cells."""
    competition = 'spaceship_titanic'
    # Experiment identifier; interpolated into OUTPUT_DIR on the Colab and
    # local branches. It was referenced there but never defined, which raised
    # AttributeError outside Kaggle.
    exp_name = 'exp001'
    seed = 29
    target_col = 'Transported'
    # Number of CV folds, and which of them are actually trained.
    n_fold = 5
    trn_fold = [0,1,2,3,4]

    # Column groups; filled in by a later cell.
    cate_cols = []
    cont_cols = []
    feature_cols = []

In [None]:
# Resolve input/output locations for the current runtime, then load the data.
# The experiment name is read defensively: if CFG has no `exp_name` attribute
# the Colab and local branches would otherwise raise AttributeError.
EXP_NAME = getattr(CFG, 'exp_name', 'exp001')
if IN_KAGGLE:
    INPUT_DIR = Path('../input/spaceship-titanic')
    OUTPUT_DIR = './'
elif IN_COLAB:
    INPUT_DIR = Path('/content/input/')
    OUTPUT_DIR = f'/content/drive/MyDrive/kaggle/spaceship-titanic/{EXP_NAME}/'
else:
    # LOCAL run (neither Kaggle nor Colab); `else` keeps the three branches
    # mutually exclusive and guarantees both paths are always defined.
    INPUT_DIR = Path("F:/Kaggle/spaceship-titanic/data/input/")
    # NOTE(review): 'pspaceship-titanic' differs from the input path's
    # 'spaceship-titanic' — confirm this is the intended directory.
    OUTPUT_DIR = f'F:/Kaggle/pspaceship-titanic/data/output/{EXP_NAME}/'

df_train = pd.read_csv(INPUT_DIR / "train.csv")
df_test = pd.read_csv(INPUT_DIR / "test.csv")
df_sub = pd.read_csv(INPUT_DIR / "sample_submission.csv")
# Copy of the raw training frame; later cells attach out-of-fold predictions to copies of it.
df_oof = df_train.copy()
display(df_train.head())
display(df_test.head())
display(df_sub.head())

In [None]:
# Count the distinct values in each column of the training frame.
df_train.nunique()

In [None]:
# Register the raw feature groups on the shared config:
# categorical columns first, then continuous (numeric) columns.
CFG.cate_cols = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]
CFG.cont_cols = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
# CFG.feature_cols =  CFG.cont_cols  + CFG.cate_cols

## Feature Engineering

In [None]:
# Caategory Encodeing
_OE_COLS = ['HomePlanet', 'CryoSleep', 'Destination','VIP']
OE_COLS = ['OE_' + col for col in _OE_COLS]
ce_oe = ce.OrdinalEncoder(cols=_OE_COLS,handle_missing='return_nan')
df_train[OE_COLS] = ce_oe.fit_transform(df_train[_OE_COLS])
df_test[OE_COLS] = ce_oe.transform(df_test[_OE_COLS])
# CFG.feature_cols += OE_COLS 
print(OE_COLS)

# One-Hot-Encoding
_OHE_COLS = ['HomePlanet', 'CryoSleep', 'Destination','VIP']
ce_ohe = ce.OneHotEncoder(cols=_OHE_COLS, handle_unknown='impute')
_df_ohe = ce_ohe.fit_transform(df_train[_OHE_COLS])
OHE_COLS = _df_ohe.columns.to_list()
df_train[OHE_COLS] = _df_ohe[OHE_COLS]
df_test[OHE_COLS] = ce_ohe.transform(df_test[_OHE_COLS])
# CFG.feature_cols += OHE_COLS 
print(OHE_COLS)

# https://www.kaggle.com/edwintyh/pycaret-spaceship-fe-catboost

## CV Split

In [None]:
# Show the class balance of the target before splitting.
print(df_train.Transported.value_counts())

# Every row starts unassigned (-1) and receives the number of the fold in
# which it serves as validation data.
df_train["fold"] = -1

# StratifiedKFold
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
fold_splits = Fold.split(df_train, df_train[CFG.target_col])
for n, (train_index, val_index) in enumerate(fold_splits):
    # val_index holds row positions; .loc is label-based, so this assumes
    # df_train still carries the default integer index from read_csv.
    df_train.loc[val_index, 'fold'] = n

df_train['fold'] = df_train['fold'].astype(int)
# Mirror the assignment onto the out-of-fold frame.
df_oof['fold'] = df_train['fold']
print(df_train.groupby(['fold', CFG.target_col]).size())

## Function

In [None]:
def fit_xgb(cfg, X_train, y_train, X_valid, y_valid,  params: dict=None, verbose_eval=100):
    """Train one XGBoost booster on a fold and score it on the validation split.

    Args:
        cfg: Configuration object. Accepted for signature parity with the
            other ``fit_*`` helpers; not read here.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels, used both for early
            stopping and for the returned predictions.
        params: Booster parameters forwarded to ``xgb.train``.
        verbose_eval: Evaluation logging period forwarded to ``xgb.train``.

    Returns:
        Tuple ``(oof_pred, model, score)``: the validation predictions, the
        trained booster, and the ROC AUC of the predictions against ``y_valid``.
    """
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid)
    evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

    model = xgb.train(params,
                      xgb_train,
                      evals=evals,
                      verbose_eval=verbose_eval,
                      num_boost_round=10000,
                      early_stopping_rounds=100,
                     )

    # xgb.train returns the booster from the LAST round, i.e. up to
    # `early_stopping_rounds` rounds past the best one. Restrict prediction to
    # the best iteration so the out-of-fold score reflects the selected model.
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is not None:
        oof_pred = model.predict(xgb_valid, iteration_range=(0, best_iteration + 1))
    else:
        oof_pred = model.predict(xgb_valid)
    score = roc_auc_score(y_valid, oof_pred)
    return oof_pred, model, score

def fit_lgbm(cfg, X_train, y_train, X_valid, y_valid,  params: dict=None, verbose_eval=100):
    """Train one LightGBM booster on a fold and score it on the validation split.

    Args:
        cfg: Configuration object; not read here (the categorical-feature
            hook that would use it is left disabled).
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.
        params: Parameters forwarded to ``lgb.train``.
        verbose_eval: Logging period forwarded to ``lgb.train``.

    Returns:
        Tuple ``(oof_pred, model, score)``: validation predictions, the trained
        booster, and the ROC AUC of the predictions against ``y_valid``.
    """
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_valid, y_valid)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments
    # were removed from lgb.train in LightGBM 4.0 — confirm the installed version.
    train_kwargs = dict(
        # categorical_feature=cfg.cate_cols,  (disabled)
        valid_sets=[train_set, valid_set],
        num_boost_round=10000,
        early_stopping_rounds=100,
        verbose_eval=verbose_eval,
    )
    booster = lgb.train(params, train_set, **train_kwargs)

    valid_pred = booster.predict(X_valid)
    return valid_pred, booster, roc_auc_score(y_valid, valid_pred)

def fit_cat(cfg, X_train, y_train, X_valid, y_valid,  params: dict=None, verbose_eval=100):
    """Train one CatBoost model on a fold and score it on the validation split.

    Args:
        cfg: Configuration object; ``cfg.cate_cols`` names the categorical
            features passed to both pools.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.
        params: Parameters passed to the ``CatBoost`` constructor.
        verbose_eval: Accepted for signature parity with the other ``fit_*``
            helpers; not read here.

    Returns:
        Tuple ``(oof_pred, model, score)``: validation probabilities, the
        fitted model, and the ROC AUC of the probabilities against ``y_valid``.
    """
    categorical = cfg.cate_cols
    train_pool = Pool(X_train, y_train, cat_features=categorical)
    valid_pool = Pool(X_valid, y_valid, cat_features=categorical)

    booster = CatBoost(params)
    booster.fit(train_pool, eval_set=[valid_pool], use_best_model=True)

    # Keep the second column (index 1) of the 'Probability' output.
    valid_pred = booster.predict(valid_pool, prediction_type='Probability')[:, 1]
    return valid_pred, booster, roc_auc_score(y_valid, valid_pred)

## Training

In [None]:
# Hyper-parameters for the three GBDT libraries.

# XGBoost: binary logistic objective, evaluated with AUC.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'max_leaves': 63,
    'alpha': 0.01,  # L1 regularization
    'lambda': 7.1,  # L2 regularization
    'colsample_bytree': 0.73,
    'min_child_weight': 4,
    'gamma': 2.12,
    'learning_rate': 0.01,
    'seed': 29,
    'verbosity': 0,
}

# LightGBM: binary objective, evaluated with the classification error rate.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'max_depth': 5,
    'num_leaves': 40,
    'min_child_samples': 37,
    'lambda_l1': 0.18,
    'lambda_l2': 0.08,
    # Perform bagging once every k iterations.
    # NOTE(review): no bagging_fraction is set here — confirm bagging is
    # actually active with the library default.
    'bagging_freq': 1,
    'feature_fraction': 0.7,
    'learning_rate': 0.01,
    'seed': 29,
    'verbosity': -1,
    'n_jobs': -1,
}

# CatBoost: Logloss is used both as the loss and as the evaluation metric.
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'max_depth': 7,
    'min_data_in_leaf': 56,
    'l2_leaf_reg': 109.68,
    'colsample_bylevel': 0.93,
    'subsample': 0.98,
    'max_bin': 464,
    'od_type': 'Iter',
    'num_boost_round': 10000,
    'learning_rate': 0.03,
    'random_seed': 29,
    'verbose': 100,
}

In [None]:
%%time
# ==============================
# XGBoost
# ==============================
# Features: continuous columns plus the one-hot encoded categoricals.
XGB_FEAT = CFG.cont_cols + OHE_COLS 
xgb_oof = df_oof.copy()
models = []
for fold in range(CFG.n_fold):
    if fold not in CFG.trn_fold:
        continue
    print(f"{'='*38} Fold: {fold} {'='*38}")
    
    # training: rows outside the fold train the model, rows inside validate it
    oof_pred, model, score = fit_xgb(CFG,
                                     df_train.loc[df_train['fold'] != fold, XGB_FEAT],
                                     df_train.loc[df_train['fold'] != fold, CFG.target_col],
                                     df_train.loc[df_train['fold'] == fold, XGB_FEAT],
                                     df_train.loc[df_train['fold'] == fold, CFG.target_col],
                                     xgb_params)
    
    # oof: store the validation predictions on the rows of this fold
    xgb_oof.loc[df_train['fold'] == fold,['pred']] = oof_pred
    models.append(model)
    
print(f'roc_auc:{roc_auc_score(xgb_oof.Transported, xgb_oof.pred)}')
xgb_oof['pred_bool'] = xgb_oof.pred > 0.5
print(f'accuracy:{accuracy_score(xgb_oof.Transported, xgb_oof.pred_bool)}')

# Test-time prediction: average the fold models.
# The native Booster keeps every round trained before early stopping fired,
# so prediction is limited to the best iteration — the same policy the
# LightGBM cell applies via `num_iteration=model.best_iteration`.
xgb_test = xgb.DMatrix(df_test[XGB_FEAT])  # built once, shared by all models
y_preds =[]
for model in models:
    best_iteration = getattr(model, 'best_iteration', None)
    predict_kwargs = {} if best_iteration is None else {'iteration_range': (0, best_iteration + 1)}
    y_pred = model.predict(xgb_test, **predict_kwargs)
    y_preds.append(y_pred)
xgb_pred = np.mean(y_preds,axis=0)

In [None]:
%%time
# ==============================
# LightGBM
# ==============================
# Features: continuous columns plus the ordinal-encoded categoricals.
LGB_FEAT = CFG.cont_cols + OE_COLS 
lgb_oof = df_oof.copy()
models = []
for fold in range(CFG.n_fold):
    if fold not in CFG.trn_fold:
        continue
    print(f"{'='*38} Fold: {fold} {'='*38}")

    # Row masks for this fold: validation rows vs. everything else.
    is_valid = df_train['fold'] == fold
    is_train = df_train['fold'] != fold

    # training
    oof_pred, model, score = fit_lgbm(
        CFG,
        df_train.loc[is_train, LGB_FEAT],
        df_train.loc[is_train, CFG.target_col],
        df_train.loc[is_valid, LGB_FEAT],
        df_train.loc[is_valid, CFG.target_col],
        lgb_params,
    )

    # oof
    lgb_oof.loc[is_valid, ['pred']] = oof_pred
    models.append(model)

print(f'roc_auc:{roc_auc_score(lgb_oof.Transported, lgb_oof.pred)}')
lgb_oof['pred_bool'] = lgb_oof.pred > 0.5
print(f'accuracy:{accuracy_score(lgb_oof.Transported, lgb_oof.pred_bool)}')

# Test-time prediction: average the fold models, each at its best iteration.
y_preds = [
    model.predict(df_test[LGB_FEAT], num_iteration=model.best_iteration)
    for model in models
]
lgb_pred = np.mean(y_preds, axis=0)

In [None]:
%%time
# ==============================
# CatBoost
# ==============================
# The raw categorical columns are cast to str in place on both frames
# (with astype(str), missing values become ordinary string values).
for frame in (df_train, df_test):
    frame[CFG.cate_cols] = frame[CFG.cate_cols].astype(str)

# Features: continuous columns plus the raw (string) categoricals.
CAT_FEAT = CFG.cont_cols  + CFG.cate_cols
cat_oof = df_oof.copy()
models = []
for fold in range(CFG.n_fold):
    if fold not in CFG.trn_fold:
        continue
    print(f"{'='*38} Fold: {fold} {'='*38}")

    # Row masks for this fold: validation rows vs. everything else.
    is_valid = df_train['fold'] == fold
    is_train = df_train['fold'] != fold

    # training
    oof_pred, model, score = fit_cat(
        CFG,
        df_train.loc[is_train, CAT_FEAT],
        df_train.loc[is_train, CFG.target_col],
        df_train.loc[is_valid, CAT_FEAT],
        df_train.loc[is_valid, CFG.target_col],
        cat_params,
    )

    # oof
    cat_oof.loc[is_valid, ['pred']] = oof_pred
    models.append(model)

print(f'roc_auc:{roc_auc_score(cat_oof.Transported, cat_oof.pred)}')
cat_oof['pred_bool'] = cat_oof.pred > 0.5
print(f'accuracy:{accuracy_score(cat_oof.Transported, cat_oof.pred_bool)}')

# Test-time prediction: average column 1 of each fold model's probabilities.
y_preds = [
    model.predict(Pool(df_test[CAT_FEAT],cat_features=CFG.cate_cols),prediction_type='Probability')[:,1]
    for model in models
]
cat_pred = np.mean(y_preds, axis=0)

## Ensemble CV sub

In [None]:
# Blend the three models' out-of-fold predictions with equal weights.
# Item assignment is required here: df_oof has no 'pred' column yet, and
# `df_oof.pred = ...` would only set a plain Python attribute on the frame
# instead of creating the column.
df_oof['pred'] = (xgb_oof.pred + lgb_oof.pred + cat_oof.pred)/3
print(f'roc_auc:{roc_auc_score(df_oof.Transported, df_oof.pred)}')
# Threshold the blended score at 0.5 to obtain class predictions.
df_oof['pred_bool'] = df_oof.pred > 0.5
print(f'accuracy:{accuracy_score(df_oof.Transported, df_oof.pred_bool)}')

In [None]:
# Average the three models' test predictions with equal weights, threshold
# at 0.5, and write the submission file to the working directory.
blend_pred = np.mean([xgb_pred, lgb_pred, cat_pred], axis=0)
df_sub['Transported'] = blend_pred > 0.5
df_sub.to_csv('submission.csv', index=False)

In [None]:
# Show the submission frame as the cell output.
df_sub