In [1]:
# Enable IPython's autoreload extension in mode 2 so edited project modules are
# re-imported before each cell runs (no kernel restart needed while developing mylib).
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from dotenv import load_dotenv
import lightgbm as lgb
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from mylib.db.core import DataBase

# Load variables from a local .env file into the process environment
# (SQLITE_DB_DIR is read from os.environ when the database is opened).
load_dotenv()
# Apply seaborn's default plot theme. `set_theme` is the preferred entry point;
# `sns.set` is only kept as an alias for it.
sns.set_theme()
# Column(s) identifying a passenger; used as the index of the loaded tables.
ID_COLS = ["passenger_id"]

## Load data

In [3]:
# Open the project database wrapper on the directory named by the
# SQLITE_DB_DIR environment variable.
db = DataBase(os.environ["SQLITE_DB_DIR"])
# Display the path of the main database file as a quick sanity check.
db.main_database

PosixPath('/workspaces/spaceship-titanic/data/db/main.db')

In [4]:
# Read the raw training table and its target table over one connection and
# index both by ID_COLS. The generator is consumed by the tuple unpacking, so
# both queries run while the connection is still open.
with db.connect() as connection:
    df_train, df_target = (
        pd.read_sql(
            sqlalchemy.select(db.metadata.tables[table_name]),
            connection,
        ).set_index(ID_COLS)
        for table_name in ("train.original", "train.target")
    )

## Model

In [5]:
# Raw columns fed to the model, in the order they appear in the design matrix.
features = [
    "home_planet",
    "cryo_sleep",
    "destination",
    "age",
    "vip",
    "room_service",
    "food_court",
    "shopping_mall",
    "spa",
    "vr_deck",
]
# Name of the label column in the target table.
target = "transported"
# Design matrix and label vector; both source frames are indexed by ID_COLS.
X = df_train[features]
y = df_target[target]

In [6]:
# Inspect dtypes and non-null counts of the selected features.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   home_planet    8492 non-null   object 
 1   cryo_sleep     8476 non-null   object 
 2   destination    8511 non-null   object 
 3   age            8514 non-null   float64
 4   vip            8490 non-null   object 
 5   room_service   8512 non-null   float64
 6   food_court     8510 non-null   float64
 7   shopping_mall  8485 non-null   float64
 8   spa            8510 non-null   float64
 9   vr_deck        8505 non-null   float64
dtypes: float64(6), object(4)
memory usage: 747.1+ KB


In [7]:
def downcast_dtype(df: pd.DataFrame) -> pd.DataFrame:
    """Return a converted copy of ``df`` with compact dtypes.

    Object columns become ``category`` and numeric columns become ``float32``.
    Columns of any other dtype are passed through unchanged, column order is
    preserved, and the input frame is not modified.
    """
    # Step 1: object -> category (converted column by column).
    as_category = df.select_dtypes("O").astype("category")
    with_categories = df.assign(**as_category)
    # Step 2: numeric -> float32, selected after step 1 as in a two-pass conversion.
    as_float32 = with_categories.select_dtypes("number").astype("float32")
    return with_categories.assign(**as_float32)

In [8]:
# Wrap the dtype conversion as a scikit-learn transformer so it can be used as a
# pipeline step; "one-to-one" declares that the output feature names are the
# same as the input feature names.
preprocess = FunctionTransformer(
    func=downcast_dtype,
    feature_names_out="one-to-one",
)

In [12]:
from mylib.train.gbdt.lightgbm import (
    LGBMTrainer,
    DEFAULT_PARAMS,
    empty_array_defaultdict,
)
from mylib.data import make_cv_split_train_val_test
from sklearn.base import check_is_fitted
from sklearn import metrics
import time

In [23]:
# Metric names resolved to scikit-learn scorer objects, keyed by metric name.
scorings = ["accuracy", "recall", "precision", "f1"]
scorers = {scoring: metrics.get_scorer(scoring) for scoring in scorings}

# NOTE(review): LGBMTrainer, DEFAULT_PARAMS and make_cv_split_train_val_test are
# project helpers whose definitions are not visible in this notebook. The manual
# fold loop unpacks each split of `cv` into (train, valid, test) indices;
# presumably the argument 5 is the number of folds — confirm.
trainer = LGBMTrainer(
    model=lgb.LGBMClassifier(),
    params=DEFAULT_PARAMS,
    preprocesser=preprocess,
    cv=make_cv_split_train_val_test(5),
    scorers=scorers,
)

In [11]:
from itertools import islice
from collections.abc import Sequence
from sklearn.utils import _safe_indexing
from collections import defaultdict
from mylib.log import Timer

# Shared timer; the manual CV loop uses `timer.measure(...)` as a context manager
# and reads `timer.duration` afterwards.
timer = Timer()


def get_fold_indices(
    X: pd.DataFrame, y: pd.Series, cv, fold_num: int
) -> Sequence[np.ndarray]:
    """Return the index arrays of a single fold produced by ``cv.split(X, y)``.

    Parameters
    ----------
    X, y
        Data passed straight through to ``cv.split``.
    cv
        Any object exposing ``split(X, y)`` that yields one item per fold.
    fold_num
        Zero-based position of the wanted fold.

    Returns
    -------
    The ``fold_num``-th item yielded by ``cv.split(X, y)``, unchanged.

    Raises
    ------
    IndexError
        If the splitter yields ``fold_num`` or fewer folds.
    ValueError
        If ``fold_num`` is negative (raised by ``itertools.islice``).
    """
    # A private sentinel distinguishes "splitter exhausted" from any real fold.
    # Without a default, `next` would leak a bare StopIteration, which turns
    # into a confusing RuntimeError when this is called inside a generator.
    exhausted = object()
    fold = next(islice(cv.split(X, y), fold_num, fold_num + 1), exhausted)
    if fold is exhausted:
        raise IndexError(
            f"fold_num={fold_num} is out of range for the folds produced by cv.split"
        )
    return fold


def split_X_y_by_indices(
    X: pd.DataFrame,
    y: pd.Series,
    *indices,
):
    """Select matching subsets of ``X`` and ``y`` for each given indexer.

    Each entry of ``indices`` is applied to both ``X`` and ``y`` through
    scikit-learn's ``_safe_indexing``. The result is a list of ``(X_subset,
    y_subset)`` tuples in the same order as ``indices``.
    """
    subsets = []
    for ind in indices:
        X_subset = _safe_indexing(X, ind)
        y_subset = _safe_indexing(y, ind)
        subsets.append((X_subset, y_subset))
    return subsets

In [20]:
# Manual cross-validation loop: train one model per fold, score it on that
# fold's train and test partitions, and collect out-of-fold predictions.
n_splits = trainer.cv.get_n_splits()
# Out-of-fold predictions, NaN until a fold fills them in. The dtype is forced
# to float because `np.full_like(y, np.nan)` inherits y's dtype, and a bool or
# integer label dtype cannot represent NaN (the sentinel would be cast away).
pred = np.full(len(y), np.nan, dtype=float)
cv_results = []
multi_metric_scorer = trainer.init_multi_metric_scorer()

# Iterate one `split` call (bounded by n_splits) so all folds come from the same
# partitioning. Re-splitting for every fold repeats the splitter's work and, for
# an unseeded shuffling splitter, could yield overlapping test folds.
folds = islice(trainer.cv.split(X, y), n_splits)
for i, (train_idx, valid_idx, test_idx) in enumerate(folds):
    fold_results = {}

    train_set, valid_set, test_set = split_X_y_by_indices(
        X, y, train_idx, valid_idx, test_idx
    )

    with timer.measure(f"training fold {i}"):
        model = trainer.train_fold(
            train_set=train_set,
            valid_set=valid_set,
            stopping_rounds=5,
        )
    # Read the duration after the context manager has exited.
    fold_results["training_time"] = timer.duration

    with timer.measure(f"scoring fold {i}"):
        fold_results["test"] = multi_metric_scorer(model, *test_set)
        fold_results["train"] = multi_metric_scorer(model, *train_set)
    fold_results["scoring_time"] = timer.duration

    cv_results.append(fold_results)

    # Store this fold's predictions at the positions of its test rows.
    y_pred = model.predict(test_set[0])
    pred[test_idx] = y_pred

[training fold 0] start.
[LightGBM] [Info] Number of positive: 2740, number of negative: 2475
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 5215, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.525407 -> initscore=0.101718
[LightGBM] [Info] Start training from score 0.101718
Training until validation scores don't improve for 5 rounds
[5]	train's binary_logloss: 0.578805	valid's binary_logloss: 0.589848
[10]	train's binary_logloss: 0.518358	valid's binary_logloss: 0.53522
[15]	train's binary_logloss: 0.486228	valid's binary_logloss: 0.506605
[20]	train's binary_logloss: 0.465788	valid's binary_logloss: 0.489246
[25]	train's binary_logloss: 0.45133	valid's binary_logloss: 0.476972
[30]	train's binary_logloss: 0.440699	valid's binary_logloss: 0.465389
[35]	train's binary_logloss: 0.43333	valid's binary_logloss: 0.460044
[40]	train's binary_logloss: 0.427009	valid's binary_logloss: 0.455854
[45]	train's binary_logl

In [24]:
# Run the trainer's own fit on the full data — presumably the packaged
# equivalent of the manual fold loop (confirm in mylib.train.gbdt.lightgbm).
trainer.fit(X, y)

[LightGBM] [Info] Number of positive: 2740, number of negative: 2475
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 5215, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.525407 -> initscore=0.101718
[LightGBM] [Info] Start training from score 0.101718
Training until validation scores don't improve for 5 rounds
[5]	train's binary_logloss: 0.578805	valid's binary_logloss: 0.589848
[10]	train's binary_logloss: 0.518358	valid's binary_logloss: 0.53522
[15]	train's binary_logloss: 0.486228	valid's binary_logloss: 0.506605
[20]	train's binary_logloss: 0.465788	valid's binary_logloss: 0.489246
[25]	train's binary_logloss: 0.45133	valid's binary_logloss: 0.476972
[30]	train's binary_logloss: 0.440699	valid's binary_logloss: 0.465389
[35]	train's binary_logloss: 0.43333	valid's binary_logloss: 0.460044
[40]	train's binary_logloss: 0.427009	valid's binary_logloss: 0.455854
[45]	train's binary_logloss: 0.422302	valid's bin

In [15]:
# Table of timings and train/test metrics collected during fitting
# (the rows presumably correspond to CV folds).
trainer.get_cv_results()

Unnamed: 0,training_time,scoring_time,test_accuracy,test_recall,test_precision,test_f1,train_accuracy,train_recall,train_precision,train_f1
0,0.101966,0.047411,0.778033,0.864932,0.715897,0.783389,0.816874,0.859124,0.805337,0.831361
1,0.08424,0.056877,0.790109,0.78564,0.826039,0.805333,0.808629,0.84751,0.786629,0.815935
2,0.078943,0.0425,0.795167,0.778361,0.836343,0.806311,0.804256,0.83286,0.771225,0.800858
3,0.077981,0.04154,0.805524,0.860943,0.761497,0.808173,0.801993,0.839169,0.780043,0.808526
4,0.084255,0.050894,0.786659,0.870036,0.733266,0.795817,0.809241,0.857353,0.793467,0.824174


In [25]:
# List of fitted Pipeline objects (preprocessor + LGBMClassifier),
# presumably one per CV fold.
trainer.fitted_models_

[Pipeline(steps=[('preprocessor',
                  FunctionTransformer(feature_names_out='one-to-one',
                                      func=<function downcast_dtype at 0x7f57f42f5430>)),
                 ('model',
                  LGBMClassifier(colsample_bytree=0.6, force_row_wise=True,
                                 num_leaves=7, random_state=0, subsample=0.6,
                                 subsample_freq=1))]),
 Pipeline(steps=[('preprocessor',
                  FunctionTransformer(feature_names_out='one-to-one',
                                      func=<function downcast_dtype at 0x7f57f42f5430>)),
                 ('model',
                  LGBMClassifier(colsample_bytree=0.6, force_row_wise=True,
                                 num_leaves=7, random_state=0, subsample=0.6,
                                 subsample_freq=1))]),
 Pipeline(steps=[('preprocessor',
                  FunctionTransformer(feature_names_out='one-to-one',
                                    