In [None]:
import gc
import importlib
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from atmacup_18 import constants

import utils

# Re-execute the utils module so that edits made to it after the first import
# are picked up without restarting the kernel.
importlib.reload(utils)

In [None]:
# Fixed seed for the whole notebook, handed to utils.seed_everything.
# NOTE(review): presumably this seeds every RNG used during training — confirm in utils.
RANDOM_STATE = 2024
utils.seed_everything(RANDOM_STATE)

## データ読み込み

In [None]:
# All dataset paths are derived from the current working directory.
# Assumes the kernel was started in the notebook's own directory — TODO confirm
# when running from elsewhere.
notebook_dir = Path().resolve()
# parents[3] is the fourth ancestor of notebook_dir; the "data" folder is expected there.
DATA_DIR = notebook_dir.parents[3].joinpath("data")
DATASET_DIR = DATA_DIR.joinpath("atmaCup#18_dataset")
TR_FEATURES_CSV = DATASET_DIR.joinpath("train_features.csv")
TS_FEATURES_CSV = DATASET_DIR.joinpath("test_features.csv")
IMAGES_DIR = DATASET_DIR.joinpath("images")
TRAFFIC_LIGHTS_CSV = DATASET_DIR.joinpath("traffic_lights.csv")

# Per-sample image file names, as passed to utils.load_images.
# The suffixes suggest the frames at t, t-0.5 and t-1.0 — TODO confirm units.
IMAGE_NAMES = ["image_t.png", "image_t-0.5.png", "image_t-1.0.png"]
TRAFFIC_LIGHTS_BBOX_IMAGE_NAME = constants.TRAFFIC_LIGHT_BBOX_IMAGE_NAME
DEPTH_IMAGE_FILE_PREFIX = constants.DEPTH_IMAGE_FILE_PREFIX
# One depth .npy file name per entry of IMAGE_NAMES: "<prefix><image name>.npy".
DEPTH_IMAGE_NAMES = [
    f"{DEPTH_IMAGE_FILE_PREFIX}{image_name}.npy" for image_name in IMAGE_NAMES
]

# Directory that holds the base model's predictions (relative to the working directory).
# Set it to None to skip the residual-learning steps: every later use of the base
# predictions in this notebook is guarded by `if BASE_PRED_DIR is not None`.
BASE_PRED_DIR = Path("..", "..", "..", "main2", "v00", "v00_05_00")
# Derive the CSV paths only when a base model is configured, so that
# BASE_PRED_DIR = None does not fail here with an AttributeError.
BASE_OOF_PRED_CSV = (
    BASE_PRED_DIR.joinpath("oof_preds.csv") if BASE_PRED_DIR is not None else None
)
BASE_SUBMISSION_CSV = (
    BASE_PRED_DIR.joinpath("submission.csv") if BASE_PRED_DIR is not None else None
)

In [None]:
# Target column names, ordered step by step: x_0, y_0, z_0, ..., x_5, y_5, z_5.
# Built with a nested comprehension instead of sum(list_of_lists, []), which
# re-copies the accumulated list on every addition.
TARGET_COLS = [f"{axis}_{i}" for i in range(6) for axis in ("x", "y", "z")]
# Column names used for the base model's predictions, aligned one-to-one with TARGET_COLS.
BASE_PRED_COLS = [f"base_pred_{col}" for col in TARGET_COLS]

In [None]:
# Load the training features through the project helper and preview the first two rows.
tr_df = utils.read_feature_csv(TR_FEATURES_CSV)
tr_df.head(2)

In [None]:
# Load the test features through the same helper and preview the first two rows.
ts_df = utils.read_feature_csv(TS_FEATURES_CSV)
ts_df.head(2)

In [None]:
def reduce_base_pred(
    df: pl.DataFrame, base_pred_df: pl.DataFrame, has_target: bool
) -> pl.DataFrame:
    """
    Attach the base predictions to ``df`` and optionally turn the targets into residuals.

    The ``TARGET_COLS`` columns of ``base_pred_df`` are renamed to ``BASE_PRED_COLS``
    and appended to ``df`` column-wise. Rows are matched purely by position, so
    both frames must have the same number of rows in the same order.

    Args:
        df (pl.DataFrame): Frame to extend. Must contain ``TARGET_COLS`` when
            ``has_target`` is True.
        base_pred_df (pl.DataFrame): Base predictions stored under the
            ``TARGET_COLS`` column names.
        has_target (bool): If True, every target column is overwritten with
            ``target - base prediction``.

    Returns:
        pl.DataFrame: ``df`` with the ``BASE_PRED_COLS`` columns appended and,
        when ``has_target`` is True, residual target columns.

    Raises:
        ValueError: If ``df`` and ``base_pred_df`` have a different number of rows.
    """
    # A horizontal concat aligns rows by position only; depending on the polars
    # version, a height mismatch is padded with nulls instead of raising, which
    # would silently corrupt the residuals.
    if df.height != base_pred_df.height:
        raise ValueError(
            "df and base_pred_df must have the same number of rows, "
            f"but got {df.height} and {base_pred_df.height}"
        )

    base_pred_part = base_pred_df.select(TARGET_COLS).rename(
        dict(zip(TARGET_COLS, BASE_PRED_COLS))
    )
    df = pl.concat([df, base_pred_part], how="horizontal")

    if has_target:
        df = df.with_columns(
            [
                (pl.col(tg_col) - pl.col(base_pred_col)).alias(tg_col)
                for tg_col, base_pred_col in zip(TARGET_COLS, BASE_PRED_COLS)
            ]
        )
    return df


def add_base_pred_to_target(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Return ``df`` with the base predictions added back onto ``target_cols``.

    ``target_cols`` is paired with ``BASE_PRED_COLS`` by position, so it must
    list its columns in the same order as ``TARGET_COLS``.

    Args:
        df (pl.DataFrame): Frame containing both ``target_cols`` and ``BASE_PRED_COLS``.
        target_cols (list[str]): Columns to overwrite with ``column + base prediction``.

    Returns:
        pl.DataFrame: ``df`` with the given columns overwritten.

    Raises:
        ValueError: If ``target_cols`` and ``BASE_PRED_COLS`` differ in length.
    """
    # zip() stops at the shorter input, so a mismatch would leave some columns
    # without their base prediction and raise no error.
    if len(target_cols) != len(BASE_PRED_COLS):
        raise ValueError(
            f"target_cols must have {len(BASE_PRED_COLS)} columns, "
            f"but got {len(target_cols)}"
        )

    return df.with_columns(
        [
            (pl.col(tg_col) + pl.col(bp_col)).alias(tg_col)
            for tg_col, bp_col in zip(target_cols, BASE_PRED_COLS)
        ]
    )


if BASE_PRED_DIR is not None:
    # Expected columns: "x_0", "y_0", "z_0", ..., "x_5", "y_5", "z_5"
    # Rows are matched to tr_df / ts_df by position — assumes both CSVs were saved
    # in the same row order as the feature CSVs (TODO confirm).
    base_oof_pred_df = pl.read_csv(BASE_OOF_PRED_CSV)
    base_submission_df = pl.read_csv(BASE_SUBMISSION_CSV)

    # Use (original target - base prediction) as the new target columns.
    # The test frame is passed with has_target=False, so it only receives the
    # base prediction columns.
    tr_df = reduce_base_pred(tr_df, base_oof_pred_df, has_target=True)
    ts_df = reduce_base_pred(ts_df, base_submission_df, has_target=False)

    # The raw prediction frames are no longer needed once merged.
    del base_oof_pred_df, base_submission_df
    gc.collect()

In [None]:
# Load the traffic-light bounding-box arrays for every ID, in the current row
# order of tr_df / ts_df, and print the resulting array shapes.
tr_tl_bbox_images = utils.load_npy_images(
    IMAGES_DIR,
    ids=tr_df.get_column("ID").to_list(),
    image_names=[TRAFFIC_LIGHTS_BBOX_IMAGE_NAME],
)
print(tr_tl_bbox_images.shape)
ts_tl_bbox_images = utils.load_npy_images(
    IMAGES_DIR,
    ids=ts_df.get_column("ID").to_list(),
    image_names=[TRAFFIC_LIGHTS_BBOX_IMAGE_NAME],
)
print(ts_tl_bbox_images.shape)

In [None]:
# Load the depth arrays (one file name per entry of DEPTH_IMAGE_NAMES) for every ID,
# in the current row order of tr_df / ts_df, and print the resulting array shapes.
tr_depth_images = utils.load_npy_images(
    IMAGES_DIR,
    ids=tr_df.get_column("ID").to_list(),
    image_names=DEPTH_IMAGE_NAMES,
)
print(tr_depth_images.shape)
ts_depth_images = utils.load_npy_images(
    IMAGES_DIR,
    ids=ts_df.get_column("ID").to_list(),
    image_names=DEPTH_IMAGE_NAMES,
)
print(ts_depth_images.shape)

In [None]:
# Load the camera images listed in IMAGE_NAMES for every ID, in the current row
# order of tr_df / ts_df, and print the resulting array shapes.
tr_images = utils.load_images(
    IMAGES_DIR, ids=tr_df.get_column("ID").to_list(), image_names=IMAGE_NAMES
)
print(tr_images.shape)
ts_images = utils.load_images(
    IMAGES_DIR, ids=ts_df.get_column("ID").to_list(), image_names=IMAGE_NAMES
)
print(ts_images.shape)

In [None]:
# Merge the camera, traffic-light bbox and depth arrays into the model's image input.
# NOTE(review): how the arrays are combined (presumably along the channel axis) is
# defined in utils.preprocess_images — confirm there.
# NOTE: tr_images / ts_images are overwritten, so this cell must not be re-run
# without re-running the loading cells first.
# The commented-out lists are alternative input combinations kept for experiments.
tr_images = utils.preprocess_images(
    # [tr_images, tr_tl_bbox_images, tr_optical_flow_images]
    [tr_images, tr_tl_bbox_images, tr_depth_images]
    # [tr_images]
)
ts_images = utils.preprocess_images(
    # [ts_images, ts_tl_bbox_images, ts_optical_flow_images]
    [ts_images, ts_tl_bbox_images, ts_depth_images]
    # [ts_images]
)

print(tr_images.shape)
print(ts_images.shape)

In [None]:
# del tr_tl_bbox_images
# gc.collect()
#
# del ts_tl_bbox_images
# gc.collect()
#
# del tr_optical_flow_images
# gc.collect()
#
# del ts_optical_flow_images
# gc.collect()

## scene_id, scene_dsec順に並び替える

In [None]:
# Order the rows by scene, then by scene_dsec within each scene.
tr_df = tr_df.sort(["scene_id", "scene_dsec"])
ts_df = ts_df.sort(["scene_id", "scene_dsec"])

# Apply the same permutation to the image arrays so they stay aligned with the rows.
# Assumes origin_idx holds each row's position in the originally loaded frame —
# TODO confirm in utils.read_feature_csv.
# NOTE: not idempotent — re-running this cell permutes the already reordered images again.
tr_images = tr_images[tr_df.get_column("origin_idx").to_numpy()]
ts_images = ts_images[ts_df.get_column("origin_idx").to_numpy()]

## Target

In [None]:
# Fit the target transformer on the training frame and append its output columns
# (prefixed with "tg_") to tr_df.
target = utils.CoordinateTarget(prefix="tg_")
target.fit(tr_df)

tg_df = target.transform(tr_df)
print(tg_df.columns)
# glimpse() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
tg_df.describe().glimpse()
tr_df = pl.concat([tr_df, tg_df], how="horizontal")

# The intermediate frame is no longer needed once concatenated.
del tg_df
gc.collect()

## 特徴量

In [None]:
# Fit the feature transformer on the training frame only, then apply it to both
# the training and the test frame; its output columns are prefixed with "ft_".
feature = utils.Feature(prefix="ft_")
feature.fit(tr_df)

ft_df = feature.transform(tr_df)
print(ft_df.columns)
# glimpse() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
ft_df.describe().glimpse()
tr_df = pl.concat([tr_df, ft_df], how="horizontal")

ft_df = feature.transform(ts_df)
print(ft_df.columns)
ft_df.describe().glimpse()
ts_df = pl.concat([ts_df, ft_df], how="horizontal")

# The intermediate frame is no longer needed once concatenated.
del ft_df
gc.collect()

## モデリング

In [None]:
# Number of splits handed to utils.train as n_splits.
N_SPLITS = 2

In [None]:
# Multiplier applied to the image-channel and feature counts below.
# NOTE(review): presumably the number of consecutive samples of a scene fed to
# the model at once — confirm in utils.
n_sample_in_scene = 3

# Model construction parameters consumed by utils.train.
model_params = {
    "dnn": {
        "n_sample_in_scene": n_sample_in_scene,
        # Axis 1 of tr_images (assumed to be the channel axis — TODO confirm)
        # times the number of samples per scene.
        "n_img_channels": tr_images.shape[1] * n_sample_in_scene,
        "n_features": len(feature.columns) * n_sample_in_scene,
        "n_targets": len(target.columns),
        "dropout": 0.0,
        "embed_dim": 128,
        "n_layers": 1,
    },
    "dnn_pretrained_model": {
        # list[str]: len(list) == n_splits
        "weight_path": None,
        "load_only_backbone": None,
    },
    "dev": "cuda",
}

# Single learning rate shared by the optimizer, the backbone optimizer and the
# scheduler's max_lr.
lr = 5e-5
# Training-loop parameters consumed by utils.train.
fit_params = {
    "dnn": {
        "tr_batch_size": 16,
        "vl_batch_size": 16,
        "trainer_params": {
            "criterion_params": {},
            "opt": "adamw",
            "opt_params": {"lr": lr, "weight_decay": 1e-4},
            "backbone_opt_params": {"lr": lr, "weight_decay": 1e-4},
            # NOTE(review): these keys look like one-cycle LR scheduler arguments —
            # confirm which scheduler utils builds from them.
            "sch_params": {
                "max_lr": lr,
                "pct_start": 0.1,
                "div_factor": 25,
                "final_div_factor": 1000,
            },
            "epochs": 10,
            "dev": "cuda",
            "val_freq": 1,
            "prefix": "",
            "save_best": False,
            "save_epochs": [],
            "maximize_score": False,
            "grad_max_norm": None,
        },
    },
}

In [None]:
# Train with N_SPLITS splits, passing scene_id as the grouping column.
# Returns the trained models and the out-of-fold predictions for the training rows.
# NOTE(review): presumably rows of one scene never straddle train/validation —
# confirm the split logic in utils.train.
models, oof_preds = utils.train(
    model_params=model_params,
    fit_params=fit_params,
    df=tr_df,
    images=tr_images,
    target_cols=target.columns,
    feature_cols=feature.columns,
    group_col="scene_id",
    scene_id_col="scene_id",
    scene_dsec_col="scene_dsec",
    n_splits=N_SPLITS,
)

In [None]:
# Prefix every out-of-fold prediction column with "pred_" so the names do not
# clash with existing columns, then append them to tr_df (rows matched by position).
oof_preds = oof_preds.select(pl.all().name.prefix("pred_"))
pred_cols = oof_preds.columns

tr_df = pl.concat([tr_df, oof_preds], how="horizontal")
tr_df

## 評価

In [None]:
def calc_score(df: pl.DataFrame, pred_cols: list[str]) -> dict[str, float]:
    """
    Compute the mean absolute error between the target and prediction columns.

    Targets are the 18 columns ``x_0, y_0, z_0, ..., x_5, y_5, z_5`` of ``df``;
    ``pred_cols`` is compared against them by position.

    Args:
        df (pl.DataFrame): Frame containing both the target and the prediction columns.
        pred_cols (list[str]): Prediction column names, ordered like the target columns.

    Returns:
        dict[str, float]: ``"score_<pred col>"`` for each prediction column, plus
        ``"avg"`` with the mean absolute error over all rows and columns.

    Raises:
        ValueError: If ``pred_cols`` does not have one entry per target column.
    """
    tg_cols = [f"{axis}_{i}" for i in range(6) for axis in ("x", "y", "z")]

    # Without this check a single prediction column would be broadcast against
    # all targets, and zip() below would drop the surplus scores without an error.
    if len(pred_cols) != len(tg_cols):
        raise ValueError(
            f"pred_cols must have {len(tg_cols)} columns, but got {len(pred_cols)}"
        )

    tg = df.select(tg_cols).to_numpy()
    pred = df.select(pred_cols).to_numpy()

    # Compute the absolute error once and reuse it for both aggregations.
    abs_err = np.abs(tg - pred)
    scores = {
        f"score_{col}": float(score)
        for col, score in zip(pred_cols, abs_err.mean(axis=0))
    }
    scores["avg"] = float(abs_err.mean())
    return scores


# Score the out-of-fold predictions. When BASE_PRED_DIR is set, the target columns
# still hold residuals at this point (the base prediction has been subtracted),
# so this is the error in residual space.
scores = calc_score(tr_df, pred_cols)
scores

In [None]:
# Plot the out-of-fold predictions with 40 bins; when BASE_PRED_DIR is set the
# values are still residuals at this point.
utils.plot_calibration_curve(tr_df, pred_cols, n_bins=40)

In [None]:
if BASE_PRED_DIR is not None:
    # Add the previously subtracted base predictions back, so that both the
    # target and the pred columns are on the original scale again.
    # NOTE: not idempotent — re-running this cell adds the base predictions a second time.
    tr_df = add_base_pred_to_target(tr_df, TARGET_COLS)
    tr_df = add_base_pred_to_target(tr_df, pred_cols)

In [None]:
if BASE_PRED_DIR is not None:
    # Re-score now that targets and predictions are back on the original scale.
    scores = calc_score(tr_df, pred_cols)
    display(scores)

In [None]:
if BASE_PRED_DIR is not None:
    # Same plot as before, now on the original scale.
    utils.plot_calibration_curve(tr_df, pred_cols, n_bins=40)

## oofを保存

In [None]:
def create_submission_csv(
    preds: pl.DataFrame, filename: str = "submission.csv"
) -> None:
    """
    Write predictions to a CSV file whose header is ``TARGET_COLS``.

    Columns are renamed by position, so ``preds`` must already be ordered like
    ``TARGET_COLS``. The DataFrame passed in by the caller is left unchanged.

    Args:
        preds (pl.DataFrame): Predictions with exactly ``len(TARGET_COLS)`` columns.
        filename (str): Destination path of the CSV file.

    Raises:
        ValueError: If ``preds`` does not have one column per target column.
    """
    submission_cols = TARGET_COLS

    # validate preds columns
    if len(preds.columns) != len(submission_cols):
        raise ValueError(
            f"preds columns must be {len(submission_cols)}, but got {len(preds.columns)}"
        )

    # Assigning to `.columns` renames in place; do it on a clone so that the
    # caller's DataFrame keeps its original column names.
    out = preds.clone()
    out.columns = submission_cols
    out.write_csv(filename)
    print(f"Submission file is created: {filename}")


# Restore the original row order (sort by origin_idx) before saving the
# out-of-fold predictions.
create_submission_csv(tr_df.sort("origin_idx").select(pred_cols), "oof_preds.csv")

## Submission

In [None]:
# Predict on the test set with the trained models, then append the predictions
# to ts_df (rows matched by position).
# NOTE(review): how the per-split models are combined is defined in utils.predict — confirm there.
preds = utils.predict(
    models,
    ts_images,
    ts_df,
    feature.columns,
    scene_id_col="scene_id",
    scene_dsec_col="scene_dsec",
    pred_cols=pred_cols,
)
# From here on pred_cols refers to the columns actually returned by utils.predict.
pred_cols = preds.columns
ts_df = pl.concat([ts_df, preds], how="horizontal")

preds

In [None]:
if BASE_PRED_DIR is not None:
    # Add the base predictions back onto the pred columns so they are on the
    # original scale (the test frame has no target columns to restore).
    # NOTE: not idempotent — re-running this cell adds the base predictions a second time.
    ts_df = add_base_pred_to_target(ts_df, pred_cols)
    display(ts_df)

In [None]:
# Restore the original row order so the submission rows line up with the test CSV.
ts_df = ts_df.sort("origin_idx")

In [None]:
# Write the test predictions, renamed to the target column names, to submission.csv.
create_submission_csv(ts_df.select(pred_cols), "submission.csv")