In [1]:
import gc
import importlib
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from atmacup_18 import constants

import utils

importlib.reload(utils)

<module 'utils' from '/home/tatsuya/projects/atmacup/atmacup_18/experiments/main2/v00/v00_12_03/utils.py'>

In [2]:
# Seed reused across the notebook: it is passed to utils.FeatureGBDT as
# random_state and to the GBDT parameters as "seed".
RANDOM_STATE = 2024 + 1

## データ読み込み

In [3]:
# --- Dataset locations, resolved from the directory the notebook runs in ---
notebook_dir = Path().resolve()
DATA_DIR = notebook_dir.parents[3] / "data"
DATASET_DIR = DATA_DIR / "atmaCup#18_dataset"
TR_FEATURES_CSV = DATASET_DIR / "train_features.csv"
TS_FEATURES_CSV = DATASET_DIR / "test_features.csv"
IMAGES_DIR = DATASET_DIR / "images"
TRAFFIC_LIGHTS_CSV = DATASET_DIR / "traffic_lights.csv"

# --- Per-sample image file names (current frame, 0.5 s and 1.0 s earlier) ---
IMAGE_NAMES = ["image_t.png", "image_t-0.5.png", "image_t-1.0.png"]
TRAFFIC_LIGHTS_BBOX_IMAGE_NAME = constants.TRAFFIC_LIGHT_BBOX_IMAGE_NAME
DEPTH_IMAGE_FILE_PREFIX = constants.DEPTH_IMAGE_FILE_PREFIX
# One depth .npy file name per camera frame listed in IMAGE_NAMES.
DEPTH_IMAGE_NAMES = [f"{DEPTH_IMAGE_FILE_PREFIX}{name}.npy" for name in IMAGE_NAMES]
OPTICAL_FLOW_IMAGE_NAME = constants.OPTICAL_FLOW_IMAGE_NAME

# --- Pre-computed "type2" image feature tables ---
TR_IMAGE_FEATURE_TYPE2_CSV = DATASET_DIR / "train_image_features_type2.csv"
TS_IMAGE_FEATURE_TYPE2_CSV = DATASET_DIR / "test_image_features_type2.csv"

# --- Outputs of the base experiment whose predictions this notebook refines ---
BASE_PRED_DIR = Path("..", "..", "..", "main", "v00", "v00_15_03")
BASE_OOF_PRED_CSV = BASE_PRED_DIR / "oof_preds.csv"
BASE_SUBMISSION_CSV = BASE_PRED_DIR / "submission.csv"

In [4]:
# Target column names in (x, y, z) order for each of the 6 indices:
# x_0, y_0, z_0, x_1, ..., z_5.
TARGET_COLS = [f"{axis}_{i}" for i in range(6) for axis in ("x", "y", "z")]
# Column names under which the base model's prediction for each target is stored.
BASE_PRED_COLS = ["base_pred_" + col for col in TARGET_COLS]

In [5]:
# Load the training feature table through the project helper and preview two rows.
# The recorded output shows that the helper's result also carries scene_id,
# scene_dsec and origin_idx columns next to the raw features and targets.
tr_df = utils.read_feature_csv(TR_FEATURES_CSV)
tr_df.head(2)

ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,scene_id,scene_dsec,origin_idx
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i32,i64
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320,0
"""00066be8e20318869c38c66be46663…",11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,"""drive""",False,True,4.970268,-0.007936,0.005028,10.350489,-0.032374,-0.020701,15.770054,0.084073,0.008645,21.132415,0.391343,0.036335,26.316489,0.843124,0.065,31.383814,1.42507,0.073083,"""00066be8e20318869c38c66be46663…",420,1


In [6]:
# Load the test feature table the same way; per the recorded output it has no
# target columns (x_*, y_*, z_*).
ts_df = utils.read_feature_csv(TS_FEATURES_CSV)
ts_df.head(2)

ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,scene_id,scene_dsec,origin_idx
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,str,i32,i64
"""012baccc145d400c896cb82065a93d…",3.374273,-0.01936,-34.008415,17.0,0.0,False,0.0,False,"""drive""",False,False,"""012baccc145d400c896cb82065a93d…",120,0
"""012baccc145d400c896cb82065a93d…",2.441048,-0.022754,307.860077,295.0,0.0,True,0.0,False,"""drive""",False,False,"""012baccc145d400c896cb82065a93d…",220,1


In [7]:
# Load the pre-computed "type2" image features for train and test.
# NOTE(review): judging by the resulting column names these are per-patch PCA
# components for each of the three frames - confirm in utils.
tr_image_feat_df, ts_image_feat_df = utils.read_image_feature_type2_csv(
    TR_IMAGE_FEATURE_TYPE2_CSV,
    TS_IMAGE_FEATURE_TYPE2_CSV,
    n_patch=16,
    n_components=16,
    prefix="type2_",
)

n_tr_rows, n_ts_rows = tr_df.height, ts_df.height

# Later cells rely on positional row alignment (horizontal concat of the base
# predictions, image arrays indexed by origin_idx). A join does not promise to
# keep the left frame's row order, so restore the original order explicitly.
tr_df = tr_df.join(tr_image_feat_df, on="ID").sort("origin_idx")
ts_df = ts_df.join(ts_image_feat_df, on="ID").sort("origin_idx")

# An inner join silently drops rows without a match (or duplicates rows when
# the key is not unique); either would break the positional alignment above.
if tr_df.height != n_tr_rows or ts_df.height != n_ts_rows:
    raise ValueError(
        "Joining image features changed the row count: "
        f"train {n_tr_rows} -> {tr_df.height}, test {n_ts_rows} -> {ts_df.height}"
    )

del tr_image_feat_df, ts_image_feat_df
gc.collect()

tr_df.head(2)

ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,scene_id,scene_dsec,origin_idx,type2_image_feat_patch_0_pca_0_image_t-0.5.png,type2_image_feat_patch_0_pca_0_image_t-1.0.png,type2_image_feat_patch_0_pca_0_image_t.png,type2_image_feat_patch_0_pca_1_image_t-0.5.png,…,type2_image_feat_patch_15_pca_3_image_t.png,type2_image_feat_patch_15_pca_4_image_t-0.5.png,type2_image_feat_patch_15_pca_4_image_t-1.0.png,type2_image_feat_patch_15_pca_4_image_t.png,type2_image_feat_patch_15_pca_5_image_t-0.5.png,type2_image_feat_patch_15_pca_5_image_t-1.0.png,type2_image_feat_patch_15_pca_5_image_t.png,type2_image_feat_patch_15_pca_6_image_t-0.5.png,type2_image_feat_patch_15_pca_6_image_t-1.0.png,type2_image_feat_patch_15_pca_6_image_t.png,type2_image_feat_patch_15_pca_7_image_t-0.5.png,type2_image_feat_patch_15_pca_7_image_t-1.0.png,type2_image_feat_patch_15_pca_7_image_t.png,type2_image_feat_patch_15_pca_8_image_t-0.5.png,type2_image_feat_patch_15_pca_8_image_t-1.0.png,type2_image_feat_patch_15_pca_8_image_t.png,type2_image_feat_patch_15_pca_9_image_t-0.5.png,type2_image_feat_patch_15_pca_9_image_t-1.0.png,type2_image_feat_patch_15_pca_9_image_t.png,type2_image_feat_patch_15_pca_10_image_t-0.5.png,type2_image_feat_patch_15_pca_10_image_t-1.0.png,type2_image_feat_patch_15_pca_10_image_t.png,type2_image_feat_patch_15_pca_11_image_t-0.5.png,type2_image_feat_patch_15_pca_11_image_t-1.0.png,type2_image_feat_patch_15_pca_11_image_t.png,type2_image_feat_patch_15_pca_12_image_t-0.5.png,type2_image_feat_patch_15_pca_12_image_t-1.0.png,type2_image_feat_patch_15_pca_12_image_t.png,type2_image_feat_patch_15_pca_13_image_t-0.5.png,type2_image_feat_patch_15_pca_13_image_t-1.0.png,type2_image_feat_patch_15_pca_13_image_t.png,type2_image_feat_patch_15_pca_14_image_t-0.5.png,type2_image_feat_patch_15_pca_14_image_t-1.0.png,type2_image_feat_patch_15_pca_14_image_t.
png,type2_image_feat_patch_15_pca_15_image_t-0.5.png,type2_image_feat_patch_15_pca_15_image_t-1.0.png,type2_image_feat_patch_15_pca_15_image_t.png
str,f64,f64,f64,f64,f64,bool,f64,bool,str,bool,bool,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i32,i64,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""00066be8e20318869c38c66be46663…",5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,"""drive""",False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079,"""00066be8e20318869c38c66be46663…",320,0,-12.092827,-12.364141,-11.997147,8.27103,…,-3.453253,-0.561866,-0.33962,-1.279081,-2.726123,-3.054861,-0.62786,0.599287,-3.205981,-0.895183,2.281659,0.463407,2.384759,1.834266,1.64006,-0.025069,-0.137119,1.246251,0.85394,0.861053,3.992823,1.128606,0.187762,-1.105715,-1.048375,1.43152,0.066033,0.900796,0.512962,1.179129,1.80958,3.061294,4.06483,1.2968,-2.511726,-2.452925,-2.568694
"""00066be8e20318869c38c66be46663…",11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,"""drive""",False,True,4.970268,-0.007936,0.005028,10.350489,-0.032374,-0.020701,15.770054,0.084073,0.008645,21.132415,0.391343,0.036335,26.316489,0.843124,0.065,31.383814,1.42507,0.073083,"""00066be8e20318869c38c66be46663…",420,1,-6.905368,-8.808874,-7.941122,9.494776,…,-0.789773,0.476332,2.356999,2.059506,-2.894734,-2.690122,-0.547088,0.445774,1.994462,1.786956,4.242107,1.172755,1.810559,-0.712943,-3.324618,-1.0987,-5.017344,-3.527495,-3.741179,-1.050099,-0.819215,-3.378464,-1.995003,-3.214493,-0.993306,1.991544,4.563295,2.447225,-3.706869,-0.717827,-4.541399,3.421139,3.321804,4.889874,6.756308,2.842822,2.864973


In [8]:
def reduce_base_pred(
    df: pl.DataFrame, base_pred_df: pl.DataFrame, has_target: bool
) -> pl.DataFrame:
    """
    Attach base-model predictions to ``df`` and optionally convert targets to residuals.

    The columns TARGET_COLS of ``base_pred_df`` are renamed to BASE_PRED_COLS and
    appended to ``df`` column-wise. Rows are matched purely by position, so both
    frames must be in the same row order.

    Args:
        df (pl.DataFrame): Frame to extend. Must contain TARGET_COLS when
            ``has_target`` is True.
        base_pred_df (pl.DataFrame): Base predictions stored under the
            TARGET_COLS names.
        has_target (bool): When True, every target column is replaced by
            ``target - base prediction``.

    Returns:
        pl.DataFrame: ``df`` with the BASE_PRED_COLS columns added (and the
        targets replaced by residuals when ``has_target`` is True).

    Raises:
        ValueError: If the two frames do not have the same number of rows.
    """
    target_cols = TARGET_COLS
    base_pred_cols = BASE_PRED_COLS

    # A horizontal concat pads the shorter frame with nulls instead of failing,
    # which would silently misalign predictions and rows - reject it up front.
    if df.height != base_pred_df.height:
        raise ValueError(
            f"df has {df.height} rows but base_pred_df has {base_pred_df.height} rows"
        )

    df = pl.concat(
        [
            df,
            base_pred_df.select(target_cols).rename(
                dict(zip(target_cols, base_pred_cols))
            ),
        ],
        how="horizontal",
    )

    if has_target:
        # Overwrite each target with its residual w.r.t. the base prediction.
        df = df.with_columns(
            [
                (pl.col(tg_col) - pl.col(base_pred_col)).alias(tg_col)
                for tg_col, base_pred_col in zip(target_cols, base_pred_cols)
            ]
        )
    return df


def add_base_pred_to_target(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Return ``df`` with the base predictions added back onto ``target_cols``.

    Columns are paired by position: the i-th entry of ``target_cols`` receives
    the i-th entry of BASE_PRED_COLS.

    Args:
        df (pl.DataFrame): Frame holding both ``target_cols`` and BASE_PRED_COLS.
        target_cols (list[str]): Columns to which the base predictions are added.
    """
    restored = []
    for col, base_col in zip(target_cols, BASE_PRED_COLS):
        # Keep the original column name so the sum replaces the residual in place.
        restored.append((pl.col(col) + pl.col(base_col)).alias(col))

    return df.with_columns(restored)


# Residual learning setup: attach the base experiment's predictions and train
# on (target - base prediction) instead of the raw targets.
if BASE_PRED_DIR is not None:
    # columns: "x_0", "y_0", "z_0", ..., "x_5", "y_5", "z_5"
    base_oof_pred_df = pl.read_csv(BASE_OOF_PRED_CSV)
    base_submission_df = pl.read_csv(BASE_SUBMISSION_CSV)

    # Use the original targets minus the base predictions as the new targets.
    # The test frame has no targets, so it only receives the base_pred_* columns.
    tr_df = reduce_base_pred(tr_df, base_oof_pred_df, has_target=True)
    ts_df = reduce_base_pred(ts_df, base_submission_df, has_target=False)

    del base_oof_pred_df, base_submission_df
    gc.collect()

In [9]:
# Depth maps for the three frames of every sample, loaded in the current row
# order of tr_df / ts_df and cast to float32.
tr_depth_images = utils.load_npy_images(
    IMAGES_DIR, ids=tr_df["ID"].to_list(), image_names=DEPTH_IMAGE_NAMES
)
tr_depth_images = tr_depth_images.astype(np.float32)
print(tr_depth_images.shape)

ts_depth_images = utils.load_npy_images(
    IMAGES_DIR, ids=ts_df["ID"].to_list(), image_names=DEPTH_IMAGE_NAMES
)
ts_depth_images = ts_depth_images.astype(np.float32)
print(ts_depth_images.shape)

(43371, 3, 64, 128, 1)
(1727, 3, 64, 128, 1)


In [10]:
# Optical-flow arrays (a single file per sample), loaded in the current row
# order of tr_df / ts_df and cast to float32.
tr_optical_flow_images = utils.load_npy_images(
    IMAGES_DIR, ids=tr_df["ID"].to_list(), image_names=[OPTICAL_FLOW_IMAGE_NAME]
)
tr_optical_flow_images = tr_optical_flow_images.astype(np.float32)
print(tr_optical_flow_images.shape)

ts_optical_flow_images = utils.load_npy_images(
    IMAGES_DIR, ids=ts_df["ID"].to_list(), image_names=[OPTICAL_FLOW_IMAGE_NAME]
)
ts_optical_flow_images = ts_optical_flow_images.astype(np.float32)
print(ts_optical_flow_images.shape)

(43371, 1, 64, 128, 4)
(1727, 1, 64, 128, 4)


## scene_dsec順に並び替える

In [11]:
# Order the rows chronologically within each scene.
tr_df = tr_df.sort(["scene_id", "scene_dsec"])
ts_df = ts_df.sort(["scene_id", "scene_dsec"])

# origin_idx of the sorted frames tells which original row now sits at each
# position; use it to put the image arrays into the same order.
# NOTE(review): not idempotent - re-running this cell reindexes the already
# reordered arrays a second time.
tr_order = tr_df["origin_idx"].to_numpy()
ts_order = ts_df["origin_idx"].to_numpy()

tr_depth_images = tr_depth_images[tr_order]
ts_depth_images = ts_depth_images[ts_order]

tr_optical_flow_images = tr_optical_flow_images[tr_order]
ts_optical_flow_images = ts_optical_flow_images[ts_order]

## Target

In [12]:
# Fit the target transformer on the training frame and append the transformed
# target columns (prefixed "tg_") to tr_df.
target = utils.CoordinateTarget(prefix="tg_")
target.fit(tr_df)

tg_df = target.transform(tr_df)
print(tg_df.columns)
# glimpse() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
tg_df.describe().glimpse()
tr_df = pl.concat([tr_df, tg_df], how="horizontal")

del tg_df
gc.collect()

['tg_cood_x_0', 'tg_cood_y_0', 'tg_cood_z_0', 'tg_cood_x_1', 'tg_cood_y_1', 'tg_cood_z_1', 'tg_cood_x_2', 'tg_cood_y_2', 'tg_cood_z_2', 'tg_cood_x_3', 'tg_cood_y_3', 'tg_cood_z_3', 'tg_cood_x_4', 'tg_cood_y_4', 'tg_cood_z_4', 'tg_cood_x_5', 'tg_cood_y_5', 'tg_cood_z_5']
Rows: 9
Columns: 19
$ statistic   <str> 'count', 'null_count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'
$ tg_cood_x_0 <f64> 43371.0, 0.0, -0.0006715892176898854, 0.09753700583172945, -2.057005512707679, -0.040399420711143996, -0.000568865584570144, 0.04032857916132482, 1.197693475530765
$ tg_cood_y_0 <f64> 43371.0, 0.0, -0.00043051178186256537, 0.061965987833959645, -2.5740922230432965, -0.022298936290583313, -0.0011371778367903962, 0.02090260048404809, 3.8222068163710263
$ tg_cood_z_0 <f64> 43371.0, 0.0, -4.1921114744582373e-05, 0.0401841568252421, -1.004058364214115, -0.01861453878059976, 0.00036592425384850667, 0.017990800193160003, 1.4501810662508412
$ tg_cood_x_1 <f64> 43371.0, 0.0, -0.0010784820030265154, 

0

## 特徴量

In [13]:
# Fit the feature builder on the training data only, then apply it to both
# train and test and append the resulting "ft_" columns.
feature = utils.FeatureGBDT(
    prefix="ft_", n_components_depth_pca=16, random_state=RANDOM_STATE
)
# feature = utils.FeatureRidge(prefix="ft_")
feature.fit(tr_df, tr_depth_images, tr_optical_flow_images)

ft_df = feature.transform(tr_df, tr_depth_images, tr_optical_flow_images)
print(ft_df.columns)
# glimpse() prints its summary itself and returns None, so it is called
# directly instead of through print() (which added a stray "None" line).
ft_df.describe().glimpse()
tr_df = pl.concat([tr_df, ft_df], how="horizontal")

ft_df = feature.transform(ts_df, ts_depth_images, ts_optical_flow_images)
print(ft_df.columns)
ft_df.describe().glimpse()
ts_df = pl.concat([ts_df, ft_df], how="horizontal")

del ft_df
gc.collect()

['ft_vEgo', 'ft_aEgo', 'ft_steeringAngleDeg', 'ft_steeringTorque', 'ft_brake', 'ft_brakePressed', 'ft_gas', 'ft_gasPressed', 'ft_is_gearShifter_drive', 'ft_is_gearShifter_neutral', 'ft_is_gearShifter_park', 'ft_is_gearShifter_reverse', 'ft_leftBlinker', 'ft_rightBlinker', 'ft_prev_ft_vEgo', 'ft_prev_ft_aEgo', 'ft_prev_ft_steeringAngleDeg', 'ft_prev_ft_steeringTorque', 'ft_prev_ft_brake', 'ft_prev_ft_brakePressed', 'ft_prev_ft_gas', 'ft_prev_ft_gasPressed', 'ft_prev_ft_is_gearShifter_drive', 'ft_prev_ft_is_gearShifter_neutral', 'ft_prev_ft_is_gearShifter_park', 'ft_prev_ft_is_gearShifter_reverse', 'ft_prev_ft_leftBlinker', 'ft_prev_ft_rightBlinker', 'ft_next_ft_vEgo', 'ft_next_ft_aEgo', 'ft_next_ft_steeringAngleDeg', 'ft_next_ft_steeringTorque', 'ft_next_ft_brake', 'ft_next_ft_brakePressed', 'ft_next_ft_gas', 'ft_next_ft_gasPressed', 'ft_next_ft_is_gearShifter_drive', 'ft_next_ft_is_gearShifter_neutral', 'ft_next_ft_is_gearShifter_park', 'ft_next_ft_is_gearShifter_reverse', 'ft_next_ft_

0

In [14]:
# The image arrays are only needed to build features; drop them to free memory
# before training.
del tr_depth_images, ts_depth_images
gc.collect()

del tr_optical_flow_images, ts_optical_flow_images
gc.collect()

0

In [15]:
# Keep only what modelling needs: row keys, base predictions, features and
# (train only) the raw and transformed targets.
key_cols = ["ID", "scene_id", "scene_dsec", "origin_idx"]

tr_use_cols = [
    *key_cols,
    *TARGET_COLS,
    *BASE_PRED_COLS,
    *feature.columns,
    *target.columns,
]
ts_use_cols = [*key_cols, *BASE_PRED_COLS, *feature.columns]

tr_df = tr_df.select(tr_use_cols)
ts_df = ts_df.select(ts_use_cols)

## モデリング

In [16]:
# Number of cross-validation splits handed to utils.train (n_splits).
N_SPLITS = 2

In [17]:
# Alternative configuration for the ridge model, kept for reference:
# model_params = {
#    "ridge": {
#        "alpha": 0.1,
#        "random_state": RANDOM_STATE,
#    }
# }
#
# fit_params = {}

# Booster parameters for the "gbdt" model.
# NOTE(review): presumably forwarded unchanged to XGBoost by utils.XgbModel - confirm.
model_params = {
    "gbdt": {
        "tree_method": "hist",
        # One multi-output tree predicts all transformed target columns jointly.
        "num_target": len(target.columns),
        "multi_strategy": "multi_output_tree",
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "learning_rate": 0.001,
        "max_depth": 5,
        # 2 ** (4 - 1) == 8 leaves per tree.
        "max_leaves": 2 ** (4 - 1),
        "subsample": 0.5,
        "colsample_bytree": 1.0,
        "nthread": 8,
        "seed": RANDOM_STATE,
        # "device": "cuda",
    }
}

# Training-loop settings: a large round budget combined with early stopping.
fit_params = {
    "gbdt": {
        "num_boost_round": 100000,
        "early_stopping_rounds": 500,
    }
}

In [18]:
# Cross-validated training; returns the fitted models and the out-of-fold predictions.
# group_col="scene_id" is passed so that, presumably, rows of one scene never
# straddle train and validation - confirm in utils.train.
# NOTE(review): the recorded run below was interrupted (KeyboardInterrupt), so
# the saved outputs of all later cells are missing.
models, oof_preds = utils.train(
    model_class=utils.XgbModel,
    # model_class=utils.LgbModel,
    # model_class=utils.RidgeModel,
    model_params=model_params,
    fit_params=fit_params,
    df=tr_df,
    target_cols=target.columns,
    feature_cols=feature.columns,
    group_col="scene_id",
    n_splits=N_SPLITS,
)

-----------------
-----------------
Training fold 0...
train samples: 21685, valid samples: 21686
[0]	train-rmse:0.40776	valid-rmse:0.40639
[5000]	train-rmse:0.39501	valid-rmse:0.40528
[10000]	train-rmse:0.38496	valid-rmse:0.40494


KeyboardInterrupt: 

In [None]:
# utils.plot_lgb_importance(
#    sum([model.models for model in models], []), models[0].feature_names
# )

In [None]:
# Prefix every out-of-fold prediction column with "pred_" and append them to tr_df.
# NOTE(review): not idempotent - re-running adds the prefix again to oof_preds.
oof_preds = oof_preds.select(pl.all().name.prefix("pred_"))
pred_cols = oof_preds.columns

tr_df = pl.concat([tr_df, oof_preds], how="horizontal")
tr_df

## 評価

In [None]:
def calc_score(df: pl.DataFrame, pred_cols: list[str]) -> dict[str, float]:
    """
    Compute the mean absolute error between TARGET_COLS and ``pred_cols``.

    Columns are paired by position: the i-th prediction column is scored
    against the i-th entry of TARGET_COLS.

    Args:
        df (pl.DataFrame): Frame containing both TARGET_COLS and ``pred_cols``.
        pred_cols (list[str]): Prediction columns, in the same order as TARGET_COLS.

    Returns:
        dict[str, float]: One ``"score_<pred col>"`` entry per prediction column
        plus ``"avg"``, the mean absolute error over all rows and columns.

    Raises:
        ValueError: If ``pred_cols`` and TARGET_COLS differ in length.
    """
    tg_cols = TARGET_COLS

    # Without this check a single prediction column would be silently
    # broadcast against every target column by NumPy.
    if len(pred_cols) != len(tg_cols):
        raise ValueError(
            f"pred_cols must have {len(tg_cols)} columns, but got {len(pred_cols)}"
        )

    tg = df.select(tg_cols).to_numpy()
    pred = df.select(pred_cols).to_numpy()

    # Compute the absolute error once and reuse it for both aggregations.
    abs_err = np.abs(tg - pred)
    scores = {
        f"score_{col}": float(score)
        for col, score in zip(pred_cols, abs_err.mean(axis=0))
    }
    scores["avg"] = float(abs_err.mean())
    return scores


# Score the out-of-fold predictions. When base predictions were subtracted
# earlier, both the targets and the predictions are still residuals at this point.
scores = calc_score(tr_df, pred_cols)
scores

In [None]:
# Calibration plot of the out-of-fold predictions, using 40 bins.
utils.plot_calibration_curve(tr_df, pred_cols, n_bins=40)

In [None]:
if BASE_PRED_DIR is not None:
    # Add the previously subtracted base predictions back so that the target
    # and prediction columns are on the original scale again.
    # NOTE(review): not idempotent - re-running this cell adds them a second time.
    tr_df = add_base_pred_to_target(tr_df, TARGET_COLS)
    tr_df = add_base_pred_to_target(tr_df, pred_cols)

In [None]:
# Re-score after the base predictions have been added back to targets and predictions.
if BASE_PRED_DIR is not None:
    scores = calc_score(tr_df, pred_cols)
    display(scores)

In [None]:
# Calibration plot again, now with the base predictions added back.
if BASE_PRED_DIR is not None:
    utils.plot_calibration_curve(tr_df, pred_cols, n_bins=40)

## oofを保存

In [None]:
def create_submission_csv(preds: pl.DataFrame, filename: str = "submission.csv"):
    """
    Write ``preds`` to ``filename`` as CSV with the columns renamed to TARGET_COLS.

    Columns are renamed by position, so ``preds`` must already be ordered like
    TARGET_COLS. The caller's DataFrame is left untouched.

    Args:
        preds (pl.DataFrame): Predictions, one column per entry of TARGET_COLS.
        filename (str): Destination path of the CSV file.

    Raises:
        ValueError: If ``preds`` does not have exactly len(TARGET_COLS) columns.
    """
    submission_cols = TARGET_COLS

    # validate preds columns
    if len(preds.columns) != len(submission_cols):
        raise ValueError(
            f"preds columns must be {len(submission_cols)}, but got {len(preds.columns)}"
        )

    # Assigning to .columns renames in place; do it on a clone so the frame
    # passed in by the caller keeps its own column names.
    out = preds.clone()
    out.columns = submission_cols
    out.write_csv(filename)
    print(f"Submission file is created: {filename}")


# Restore the original row order (origin_idx) and save the out-of-fold predictions.
create_submission_csv(tr_df.sort("origin_idx").select(pred_cols), "oof_preds.csv")

## Submission

In [None]:
# Predict the test set with the trained fold models and append the predictions to ts_df.
# NOTE(review): how the fold models are combined is decided inside utils.predict.
preds = utils.predict(
    models,
    ts_df,
    feature.columns,
    pred_cols=pred_cols,
)
# Continue with the column names actually returned by utils.predict.
pred_cols = preds.columns
ts_df = pl.concat([ts_df, preds], how="horizontal")

preds

In [None]:
if BASE_PRED_DIR is not None:
    # Add the previously subtracted base predictions back so that the
    # prediction columns are on the original target scale again.
    # NOTE(review): not idempotent - re-running this cell adds them a second time.
    ts_df = add_base_pred_to_target(ts_df, pred_cols)
    display(ts_df)

In [None]:
# Restore the original row order of the test set.
ts_df = ts_df.sort("origin_idx")

In [None]:
# Write the final test predictions as the submission file.
create_submission_csv(ts_df.select(pred_cols), "submission.csv")