# series oof df
series_idごとのoof scoreをチェック
errorがどういう波形に対して発生しているかを確認する

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


In [2]:
import sys
sys.path.append("/kaggle/src/dss_utils")

from dss_metrics import score


# Config

In [3]:
output_dir = os.path.join("/kaggle", "working", "_oof")
# exp_name = "exp006_addlayer"
# exp_name = "debug"
exp_name = "exp017_inputtargettd_flip_epoch10"


# fn

In [9]:
# postprocess_fn
# series_idでgroupbyして、class_predに対して対象の列のデータから前のN個の列までのデータの平均をとる
import torch
import torch.nn as nn

# 1step 0.5secで30minなら60*30=1800step
def postprocess_fn(df, N=301, maxpool_kernel_size=41, maxpool_stride=1):
    df = df[df["second"]==0].copy()
    df["class_pred_beforemean"] = df.groupby("series_id")["class_pred"].apply(lambda x: x.rolling(N, min_periods=1).mean())
    df["class_pred_aftermean"] = df.groupby("series_id")["class_pred"].apply(lambda x: x[::-1].rolling(N, min_periods=1).mean()[::-1])
    df["event_pred"] = df["class_pred_beforemean"] - df["class_pred_aftermean"]

    # 入力サイズと出力サイズが一致するようにpaddingを調整
    maxpool_padding = int((maxpool_kernel_size - maxpool_stride) / 2)
    # maxpoolしてピーク検出
    max_pooling = nn.MaxPool1d(maxpool_kernel_size, stride=maxpool_stride, padding=maxpool_padding)
    event_pred = df["event_pred"].values
    event_pred = torch.tensor(event_pred).unsqueeze(0)
    pooled_event_pred = max_pooling(np.abs(event_pred)).squeeze(0).numpy()
    event_pred = event_pred.squeeze(0).numpy()
    # peakのところだけ残すmaskを作成
    peak_event_pred_mask = np.where(pooled_event_pred == np.abs(event_pred), 1, 0)
    peak_event_pred = event_pred * peak_event_pred_mask
    df["event_pred"] = peak_event_pred
    df = df.drop(["class_pred_beforemean", "class_pred_aftermean"], axis=1)
    return df


def make_submission_df(df, threshold=0.01):
    df = df[["series_id", "step", "event_pred"]].copy()
    # thresholdより大きいときは1,-thresholdより小さいときは-1,それ以外は0
    df["event"] = df["event_pred"].apply(lambda x: 1 if x > threshold else -1 if x < -threshold else 0)
    df = df[df["event"] != 0].copy()
    df["event"] = df["event"].replace({1: "wakeup", -1: "onset"})
    df["score"] = df["event_pred"].apply(lambda x: np.clip(np.abs(x), 0.0, 1.0))
    return df


---
# make sub df

In [10]:
sub_df = pd.DataFrame()
for i in range(5):
    print("fold", i)
    df = pd.read_parquet(os.path.join(output_dir, exp_name, f"oof_df_fold{i}.parquet"))
    for col in df.columns:
        # 64bit float -> 16bit float
        if df[col].dtype == np.float64:
            df[col] = df[col].astype(np.float16)
        # 64bit int -> 16bit int
        elif df[col].dtype == np.int64:
            df[col] = df[col].astype(np.int16)
    print("postprocess_fn")
    df = postprocess_fn(df, N=11, maxpool_kernel_size=3, maxpool_stride=1)
    sub_df_ = make_submission_df(df, threshold=0.01)
    sub_df_["step"] = sub_df_["step"].astype(np.float64)
    event = event_df[event_df["series_id"].isin(df["series_id"].unique())].copy().reset_index(drop=True)
    print("score", score(event, sub_df_))

    sub_df = pd.concat([sub_df, sub_df_], axis=0)


fold 0
postprocess_fn
score 0.022744527326924117
fold 1
postprocess_fn
score 0.02481117984811613
fold 2


KeyboardInterrupt: 

In [None]:
del df
import gc
gc.collect()


In [None]:
sub_df.to_csv(os.path.join(output_dir, exp_name, "sub_df.csv"), index=False)


---
# score check

In [None]:
sub_df = pd.read_csv(os.path.join(output_dir, exp_name, "sub_df.csv"))


In [None]:
series_id_list =[]
score_list = []
detected_event_num_list = []
for series_id in event_df["series_id"].unique():
    scoring_event_df = event_df[event_df["series_id"] == series_id].copy()
    scoring_sub_df = sub_df[sub_df["series_id"] == series_id].copy()
    if len(scoring_sub_df) == 0 or len(scoring_event_df) == 0:
        series_id_list.append(series_id)
        series_score = 0
        score_list.append(series_score)
        detected_event_num_list.append(len(scoring_sub_df))
        print(series_id, series_score, len(scoring_sub_df), len(scoring_event_df))
    else:
        series_score = score(scoring_event_df, scoring_sub_df)
        series_id_list.append(series_id)
        score_list.append(series_score)
        detected_event_num_list.append(len(scoring_sub_df))
        print(series_id, series_score)


In [None]:
score_df = pd.DataFrame({
    "series_id": series_id_list,
    "score": score_list,
    "detected_event_num": detected_event_num_list
})


In [None]:
score_df.to_csv(os.path.join(output_dir, exp_name, "score_df.csv"), index=False)


In [None]:
score_df = score_df.sort_values("score", ascending=False).reset_index(drop=True)
display(score_df.head(3))
score_df["score"].hist(bins=100)
