# check oof df

In [3]:
import os
import gc
import yaml
import argparse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("/kaggle/src/dss_utils")
sys.path.append("/kaggle/src/exp")
sys.path.append("/kaggle/src/model")
sys.path.append("/kaggle/src/data")
from dss_metrics import score
from training_loop import get_valid_values_dict, concat_valid_input_info, get_oof_df, get_key_df
from dss_model import get_model
from dss_dataloader import get_loader


In [4]:
output_dir = "/kaggle/working"
exp_name = "exp012_targetdownsample_epoch10"

folds = [0, 1, 2, 3, 4]
oof_df = pd.DataFrame()
for i in folds:
    print("fold", i)
    df = pd.read_parquet(os.path.join(output_dir, "_oof", exp_name, f"raw_oof_df_fold{i}.parquet"))
    oof_df = pd.concat([oof_df, df], axis=0)


fold 0
fold 1
fold 2
fold 3
fold 4


In [5]:
oof_df = oof_df.sort_values(["series_id", "step"])
display(oof_df)


Unnamed: 0,series_id,step,minute,second,class_pred,class_target
0,038441c925bb,3240.0,0,0,0.008696,0.0
1,038441c925bb,3241.0,0,5,-1.000000,-1.0
2,038441c925bb,3242.0,0,10,-1.000000,-1.0
3,038441c925bb,3243.0,0,15,-1.000000,-1.0
4,038441c925bb,3244.0,0,20,-1.000000,-1.0
...,...,...,...,...,...,...
16555675,fe90110788d2,589315.0,59,35,-1.000000,-1.0
16555676,fe90110788d2,589316.0,59,40,-1.000000,-1.0
16555677,fe90110788d2,589317.0,59,45,-1.000000,-1.0
16555678,fe90110788d2,589318.0,59,50,-1.000000,-1.0


In [6]:
# postprocess_fn
# series_idでgroupbyして、class_predに対して対象の列のデータから前のN個の列までのデータの平均をとる
import torch
import torch.nn as nn


def postprocess_notrolling(df):
    df = df.copy()
    df = df[df["second"]==0].reset_index(drop=True)
    # 一つ前のclass_predを取得
    df["class_pred_before"] = df.groupby("series_id")["class_pred"].shift(1)
    # 一つ後のclass_predを取得
    df["class_pred_after"] = df.groupby("series_id")["class_pred"].shift(-1)
    df["event_pred"] = df["class_pred_before"] - df["class_pred_after"]
    not_predicted_mask = (df["class_pred"] != -1).astype(int)
    df["event_pred"] = df["event_pred"] * not_predicted_mask
    df = df.drop(["class_pred_before", "class_pred_after"], axis=1)
    return df

# 1step 0.5secで30minなら60*30=1800step
def postprocess_downsample_fn(df, N=3, maxpool_kernel_size=3, maxpool_stride=1):
    df = df.copy()
    df = df[df["second"]==0].reset_index(drop=True)
    df["class_pred_beforemean"] = df.groupby("series_id")["class_pred"].apply(lambda x: x.rolling(N, min_periods=1).mean())
    df["class_pred_aftermean"] = df.groupby("series_id")["class_pred"].apply(lambda x: x[::-1].rolling(N, min_periods=1).mean()[::-1])
    df["event_pred"] = df["class_pred_beforemean"] - df["class_pred_aftermean"]
    not_predicted_mask = (df["class_pred"] != -1).astype(int)
    df["event_pred"] = df["event_pred"] * not_predicted_mask

    # 入力サイズと出力サイズが一致するようにpaddingを調整
    maxpool_padding = int((maxpool_kernel_size - maxpool_stride) / 2)
    # maxpoolしてピーク検出
    max_pooling = nn.MaxPool1d(maxpool_kernel_size, stride=maxpool_stride, padding=maxpool_padding)
    event_pred = df["event_pred"].values
    event_pred = torch.tensor(event_pred).unsqueeze(0)
    pooled_event_pred = max_pooling(np.abs(event_pred)).squeeze(0).numpy()
    event_pred = event_pred.squeeze(0).numpy()
    # peakのところだけ残すmaskを作成
    peak_event_pred_mask = np.where(pooled_event_pred == np.abs(event_pred), 1, 0)
    peak_event_pred = event_pred * peak_event_pred_mask
    df["event_pred"] = peak_event_pred
    df = df.drop(["class_pred_beforemean", "class_pred_aftermean"], axis=1)
    return df

def postprocess_downsample_notmaxpool(df, N=11):
    df = df.copy()
    df["class_pred_beforemean"] = df.groupby("series_id")["class_pred"].apply(lambda x: x.rolling(N, min_periods=1).mean())
    df["class_pred_aftermean"] = df.groupby("series_id")["class_pred"].apply(lambda x: x[::-1].rolling(N, min_periods=1).mean()[::-1])
    df["event_pred"] = df["class_pred_beforemean"] - df["class_pred_aftermean"]
    df = df.drop(["class_pred_beforemean", "class_pred_aftermean"], axis=1)
    return df

def make_submission_df(df, threshold=0.1):
    df = df[["series_id", "step", "event_pred", "minute"]].copy()
    # thresholdより大きいときは1,-thresholdより小さいときは-1,それ以外は0
    df["event"] = df["event_pred"].apply(lambda x: 1 if x > threshold else -1 if x < -threshold else 0)
    df = df[df["event"] != 0].copy()
    df["event"] = df["event"].replace({1: "wakeup", -1: "onset"})
    df["score"] = df["event_pred"].apply(lambda x: np.clip(np.abs(x), 0.0, 1.0))
    return df


def happentime_process(df):
    freq_happen_time = np.array(
        [0, 3, 7, 11, 15, 18, 22, 26, 30, 33, 37, 41, 45, 48, 52, 56])
    # minuteがfreq_happen_timeのどれかに一番近いものを取得
    df["happen_time"] = df["minute"].apply(
        lambda x: freq_happen_time[np.argmin(np.abs(freq_happen_time - x))])
    df["diff_minute"] = df["minute"] - df["happen_time"]
    # df["diff_step"] = df["diff_minute"] * 12
    # # diff_stepずらしたものを増やす
    # diff_df = df[df["diff_step"] != 0].copy()
    # diff_df["step"] = diff_df["step"] + diff_df["diff_step"]
    # diff_df["score"] = diff_df["score"] - 0.01
    # df = df[df["diff_step"] == 0].copy()
    # df.loc[]["score"] = np.clip(df["score"] + 0.1, 0.0, 1.0)
    # df = pd.concat([df, diff_df], axis=0)
    # diff_minuteが0のときはscoreを0.1増やす
    df.loc[df["diff_minute"] == 0, "score"] = np.clip(df.loc[df["diff_minute"] == 0, "score"] + 0.1, 0.0, 1.0)
    df = df.sort_values(["series_id", "step"])

    return df


In [7]:
train_event_df = pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
train_event_df = train_event_df[train_event_df["step"].notnull()].copy()
# train_event_df = train_event_df[train_event_df["series_id"].isin(oof_df["series_id"].unique())].copy()
train_event_df = train_event_df.reset_index(drop=True)
display(train_event_df.head())


Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [8]:
df = postprocess_notrolling(oof_df)
sub_df = make_submission_df(df, threshold=0.1)
display(sub_df.head())


Unnamed: 0,series_id,step,event_pred,minute,event,score
144,038441c925bb,4968.0,-0.157297,24,onset,0.157297
145,038441c925bb,4980.0,-0.36006,25,onset,0.36006
146,038441c925bb,4992.0,-0.59131,26,onset,0.59131
147,038441c925bb,5004.0,-0.451895,27,onset,0.451895
148,038441c925bb,5016.0,-0.159135,28,onset,0.159135


In [9]:
sub_df = happentime_process(sub_df)
display(sub_df.head())


Unnamed: 0,series_id,step,event_pred,minute,event,score,happen_time,diff_minute
144,038441c925bb,4968.0,-0.157297,24,onset,0.157297,22,2
145,038441c925bb,4980.0,-0.36006,25,onset,0.36006,26,-1
146,038441c925bb,4992.0,-0.59131,26,onset,0.69131,26,0
147,038441c925bb,5004.0,-0.451895,27,onset,0.451895,26,1
148,038441c925bb,5016.0,-0.159135,28,onset,0.159135,26,2


In [10]:
%%time
max_kernel_size = 3
tmp_score = 0
for average_size_ in range(3, 21, 2):
    print("average_size", average_size_, "max_kernel_size", max_kernel_size)
    df = postprocess_downsample_fn(oof_df, N=average_size_, maxpool_kernel_size=max_kernel_size)
    sub_df = make_submission_df(df, threshold=0.1)
    sub_df = happentime_process(sub_df)
    if len(sub_df) == 0:
        print("event not detected")
        continue
    score_ = score(train_event_df, sub_df)
    print(score_)
    if tmp_score > score_:
        break
    tmp_score = score_
    average_size = average_size_


average_size 3 max_kernel_size 3
0.7019170609498475
average_size 5 max_kernel_size 3
0.7197331043141444
average_size 7 max_kernel_size 3
0.7259562198697992
average_size 9 max_kernel_size 3
0.7285337073442942
average_size 11 max_kernel_size 3
0.7285921741747393
average_size 13 max_kernel_size 3
0.7270999220500285
CPU times: user 1min 5s, sys: 6.98 s, total: 1min 12s
Wall time: 1min 9s


In [11]:
tmp_score = 0
max_kernel_size_ = max_kernel_size
for max_kernel_size in range(3, 31, 2):
    print("average_size", average_size, "max_kernel_size", max_kernel_size)
    df = postprocess_downsample_fn(oof_df, N=average_size, maxpool_kernel_size=max_kernel_size)
    sub_df = make_submission_df(df, threshold=0.1)
    sub_df = happentime_process(sub_df)
    if len(sub_df) == 0:
        print("event not detected")
        continue
    score_ = score(train_event_df, sub_df)
    print(score_)
    if tmp_score > score_:
        break
    tmp_score = score_
    max_kernel_size_ = max_kernel_size


average_size 11 max_kernel_size 3
0.7285921741747393
average_size 11 max_kernel_size 5
0.7283212563006136


In [12]:
df = postprocess_downsample_fn(oof_df, N=average_size, maxpool_kernel_size=max_kernel_size)
sub_df = make_submission_df(df, threshold=0.01)
sub_df = happentime_process(sub_df)
print(average_size, max_kernel_size)
print(len(sub_df))
if len(sub_df) == 0:
    print("event not detected")
    score_ = 0
else:
    score_ = score(train_event_df, sub_df)
print(score_)


11 5
32896
0.7429961645200627


"exp012_targetdownsample_epoch10"


average_size 9 max_kernel_size 3
0.7310038029952944

thr=0.01 -> 0.7469626949452008

---

exp_name = "exp012_targetdownsample_epoch5"

average_size 11 max_kernel_size 5
0.7325993414687024

0.7462916305317064