In [13]:
import os
import gc
import sys
import argparse
import yaml
import time
import shutil

import pandas as pd
import polars as pl


In [14]:
# Register the project source directories so the dss_* modules imported
# further down in this cell can be resolved.
sys.path.extend([
    "/kaggle/src/exp",
    "/kaggle/src/data",
    "/kaggle/src/model",
    "/kaggle/src/dss_utils",
    # "/kaggle/src/submission",
])


import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from dss_dataloader import get_loader
from dss_model import get_model
from training_loop import concat_valid_input_info, get_valid_values_dict, seed_everything


In [15]:
# Directory that holds the experiment folder (config.yaml and fold checkpoints).
# exp_dir = "/kaggle/input/dss-exps"
exp_dir = "/kaggle/working"
exp_name = "exp020_dense_chh_skffold_epoch30" # best-model checkpoints are saved starting from exp020_dense_chh_skffold_epoch30
# Parquet file with the raw series passed to inference().
# series_df_path = "/kaggle/input/dss-train-someunique-series/train_series.parquet"
series_df_path = "/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet"
config_path = os.path.join(exp_dir, exp_name, "config.yaml")

# Scratch directory for the per-split parquet / csv files written during inference.
tmp_file_path = "/kaggle/working/tmp"
os.makedirs(tmp_file_path, exist_ok=True)

# Default rolling-mean window (N) and max-pool kernel size used by
# detect_event_from_downsample_classpred.
class_pred_roll_mean_num = 11
class_pred_maxpool_num = 3

# Number of chunks the series data is split into (passed as split_num to inference()).
dataframe_split_num = 10


# preprocess

In [16]:
def pl_datetime_preprocess(train_series_):
    """Parse the ``timestamp`` column and derive time-of-day helper columns.

    The string ``timestamp`` column is converted to a timezone-naive
    datetime, then three columns are added: ``second`` and ``minute``
    (both Int32) and ``date`` (the calendar date cast to string).

    Args:
        train_series_: polars DataFrame with a string ``timestamp`` column.

    Returns:
        A new polars DataFrame with the converted and derived columns.
    """
    # The conversion has to be materialised first: expressions inside one
    # with_columns call all read the frame as it was before that call.
    parsed = train_series_.with_columns(
        pl.col("timestamp").str.to_datetime().dt.replace_time_zone(None))
    ts = pl.col("timestamp")
    return parsed.with_columns([
        ts.dt.second().cast(pl.Int32).alias("second"),
        ts.dt.minute().cast(pl.Int32).alias("minute"),
        ts.dt.date().cast(str).alias("date"),
    ])


def preprocess_input(train_series_: pd.DataFrame) -> pd.DataFrame:
    """Drop ``timestamp`` and add centred rolling statistics of ``anglez``.

    For each window size (36 and 60 rows) the per-``series_id`` centred
    rolling mean and standard deviation of ``anglez`` are stored as
    ``anglez_mean_{w}`` / ``anglez_std_{w}``. Positions where the window is
    incomplete are filled with 0.

    Args:
        train_series_: frame with ``series_id``, ``timestamp`` and ``anglez``.

    Returns:
        A new frame (the input is not modified) with the feature columns added.
    """
    features = train_series_.drop(columns=["timestamp"], axis=1)
    # Only anglez is processed here even though the log line also mentions enmo.
    print("get anglez and enmo rolling mean and std")
    for window in (36, 60):  # window sizes were picked heuristically
        mean_col = f"anglez_mean_{window}"
        std_col = f"anglez_std_{window}"
        rolled = features.groupby("series_id")["anglez"].rolling(window, center=True)
        # reset_index(0, drop=True) removes the group level so the result
        # aligns with the original row index on assignment.
        window_mean = rolled.mean().reset_index(0, drop=True)
        window_std = rolled.std().reset_index(0, drop=True)
        features[mean_col] = window_mean
        features[std_col] = window_std
        features[mean_col] = features[mean_col].fillna(0)
        features[std_col] = features[std_col].fillna(0)
    return features


def set_seriesdatekey(train_series_: pd.DataFrame) -> pd.DataFrame:
    """Add a ``series_date_key`` column of the form ``"<series_id>_<date>"``.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    series_part = train_series_["series_id"].astype(str)
    date_part = train_series_["date"].astype(str)
    train_series_["series_date_key"] = series_part + "_" + date_part
    return train_series_

def label_encode_series_date_key(train_series_: pd.DataFrame) -> pd.DataFrame:
    """Replace ``series_date_key`` with an int64 code and keep the string form.

    The original string key is preserved in ``series_date_key_str``. Codes are
    assigned in sorted order of the distinct string keys, so the smallest key
    gets 0. ``pd.factorize(sort=True)`` is used instead of a function-local
    sklearn ``LabelEncoder`` import: it yields the same sorted integer coding
    without an extra dependency.

    Args:
        train_series_: frame with a ``series_date_key`` column; modified in place.

    Returns:
        The same frame, with ``series_date_key_str`` added and
        ``series_date_key`` overwritten by int64 codes.
    """
    key_str = train_series_["series_date_key"].astype(str)
    train_series_["series_date_key_str"] = key_str
    codes, _ = pd.factorize(key_str, sort=True)
    train_series_["series_date_key"] = codes.astype("int64")
    return train_series_


# post process

In [17]:
# Author's open question: if one step were 0.5 s, 30 min would be 60*30 = 1800 steps.
# NOTE(review): this function keeps only rows with second == 0 and get_pred_df
# strides steps by 12, which suggests 12 steps per minute — confirm.
# The metric may tolerate many detections, so the windows are kept small for now.
def detect_event_from_downsample_classpred(df,
                                           tmp_file_path=None,
                                           N=class_pred_roll_mean_num,
                                           maxpool_kernel_size=class_pred_maxpool_num,
                                           maxpool_stride=1
                                           ):
    """Turn per-row class predictions into signed event peaks.

    Only rows with ``second == 0`` are kept. For every series the trailing
    rolling mean (window ending at the row) and the leading rolling mean
    (window starting at the row) of ``class_pred`` are computed; their
    difference is the raw event signal. Rows whose ``class_pred`` is -1
    (marked as not predicted) are zeroed. A 1-D max-pool over the absolute
    signal keeps only local peaks.

    Args:
        df: frame with ``series_id``, ``step``, ``second`` and ``class_pred``.
        tmp_file_path: unused; kept for signature compatibility. Defaults to
            None because the caller in this notebook passes only ``df``.
        N: rolling-mean window size (rows).
        maxpool_kernel_size: kernel size of the peak-detection max-pool.
        maxpool_stride: stride of the max-pool.

    Returns:
        The filtered and sorted frame with ``event_pred`` (signed peak value,
        0 elsewhere), ``onset_pred`` (negative peaks, clipped to [0, 1]) and
        ``wakeup_pred`` (positive peaks, clipped to [0, 1]).
    """
    df = df[df["second"] == 0].reset_index(drop=True)
    df = df.sort_values(["series_id", "step"]).reset_index(drop=True)

    # Nothing to pool over: return the (empty) frame with the output columns.
    if df.empty:
        for col in ("event_pred", "onset_pred", "wakeup_pred"):
            df[col] = np.empty(0, dtype="float64")
        return df

    # transform keeps the result aligned with df's index, unlike
    # apply(...).reset_index(drop=True), which relies on row order.
    per_series = df.groupby("series_id")["class_pred"]
    before_mean = per_series.transform(
        lambda x: x.rolling(N, min_periods=1).mean())
    after_mean = per_series.transform(
        lambda x: x[::-1].rolling(N, min_periods=1).mean()[::-1])

    not_predicted_mask = (df["class_pred"] != -1).astype(int)
    raw_event = (before_mean - after_mean) * not_predicted_mask

    # Choose the padding so the pooled output has the same length as the input.
    maxpool_padding = int((maxpool_kernel_size - maxpool_stride) / 2)
    # Max-pool to detect peaks.
    # NOTE(review): pooling runs over all series concatenated, so a value at a
    # series boundary is compared with its neighbour from the adjacent series.
    max_pooling = nn.MaxPool1d(maxpool_kernel_size,
                               stride=maxpool_stride,
                               padding=maxpool_padding)
    event_tensor = torch.tensor(raw_event.values).unsqueeze(0)
    pooled_event_pred = max_pooling(torch.abs(event_tensor)).squeeze(0).numpy()
    event_pred = event_tensor.squeeze(0).numpy()
    # Mask that keeps only positions equal to their local maximum.
    peak_event_pred_mask = np.where(pooled_event_pred == np.abs(event_pred), 1, 0)
    df["event_pred"] = event_pred * peak_event_pred_mask
    df["onset_pred"] = np.clip(-df["event_pred"], 0, 1)
    df["wakeup_pred"] = np.clip(df["event_pred"], 0, 1)
    return df


def make_submission_df(df, threshold=0.01):
    """Convert signed event peaks into submission rows.

    Rows with ``event_pred > threshold`` become ``"wakeup"`` events, rows with
    ``event_pred < -threshold`` become ``"onset"`` events, everything else is
    dropped. ``score`` is ``|event_pred|`` clipped to [0, 1].

    Works on a copy, so the caller's frame is never modified and no
    assignment is made on a slice.

    Args:
        df: frame with ``series_id``, ``step`` and ``event_pred`` columns.
        threshold: minimum absolute ``event_pred`` for a row to count as an event.

    Returns:
        Frame with columns ``series_id``, ``step``, ``event_pred``, ``event``
        and ``score``; the original row index is kept for the retained rows.
    """
    sub = df[["series_id", "step", "event_pred"]].copy()
    event_pred = sub["event_pred"]
    # Vectorised three-way classification; "" marks rows that are not events.
    sub["event"] = np.select(
        [event_pred > threshold, event_pred < -threshold],
        ["wakeup", "onset"],
        default="",
    )
    sub = sub[sub["event"] != ""].copy()
    sub["score"] = sub["event_pred"].abs().clip(0.0, 1.0)
    return sub


In [18]:
def get_valid_values_dict(
    class_values: torch.Tensor,
    validation_dict: dict,
    mode: str = "preds",
) -> dict:
    """Append one batch of values to ``validation_dict["class_<mode>"]``.

    The tensor is detached, moved to CPU and converted to numpy. If the
    stored entry is still empty it is replaced by the batch; otherwise the
    batch is concatenated along axis 0. The dict is updated in place and
    returned.

    NOTE: this definition shadows the function of the same name imported
    from ``training_loop`` earlier in the notebook.
    """
    key = f"class_{mode}"
    batch_values = class_values.detach().cpu().numpy()
    accumulated = validation_dict[key]
    validation_dict[key] = (
        batch_values
        if len(accumulated) == 0
        else np.concatenate([accumulated, batch_values], axis=0)
    )
    return validation_dict

def concat_valid_input_info(valid_input_info: dict, input_info: dict) -> dict:
    """Accumulate the per-batch window metadata as numpy arrays.

    For each of ``series_date_key``, ``start_step`` and ``end_step`` the
    batch value (converted with ``.numpy()``) either becomes the stored
    array (first batch) or is concatenated onto it along axis 0. Whether
    this is the first batch is decided once, from the length of the stored
    ``series_date_key`` entry. The dict is updated in place and returned.

    NOTE: this definition shadows the function of the same name imported
    from ``training_loop`` earlier in the notebook.
    """
    is_first_batch = len(valid_input_info["series_date_key"]) == 0
    for key in ("series_date_key", "start_step", "end_step"):
        batch_array = input_info[key].numpy()
        if is_first_batch:
            valid_input_info[key] = batch_array
        else:
            valid_input_info[key] = np.concatenate(
                [valid_input_info[key], batch_array], axis=0)
    return valid_input_info

def get_pred_df(
    input_info_dict: dict,
    preds_dict: dict,
    pred_df: pd.DataFrame,
    fold: int,
) -> pd.DataFrame:
    """Attach one fold's window predictions to ``pred_df`` as a new column.

    Each window ``i`` covers steps ``start_step[i]`` to ``end_step[i]``
    (inclusive) with a stride of 12. Channel 0 of
    ``preds_dict["class_preds"][i]`` is truncated to that many steps, or
    padded with -1 when it is shorter. The flattened (series_date_key, step,
    prediction) rows are left-merged onto ``pred_df``; rows with no
    prediction get -1.

    Args:
        input_info_dict: dict with ``series_date_key``, ``start_step`` and
            ``end_step`` sequences of equal length.
        preds_dict: dict whose ``class_preds`` entry is indexed as
            [window, channel, position].
        pred_df: frame with ``series_date_key`` and ``step`` columns; an
            existing ``class_pred`` column is dropped first.
        fold: fold number, used in the output column name.

    Returns:
        ``pred_df`` with an extra ``class_pred_fold{fold}`` column.

    Raises:
        ValueError: if a window's predictions and steps differ in length
            after truncation / padding.
    """
    t_begin = time.time()
    print("creating oof_df", end=" ... ")
    if "class_pred" in pred_df.columns:
        pred_df = pred_df.drop(["class_pred"], axis=1)

    keys_flat, steps_flat, preds_flat = [], [], []
    windows = zip(
        input_info_dict["series_date_key"],
        input_info_dict["start_step"],
        input_info_dict["end_step"],
    )
    for row_idx, (series_date_key, start_step, end_step) in enumerate(windows):
        if not isinstance(series_date_key, np.int64):
            series_date_key = series_date_key.numpy()
        # preds shape: [batch, ch, data_length]; only channel 0 is used.
        channel0 = preds_dict["class_preds"][row_idx][0]
        steps = range(start_step, end_step+1, 12)
        n_steps, n_preds = len(steps), len(channel0)
        if n_steps < n_preds:
            class_pred = channel0[:n_steps]
        elif n_steps > n_preds:
            # Pad the tail with -1, the "not predicted" marker.
            class_pred = np.concatenate(
                [channel0, -1 * np.ones(n_steps - n_preds)], axis=0)
        else:
            class_pred = channel0
        if len(class_pred) != len(steps):
            print("len(class_pred)", len(class_pred))
            print("len(steps)", len(steps))
            raise ValueError("preds and step length is not same")
        preds_flat.extend(class_pred)
        steps_flat.extend(steps)
        keys_flat.extend([series_date_key] * len(steps))

    pred_col_name = f"class_pred_fold{fold}"
    fold_pred_df = pd.DataFrame({
        "series_date_key": keys_flat,
        "step": steps_flat,
        pred_col_name: preds_flat,
    })
    t_merge = time.time()
    print("merging oof_df")
    fold_pred_df["series_date_key"] = fold_pred_df["series_date_key"].astype("int64")
    pred_df = pred_df.merge(fold_pred_df,
                            on=["series_date_key", "step"],
                            how="left")
    pred_df[pred_col_name] = pred_df[pred_col_name].fillna(-1)
    merge_elapsed = int(time.time() - t_merge) / 60
    print("merge elapsed time: {:.2f} min".format(merge_elapsed))
    elapsed = int(time.time() - t_begin) / 60
    print(f" >> oof_df created. elapsed time: {elapsed:.2f} min")
    return pred_df


# infer

In [19]:
def split_data(series_df, tmp_file_path, split_num=3):
    """Write ``series_df`` to disk as up to ``split_num`` parquet chunks.

    Series ids are partitioned in order of first appearance into contiguous
    groups of ``len(ids) // split_num``; the last group also takes the
    remainder. Chunk ``i`` is written to
    ``<tmp_file_path>/series_df_split_<i>.parquet``. When there are no more
    distinct series than ``split_num``, the whole frame is written as chunk 0.

    Returns:
        The number of chunks actually written (1 in the single-file case,
        otherwise ``split_num``).
    """
    unique_ids = series_df["series_id"].unique()
    n_ids = len(unique_ids)

    # Too few series to split: dump everything into one file.
    if n_ids <= split_num:
        series_df.to_parquet(
            os.path.join(tmp_file_path, "series_df_split_0.parquet"), index=False)
        return 1

    last_idx = split_num - 1
    for idx in range(split_num):
        chunk_size = n_ids // split_num
        lower = idx * chunk_size
        # The final chunk runs to the end so the remainder ids are not lost.
        upper = None if idx == last_idx else lower + chunk_size
        chunk_ids = unique_ids[lower:upper]
        chunk_df = series_df[series_df["series_id"].isin(chunk_ids)].reset_index(drop=True)
        print(f"series_df_split_{idx} data num : {len(chunk_df)}")
        split_df_path = os.path.join(tmp_file_path, f"series_df_split_{idx}.parquet")
        chunk_df.to_parquet(split_df_path, index=False)
        print(f"split_df is saved as {split_df_path}")

    return split_num


In [20]:
def predict(CFG, model, infer_loader):
    """Run ``model`` over ``infer_loader`` and collect predictions and metadata.

    Args:
        CFG: config object; only ``CFG.device`` is read here.
        model: module called as ``model(inputs)``; put into eval mode.
        infer_loader: iterable yielding ``(inputs, input_info_dict)`` pairs.

    Returns:
        Tuple ``(infer_predictions, infer_input_info)``: the first holds the
        concatenated outputs under ``"class_preds"``, the second the
        concatenated ``series_date_key`` / ``start_step`` / ``end_step``
        arrays. Both keep their empty initial values if the loader yields
        nothing.
    """
    model.eval()

    infer_predictions = {"class_preds": np.empty(0)}
    infer_input_info = {"series_date_key": [], "start_step": [], "end_step": []}

    # Pre-bind so the cleanup below does not fail when the loader is empty.
    inputs = preds = None
    for inputs, input_info_dict in infer_loader:
        inputs = inputs.to(CFG.device, non_blocking=True).float()
        with torch.no_grad():
            preds = model(inputs)

        infer_predictions = get_valid_values_dict(preds, infer_predictions, mode="preds")
        infer_input_info = concat_valid_input_info(infer_input_info, input_info_dict)

    # Drop the last batch's tensors before asking torch to release cached memory.
    del inputs, preds
    gc.collect()
    torch.cuda.empty_cache()
    return infer_predictions, infer_input_info


def inference(CFG, exp_dir, exp_name, series_df_path, tmp_file_path, split_num=3):
    """Run fold-ensemble inference chunk by chunk and build the event table.

    For every chunk file ``series_df_split_<i>.parquet`` in ``tmp_file_path``:
    preprocess the series, predict with each fold's best checkpoint, average
    the fold predictions, detect event peaks and write
    ``pred_df_split_<i>.csv`` and ``sub_df_split_<i>.csv``. Finally all
    per-chunk submission files are concatenated.

    If any expected chunk file is missing (e.g. on a fresh kernel) the series
    at ``series_df_path`` are read and split with ``split_data`` first, and the
    number of chunks it reports is used.

    Args:
        CFG: config namespace; ``CFG.folds`` and ``CFG.device`` are read here,
            and it is also passed to ``get_model`` / ``get_loader``.
        exp_dir: directory containing the experiment folder.
        exp_name: experiment folder name holding ``fold{n}_best_model.pth``.
        series_df_path: parquet file with the raw series (used only when the
            chunk files have to be created).
        tmp_file_path: scratch directory for chunk and result files.
        split_num: requested number of chunks.

    Returns:
        DataFrame with ``series_id``, ``step``, ``event`` and ``score``.
    """
    # Create the chunk files when they are not already on disk.
    expected_chunks = [
        os.path.join(tmp_file_path, f"series_df_split_{idx}.parquet")
        for idx in range(split_num)
    ]
    if not all(os.path.exists(path) for path in expected_chunks):
        series_df = pd.read_parquet(series_df_path)
        split_num = split_data(series_df, tmp_file_path, split_num)
        del series_df
        gc.collect()

    for idx in range(split_num):
        print("split idx:", idx)
        series_df = pl.read_parquet(
            os.path.join(tmp_file_path, f"series_df_split_{idx}.parquet")
        )
        if len(series_df)==0:
            # Still write an (empty) result so the final concat finds a file.
            sub_df_split = pd.DataFrame({ "series_id":[],
                                          "step":[],
                                          "event":[],
                                          "score":[]
                                        })
            sub_df_split_path = os.path.join(tmp_file_path, f"sub_df_split_{idx}.csv")
            sub_df_split.to_csv(sub_df_split_path, index=False)
            continue
        series_df = pl_datetime_preprocess(series_df)
        series_df = series_df.to_pandas()
        series_df = preprocess_input(series_df)
        series_df = set_seriesdatekey(series_df)
        series_df = label_encode_series_date_key(series_df)
        key_df = series_df[["series_date_key", "series_date_key_str"]].drop_duplicates()
        key_df = key_df.reset_index(drop=True)

        # Split "<series_id>_<date>" back into its parts. The columns must be
        # assigned one by one: tuple-unpacking a DataFrame iterates over its
        # column labels (0, 1), not over the columns themselves.
        key_parts = key_df["series_date_key_str"].str.rsplit("_", n=1, expand=True)
        key_df["series_id"] = key_parts[0]
        key_df["date"] = key_parts[1]
        key_df = key_df.drop(columns=["series_date_key_str"])

        pred_df = series_df[["series_id", "series_date_key", "step", "second", "minute"]].copy()
        for fold in CFG.folds:
            print(f"-- fold{fold} inference start --")
            # Build the model and load this fold's best checkpoint.
            model = get_model(CFG)
            model_path = os.path.join(exp_dir, exp_name, f"fold{fold}_best_model.pth")
            print("model loading", model_path)
            # Load onto CPU first so the checkpoint opens regardless of the
            # device it was saved from; the model is moved afterwards.
            model.load_state_dict(torch.load(model_path, map_location="cpu"))
            model = model.to(CFG.device)
            infer_loader = get_loader(CFG, key_df, series_df, mode="test")
            infer_preds, infer_input_dict = predict(CFG, model, infer_loader)
            print(f"split[{idx}] fold[{fold}] prediction finished.")
            pred_df = get_pred_df(
                infer_input_dict,
                infer_preds,
                pred_df,
                fold,
            )
            del infer_preds, infer_input_dict, infer_loader, model
            gc.collect()
            torch.cuda.empty_cache()

        # Ensemble: plain mean over the per-fold prediction columns.
        fold_cols = [f"class_pred_fold{fold}" for fold in CFG.folds]
        pred_df["class_pred"] = pred_df[fold_cols].mean(axis=1)
        pred_df = pred_df.drop(columns=fold_cols)
        pred_df = detect_event_from_downsample_classpred(pred_df, tmp_file_path)
        pred_df.to_csv(os.path.join(tmp_file_path, f"pred_df_split_{idx}.csv"))

        sub_df_split = make_submission_df(pred_df)
        sub_df_split = sub_df_split.drop("event_pred", axis=1)
        sub_df_split = sub_df_split.reset_index(drop=True)
        print(f"sub_df_split_{idx} data num : {len(sub_df_split)}")
        sub_df_split_path = os.path.join(tmp_file_path, f"sub_df_split_{idx}.csv")
        sub_df_split.to_csv(sub_df_split_path, index=False)
        print(f"sub_df_split is saved as {sub_df_split_path}")
        del sub_df_split, pred_df, series_df, key_df
        gc.collect()
        torch.cuda.empty_cache()

    # Read every chunk result once and concatenate in a single call.
    sub_parts = [
        pd.read_csv(os.path.join(tmp_file_path, f"sub_df_split_{idx}.csv"))
        for idx in range(split_num)
    ]
    non_empty_parts = [part for part in sub_parts if len(part) > 0]
    if non_empty_parts:
        sub_df = pd.concat(non_empty_parts, axis=0)
    elif sub_parts:
        # No events anywhere: keep the first (empty) frame for its columns.
        sub_df = sub_parts[0]
    else:
        sub_df = pd.DataFrame()
    del sub_parts, non_empty_parts
    gc.collect()
    torch.cuda.empty_cache()
    sub_df = sub_df.reset_index(drop=True)
    print(f"sub_df data num : {len(sub_df)}")
    return sub_df


In [21]:
# Load the experiment config; the context manager closes the file handle,
# which the bare open() call inside yaml.load() never did.
with open(config_path, "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.SafeLoader)
# Expose the config keys as attributes (config.seed, config.folds, ...).
config = argparse.Namespace(**config)
print(config)
seed_everything(config.seed)
print("--")
print("infer start")
sub_df = inference(config, exp_dir, exp_name, series_df_path, tmp_file_path, split_num=dataframe_split_num)


Namespace(T_0=30, T_mult=1, ave_kernel_size=301, batch_size=64, class_loss_weight=1.0, class_output_channels=1, competition_dir='/kaggle/input/child-mind-institute-detect-sleep-states', competition_name='dss', device='cuda', embedding_base_channels=16, eta_min=1e-09, event_df='/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv', event_loss_weight=1.0, event_output_channels=2, exp_category='earlysave', exp_dir='/kaggle/working/exp020_dense_chh_skffold_epoch30', exp_name='exp020_dense_chh_skffold_epoch30', folds=[0, 1, 2, 3, 4], group_key='series_id', input_channels=6, input_dir='/kaggle/input', key_df='/kaggle/input/datakey_unique_non_null.csv', logger_path='/kaggle/working/exp020_dense_chh_skffold_epoch30/train.log', lr=0.001, maxpool_kernel_size=11, model_type='input_target_downsample_dense', n_epoch=30, n_folds=5, num_workers=2, output_channels=2, output_dir='/kaggle/working', print_freq=50, pseudo_weight_exp='exp003', seed=42, series_df='/kaggle/input/targetdown

get anglez and enmo rolling mean and std
-- fold0 inference start --
model type =  input_target_downsample_dense
model loading /kaggle/working/exp020_dense_chh_skffold_epoch30/fold0_best_model.pth
split[0] fold[0] prediction finished.
creating oof_df ... merging oof_df
merge elapsed time: 0.03 min
 >> oof_df created. elapsed time: 0.05 min
-- fold1 inference start --
model type =  input_target_downsample_dense
model loading /kaggle/working/exp020_dense_chh_skffold_epoch30/fold1_best_model.pth
split[0] fold[1] prediction finished.
creating oof_df ... merging oof_df
merge elapsed time: 0.03 min
 >> oof_df created. elapsed time: 0.05 min
-- fold2 inference start --
model type =  input_target_downsample_dense
model loading /kaggle/working/exp020_dense_chh_skffold_epoch30/fold2_best_model.pth
split[0] fold[2] prediction finished.
creating oof_df ... merging oof_df
merge elapsed time: 0.03 min
 >> oof_df created. elapsed time: 0.05 min
-- fold3 inference start --
model type =  input_target_d

In [22]:
# Add a provisional row_id and put the columns in submission order.
sub_df = sub_df.assign(row_id=range(len(sub_df)))[
    ["row_id", "series_id", "step", "event", "score"]
]
# With no detected events, fall back to the competition's sample submission.
if sub_df.shape[0] == 0:
    sub_df = pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv")
# Sort by series and step, then renumber row_id to match the final order.
sub_df = sub_df.sort_values(["series_id", "step"]).reset_index(drop=True)
sub_df["row_id"] = range(len(sub_df))
sub_df.to_csv("submission.csv", index=False)
display(sub_df)


Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,0,wakeup,0.047167
1,1,038441c925bb,96,wakeup,0.019387
2,2,038441c925bb,144,wakeup,0.024401
3,3,038441c925bb,252,wakeup,0.015829
4,4,038441c925bb,444,wakeup,0.033743
...,...,...,...,...,...
99278,99278,fe90110788d2,589272,onset,0.013174
99279,99279,fe90110788d2,589344,wakeup,0.015000
99280,99280,fe90110788d2,591576,onset,0.034129
99281,99281,fe90110788d2,591720,onset,0.012993


In [25]:
# Number of distinct series_id values present in the submission frame.
sub_df["series_id"].nunique()


277

In [26]:
from dss_metrics import score

# Ground-truth events; print how many series they cover before filtering.
event_df = pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")
print(event_df["series_id"].nunique())
# Keep only complete rows that belong to a series present in the submission.
event_df = event_df.dropna().loc[
    lambda events: events["series_id"].isin(sub_df["series_id"].unique())
]

print(score(event_df, sub_df))


277
0.7417883515684862
