In [1]:
import os
import datetime
import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from tqdm.notebook import tqdm

from sklearn.metrics.pairwise import cosine_similarity 

import warnings
warnings.filterwarnings('ignore')


# functions

In [2]:
def get_date(df_):
    """Parse ``timestamp``, patch missing values and add a string ``date`` column.

    The steps run in this order, each on the result of the previous one:

    1. parse the ``timestamp`` strings to datetimes and remove the time zone;
    2. fill null timestamps with the previous row's timestamp plus one day;
    3. fill timestamps that are still null with the next row's timestamp
       minus one day;
    4. add ``date``: the calendar date of ``timestamp`` cast to a string.

    The shifts in steps 2 and 3 run over the whole frame (there is no grouping
    by series here).

    Returns the frame with ``timestamp`` rewritten and ``date`` added.
    """
    ts = pl.col("timestamp")
    one_day = datetime.timedelta(days=1)
    # Separate with_columns calls: every step must see the output of the one before it.
    return (
        df_
        .with_columns(ts.str.to_datetime().dt.replace_time_zone(None))
        .with_columns(ts.fill_null(ts.shift(1) + one_day))
        .with_columns(ts.fill_null(ts.shift(-1) + (-one_day)))
        .with_columns(ts.dt.date().cast(str).alias("date"))
    )


In [3]:
def set_seriesdate_key(df_):
    """Add a ``key`` column built as ``<series_id>_<date>``.

    ``date`` is cast to a string before the two columns are joined with an
    underscore. Returns the frame with the extra ``key`` column.
    """
    key_parts = [pl.col("series_id"), pl.col("date").cast(str)]
    key_expr = pl.concat_str(key_parts, separator="_").alias("key")
    return df_.with_columns(key_expr)


In [4]:
def detect_similarity_data(anglez, before_date_anglez, cut_window_num=4, only_cos_sim_thr=0.995):
    """Decide whether ``anglez`` should be passed to the detection model.

    Both inputs are cut along axis 1 into ``cut_window_num`` windows of equal
    width (``n_columns // cut_window_num``); trailing columns that do not fill a
    whole window are ignored. For every window the cosine similarity between
    the two inputs is computed, and the data is treated as a repeat of
    ``before_date_anglez`` as soon as any similarity in any window exceeds
    ``only_cos_sim_thr``.

    Parameters
    ----------
    anglez : 2-D array
        Signal to classify. The caller in this notebook passes shape (1, n).
    before_date_anglez : 2-D array
        Reference signal with the same number of columns as ``anglez``.
    cut_window_num : int
        Number of windows the columns are split into.
    only_cos_sim_thr : float
        Similarity above which a window counts as a repeat.

    Returns
    -------
    int
        1 if the data should be fed to the detection model, 0 if it should be
        skipped and treated as containing no detections.
    """
    window_size = anglez.shape[1] // cut_window_num
    if window_size == 0:
        # Fewer columns than windows: every window would be empty and
        # cosine_similarity cannot be evaluated on zero features. Nothing can be
        # compared, so the data is not flagged as a repeat.
        return 1

    for window_idx in range(cut_window_num):
        start = window_idx * window_size
        stop = start + window_size
        window_similarity = cosine_similarity(
            anglez[:, start:stop], before_date_anglez[:, start:stop]
        )
        # np.any reduces the similarity matrix explicitly; a bare `if matrix > thr`
        # only works when the matrix happens to be 1x1.
        if np.any(window_similarity > only_cos_sim_thr):
            # One matching window is enough, so the remaining ones are skipped.
            return 0
    return 1


# Read data

In [5]:
# Load the series table. Later cells read its "key", "series_id", "step" and "anglez" columns.
series_df = pl.read_parquet("/kaggle/input/train_series_addkey.parquet")
# Alternative input file, kept for reference:
# series_df = pl.read_parquet("/kaggle/input/train_series_alldata_skffold.parquet")


: 

In [None]:
# Load the training events CSV from the competition input directory.
event_df = pl.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")


# preprocess

In [None]:
# get_date must run first: set_seriesdate_key builds "key" from the "date" column it adds.
event_df = set_seriesdate_key(get_date(event_df))


# get similar data

In [None]:
# Distinct values of "key" as a plain Python list, in the order the unique() call keeps them.
series_keys = (
    series_df
    .select("key")
    .unique(subset="key", maintain_order=True)
    .get_column("key")
    .to_list()
)
print(len(series_keys))


7746


In [None]:
# Parallel output lists: one entry per closed interval of consecutive keys whose
# anglez was flagged (detect_pred == 0) as a repeat of the preceding key's anglez.
series_id_list, key_list = [], []
min_step_list, max_step_list = [], []
# NOTE: key_list, detect_pred_list and detect_target_list are initialised here
# but never filled by this cell.
detect_pred_list, detect_target_list = [], []


def _close_interval(closed_series_id, first_step, last_step):
    """Append one finished interval to the three parallel output lists."""
    series_id_list.append(closed_series_id)
    min_step_list.append(first_step)
    max_step_list.append(last_step)


# Placeholder "previous" signal for the first key; its width only matches a
# one-sample signal, so normally the first key falls through to detect_pred = 1.
before_date_anglez = np.zeros((1, 1))
loader = tqdm(series_keys, total=len(series_keys))

# State of the interval currently being built.
series_id_log = ""                     # series of the most recently opened interval
min_step_log = max_step_log = None     # step range of that interval
tmp_detect_pred = 1                    # detect_pred of the previous key

for key in loader:
    target_series = series_df.filter(pl.col("key") == key)
    min_step = target_series.get_column("step").min()
    max_step = target_series.get_column("step").max()
    series_id = target_series.get_column("series_id")[0]
    anglez = np.array(target_series.get_column("anglez")).reshape(1, -1)

    # Signals of different length cannot be compared window by window; such a
    # key is always sent to the detection model.
    # NOTE(review): the previous key may belong to a different series, so the
    # first key of a series is compared against the last key of the one before.
    if anglez.shape[1] == before_date_anglez.shape[1]:
        detect_pred = detect_similarity_data(anglez, before_date_anglez)
    else:
        detect_pred = 1
    before_date_anglez = anglez

    if series_id_log != series_id:
        # The current series differs from the one of the last opened interval.
        # An interval still open from that earlier series ends here.
        if series_id_log != "" and tmp_detect_pred == 0:
            _close_interval(series_id_log, min_step_log, max_step_log)
        if detect_pred == 0:
            # A flagged run starts in the current series.
            min_step_log = min_step
            max_step_log = max_step
            series_id_log = series_id
    elif tmp_detect_pred == 0 and detect_pred == 0:
        # Same series, flagged run continues: extend the interval.
        max_step_log = max_step
    elif tmp_detect_pred == 0:
        # Same series, flagged run just ended: record it.
        _close_interval(series_id_log, min_step_log, max_step_log)
    elif detect_pred == 0:
        # Same series, a new flagged run starts.
        min_step_log = min_step
        max_step_log = max_step
        series_id_log = series_id
    # Otherwise: same series and unflagged data continues; nothing to do.

    tmp_detect_pred = detect_pred

# A run that is still open after the last key has no later key to trigger its
# recording, so it is flushed here; without this the final interval is lost.
if series_id_log != "" and tmp_detect_pred == 0:
    _close_interval(series_id_log, min_step_log, max_step_log)
    


  0%|          | 0/7746 [00:00<?, ?it/s]

In [None]:
# One row per interval collected by the loop above.
interval_columns = {
    "series_id": series_id_list,
    # NOTE(review): "key" is filled with the series ids again, not with
    # per-key values -- confirm this is what downstream code expects.
    "key": series_id_list,
    "min_step": min_step_list,
    "max_step": max_step_list,
}
df = pd.DataFrame(interval_columns)
display(df)


Unnamed: 0,series_id,min_step,max_step
0,03d92c9f6f8a,92160,178559
1,03d92c9f6f8a,282240,714239
2,0402a003dae9,56340,90899
3,04f547b8017d,471600,627119
4,062cae666e2a,53460,208979
...,...,...,...
271,f981a0805fd0,22680,39959
272,f981a0805fd0,91800,402839
273,fa149c3c4bde,346320,398159
274,fb223ed2278c,108720,143279


検出モデルにかけるべきもののpred, targetを1, 未検出とすべきものを0とする

In [None]:
# Write the interval table to the working directory, without the pandas index.
df.to_parquet("/kaggle/working/detect_target_series.parquet", index=False)
