In [11]:
import datetime
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import polars as pl
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity 
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')


# Functions

In [2]:
def get_date(df_):
    """Parse the "timestamp" column and add a string "date" column.

    Steps, applied in order:
      1. Parse the string "timestamp" column to a datetime and strip its
         time-zone information (``replace_time_zone(None)``).
      2. Fill null timestamps with the previous row's timestamp plus one day.
      3. Fill timestamps that are still null with the next row's timestamp
         minus one day.
      4. Add a "date" column holding the calendar date cast to a string.

    Args:
        df_: polars DataFrame with a string "timestamp" column.

    Returns:
        A new DataFrame with "timestamp" converted and "date" added.
    """
    timestamp = pl.col("timestamp")
    day_forward = datetime.timedelta(days=1)
    day_backward = datetime.timedelta(days=-1)

    parsed = df_.with_columns(
        timestamp.str.to_datetime().dt.replace_time_zone(None)
    )
    # Each fill must see the result of the previous step, so the passes are
    # kept as separate with_columns calls rather than merged into one.
    filled_from_previous = parsed.with_columns(
        timestamp.fill_null(timestamp.shift(1) + day_forward)
    )
    filled_from_next = filled_from_previous.with_columns(
        timestamp.fill_null(timestamp.shift(-1) + day_backward)
    )
    return filled_from_next.with_columns(
        timestamp.dt.date().cast(str).alias("date")
    )


In [3]:
def set_seriesdate_key(df_):
    """Add a "key" column of the form "<series_id>_<date>".

    Args:
        df_: polars DataFrame with "series_id" and "date" columns.

    Returns:
        A new DataFrame with the additional "key" column.
    """
    key_parts = [pl.col("series_id"), pl.col("date").cast(str)]
    series_date_key = pl.concat_str(key_parts, separator="_").alias("key")
    return df_.with_columns(series_date_key)


In [4]:
def detect_similarity_data(anglez, before_date_anglez, cut_window_num=4, only_cos_sim_thr=0.995):
    """Decide whether a day's anglez signal should be sent to the detection model.

    Both inputs are split column-wise into ``cut_window_num`` equal windows.
    If the cosine similarity between the current and the previous signal
    exceeds ``only_cos_sim_thr`` in at least one window, the day is treated as
    a repeat of the previous one and is skipped.

    Args:
        anglez: 2-D array (rows x samples) for the current day.
        before_date_anglez: 2-D array for the previous day.
        cut_window_num: Number of equal-width windows to compare.
        only_cos_sim_thr: Similarity strictly above this marks a window as a
            duplicate.

    Returns:
        int: 1 if the day should be fed to the detection model, 0 if it should
        be skipped and treated as having no detections.

    Notes:
        Columns beyond ``cut_window_num * window_size`` (the remainder when
        the length is not divisible) are not compared.
    """
    # Signals that cannot be compared column-for-column are never flagged as
    # duplicates; they go to the detection model.
    if anglez.shape[1] != before_date_anglez.shape[1]:
        return 1

    window_size = anglez.shape[1] // cut_window_num
    # Too few samples to form even one non-empty window: nothing to compare.
    if window_size < 1:
        return 1

    for window_idx in range(cut_window_num):
        start = window_idx * window_size
        stop = start + window_size
        anglez_similarity = cosine_similarity(
            anglez[:, start:stop], before_date_anglez[:, start:stop]
        )
        # np.any keeps this well-defined for multi-row inputs, where a bare
        # `if array > thr` would raise; one matching window is enough.
        if np.any(anglez_similarity > only_cos_sim_thr):
            return 0
    return 1


# Read data

In [5]:
# Load the per-step series table. Later cells read its "key", "step",
# "series_id" and "anglez" columns.
# NOTE(review): the file name suggests "key" was added in an earlier
# preprocessing step that is not part of this notebook -- confirm.
series_df = pl.read_parquet("/kaggle/input/train_series_addkey.parquet")


In [6]:
# Load the event annotations. Later cells use its "timestamp", "series_id"
# and "step" columns; a null "step" is treated as "no event recorded".
event_df = pl.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")


# Preprocess

In [7]:
# Derive the "date" column first, then build the "<series_id>_<date>" key from it.
event_df = set_seriesdate_key(get_date(event_df))


# Detect days whose signal repeats the previous day

In [8]:
# Distinct series/date keys in first-appearance order, as a plain Python list.
series_keys = series_df.get_column("key").unique(maintain_order=True).to_list()
print(len(series_keys))


7746


In [13]:
series_id_list, key_list = [], []
min_step_list, max_step_list = [], []
detect_pred_list, detect_target_list = [], []

# Keys that have at least one event row with a non-null step. Built once here
# instead of re-filtering event_df for every key inside the loop.
keys_with_event = set(
    event_df.filter(pl.col("step").is_not_null())
    .get_column("key")
    .drop_nulls()
    .to_list()
)

# Signal and series id of the previously processed key; None until the first
# iteration has run.
before_date_anglez = None
before_series_id = None

loader = tqdm(enumerate(series_keys), total=len(series_keys))
for idx, key in loader:
    target_series = series_df.filter(pl.col("key") == key)
    min_step = target_series.get_column("step").min()
    max_step = target_series.get_column("step").max()
    series_id = target_series.get_column("series_id")[0]

    # target: 1 when the key has at least one recorded event step, else 0.
    detect_target = int(key in keys_with_event)

    anglez = np.array(target_series.get_column("anglez")).reshape(1, -1)

    # Only compare against the previous day of the SAME series, and only when
    # both days have the same number of samples. The first day of a series
    # must not be compared with the last day of a different series.
    comparable = (
        before_date_anglez is not None
        and series_id == before_series_id
        and anglez.shape[1] == before_date_anglez.shape[1]
    )
    if comparable:
        detect_pred = detect_similarity_data(anglez, before_date_anglez)
    else:
        detect_pred = 1

    # Reuse the array built above instead of converting the column again.
    before_date_anglez = anglez
    before_series_id = series_id

    series_id_list.append(series_id)
    key_list.append(key)
    min_step_list.append(min_step)
    max_step_list.append(max_step)
    detect_pred_list.append(detect_pred)
    detect_target_list.append(detect_target)


  0%|          | 0/7746 [00:00<?, ?it/s]

In [14]:
# Assemble the per-key results collected by the loop into a single table.
df = pd.DataFrame(
    dict(
        series_id=series_id_list,
        key=key_list,
        min_step=min_step_list,
        max_step=max_step_list,
        pred=detect_pred_list,
        target=detect_target_list,
    )
)
display(df)


Unnamed: 0,series_id,key,min_step,max_step,pred,target
0,038441c925bb,038441c925bb_2018-08-14,0,3239,1,0
1,038441c925bb,038441c925bb_2018-08-15,3240,20519,1,1
2,038441c925bb,038441c925bb_2018-08-16,20520,37799,1,1
3,038441c925bb,038441c925bb_2018-08-17,37800,55079,1,1
4,038441c925bb,038441c925bb_2018-08-18,55080,72359,1,1
...,...,...,...,...,...,...
7741,fe90110788d2,fe90110788d2_2017-09-04,520200,537479,1,1
7742,fe90110788d2,fe90110788d2_2017-09-05,537480,554759,1,1
7743,fe90110788d2,fe90110788d2_2017-09-06,554760,572039,1,1
7744,fe90110788d2,fe90110788d2_2017-09-07,572040,589319,1,1


検出モデルにかけるべきもののpred, targetを1, 未検出とすべきものを0とする

In [15]:
# df.to_parquet("detect_target_series.parquet", index=False)


In [16]:
# Confusion matrix of target (rows) vs. pred (columns).
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking below still
# works when one of the classes is absent from the data.
conf_mat = confusion_matrix(df["target"].values, df["pred"].values, labels=[0, 1])
tn, fp, fn, tp = conf_mat.ravel()
print("tn:", tn, "  fp:", fp, "  fn:", fn, "  tp:",tp)


tn: 1563   fp: 1368   fn: 19   tp: 4796
