In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from scipy.signal import savgol_filter, medfilt, find_peaks
from scipy.ndimage import binary_closing, binary_dilation
from scipy.signal.windows import triang
from scipy.ndimage import convolve1d
from tqdm.notebook import tqdm

import cmi_preprocessing
import cmi_feature_extraction

### Load train and test data

In [None]:
# Training time series from the Kaggle competition input directory.
train_datapath = (
    "/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet"
)
raw_train_df = pd.read_parquet(train_datapath)

In [None]:
# Test time series from the Kaggle competition input directory.
test_datapath = (
    "/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet"
)
raw_test_df = pd.read_parquet(test_datapath)

### Resample data to 1min instead of 5sec frequency

In [None]:
# Bound "enmo" to the range [0, 3] before resampling; clipping is idempotent,
# so re-running this cell leaves the column unchanged.
raw_train_df["enmo"] = raw_train_df["enmo"].clip(lower=0.0, upper=3.0)
ts_train_1min_df = cmi_preprocessing.resample_timeseries_data_to_1min(
    raw_train_df
)

In [None]:
# Apply the same [0, 3] bound to "enmo" in the test data, then resample.
raw_test_df["enmo"] = raw_test_df["enmo"].clip(lower=0.0, upper=3.0)
ts_test_1min_df = cmi_preprocessing.resample_timeseries_data_to_1min(
    raw_test_df
)

### Add binary target column that indicates whether people are asleep

In [None]:
# Labelled training events from the Kaggle competition input directory.
events_path = (
    "/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv"
)
raw_events_df = pd.read_csv(events_path)

In [None]:
# Preprocess a copy so that raw_events_df stays untouched and this cell can be
# re-run safely.
events_df = cmi_preprocessing.preprocess_events(raw_events_df.copy())

In [None]:
# Attach the "asleep" target (used by later cells) to the 1-minute training data.
# NOTE(review): the frame is rebound to the helper's result, so re-running this
# cell feeds the already-labelled frame back in -- confirm the helper tolerates that.
ts_train_1min_df = cmi_preprocessing.add_asleep_target_to_timeseries_data(
    ts_train_1min_df,
    events_df,
)

### Extract rolling features

In [None]:
def calculate_first_order_variation(ts_1min_df, var_col):
    """Return the absolute row-to-row change of ``var_col`` within each series.

    Differences are taken separately per ``series_id`` group, so the first row
    of every series has no predecessor; that missing difference (and any other
    NaN difference) is reported as 0.

    Parameters
    ----------
    ts_1min_df : pandas.DataFrame
        Frame containing a ``series_id`` column and ``var_col``.
    var_col : str
        Name of the column to differentiate.

    Returns
    -------
    pandas.Series
        Absolute first differences, indexed like ``ts_1min_df``.
    """
    per_series_values = ts_1min_df.groupby(by="series_id")[var_col]
    return per_series_values.diff().abs().fillna(0)

In [None]:
# Add the first-order variation of both signals to the train and test frames.
variation_cols = {
    "anglez_1min_1v": "anglez_1min_mean",
    "enmo_1min_1v": "enmo_1min_mean",
}
for ts_df in (ts_train_1min_df, ts_test_1min_df):
    for out_col, src_col in variation_cols.items():
        ts_df[out_col] = calculate_first_order_variation(ts_df, src_col)

In [None]:
# Threshold = median "enmo_1min_mean" over the training rows labelled awake
# (asleep == 0). The same training-derived threshold is applied to the test data.
is_awake = ts_train_1min_df["asleep"] == 0
awake_df = ts_train_1min_df[is_awake]
awake_enmo_median = awake_df["enmo_1min_mean"].median()
ts_train_1min_df["awake_count"] = ts_train_1min_df["enmo_1min_mean"].gt(awake_enmo_median)
ts_test_1min_df["awake_count"] = ts_test_1min_df["enmo_1min_mean"].gt(awake_enmo_median)

In [None]:
# Rolling-window lengths handed to the rolling feature extractor.
roll_freqs = ["15min", "30min", "1H"]
# Aggregations applied to each of the four signal columns.
agg_funcs = ["mean", "std", "max"]
# Every signal column shares the same aggregation list; the boolean
# "awake_count" flag is only averaged.
agg_dict = {
    **dict.fromkeys(
        ("enmo_1min_mean", "enmo_1min_1v", "anglez_1min_mean", "anglez_1min_1v"),
        agg_funcs,
    ),
    "awake_count": "mean",
}

In [None]:
# Rolling features for the training data, one aggregation set per window length.
ts_train_feat_df = cmi_feature_extraction.extract_rolling_features(
    ts_train_1min_df,
    agg_dict,
    roll_freqs,
)

In [None]:
# Rolling features for the test data, using the same configuration as training.
ts_test_feat_df = cmi_feature_extraction.extract_rolling_features(
    ts_test_1min_df,
    agg_dict,
    roll_freqs,
)

### Put extra emphasis on transition periods

In [None]:
def rectangular_transition_sample_weights(sample_weights, transition_idxs, transition_weight, window_size=31):
    """Add a flat bonus weight to every sample close to a transition.

    Each index in ``transition_idxs`` is widened into a window of
    ``window_size`` samples (via binary dilation), and ``transition_weight``
    is added to the base weight of every sample inside such a window.
    Overlapping windows do not stack: a sample receives the bonus once.

    The input array is left untouched; a new array is returned. (Adding in
    place would silently alter the caller's array and fails outright when an
    integer array is combined with a fractional weight.)

    Parameters
    ----------
    sample_weights : array_like, 1-D
        Base weight of every sample.
    transition_idxs : array_like of int
        Positions (into ``sample_weights``) of the transition samples.
    transition_weight : float
        Bonus added inside each transition window.
    window_size : int, default 31
        Width, in samples, of the window placed around each transition.

    Returns
    -------
    numpy.ndarray
        New array: ``sample_weights`` plus the rectangular bonus.
    """
    base_weights = np.asarray(sample_weights)
    # Boolean mask that is True at the transition positions only.
    transition_mask = np.zeros(base_weights.shape, dtype=bool)
    transition_mask[transition_idxs] = True
    # Grow each marked position into a window of `window_size` samples.
    transition_mask = binary_dilation(transition_mask, structure=np.ones(window_size))
    # Out-of-place addition: numpy picks a result dtype that can hold the sum.
    return base_weights + transition_mask * transition_weight

In [None]:
def triangular_transition_sample_weights(sample_weights, transition_idxs, transition_weight, window_size=31):
    """Add a triangular bonus weight that peaks at each transition.

    A unit impulse is placed at every index in ``transition_idxs`` and
    convolved with a triangular window of ``window_size`` samples scaled by
    ``transition_weight``. The result is added to the base weights, so the
    bonus is largest at the transition and decays linearly on either side.
    Bonuses of nearby transitions add up.

    The input array is left untouched; a new array is returned.

    Parameters
    ----------
    sample_weights : array_like, 1-D
        Base weight of every sample.
    transition_idxs : array_like of int
        Positions (into ``sample_weights``) of the transition samples.
    transition_weight : float
        Height of the triangular bonus at the transition itself.
    window_size : int, default 31
        Width, in samples, of the triangular window.

    Returns
    -------
    numpy.ndarray
        New array: ``sample_weights`` plus the triangular bonus.
    """
    base_weights = np.asarray(sample_weights)
    # Impulses are always float so the fractional triangle values are not
    # truncated when the base weights happen to be integers.
    impulses = np.zeros(base_weights.shape, dtype=float)
    impulses[transition_idxs] = 1.0
    tri_window = triang(window_size) * transition_weight
    # Zero padding beyond the array ends: the default "reflect" mode would
    # mirror impulses that sit near a boundary back into the array and
    # inflate the weights there.
    bonus = convolve1d(impulses, tri_window, mode="constant", cval=0.0)
    return base_weights + bonus

In [None]:
# Rows of the feature frame whose (step, series_id) matches a labelled event.
# NOTE(review): the "index" column is used as a position into the weight
# array, which assumes ts_train_feat_df carries a default 0..n-1 index -- confirm.
transition_idxs = (
    ts_train_feat_df.reset_index()
    .merge(events_df, on=["step", "series_id"])["index"]
    .values
)
# Start from uniform weights and add a triangular bonus around each transition.
sample_weights = triangular_transition_sample_weights(
    np.ones(len(ts_train_feat_df)),
    transition_idxs,
    transition_weight=10,
    window_size=31,
)

### Train model

In [None]:
target_col = "asleep"
# Every column except identifiers, the target and the raw awake flag is a feature.
feat_cols = (
    ts_train_feat_df.columns
    .drop(["timestamp", "series_id", "step", "asleep", "awake_count"])
    .tolist()
)
X_train = ts_train_feat_df.loc[:, feat_cols].copy()
y_train = ts_train_feat_df.loc[:, target_col].copy()

In [None]:
clf = CatBoostClassifier(
    n_estimators=500,
    cat_features=["hour"],
    verbose=10,
)
clf.fit(X_train, y_train, sample_weight=sample_weights)
# Cell output: score on the training data itself, not a held-out estimate.
clf.score(X_train, y_train)

### Make predictions

In [None]:
def filter_predictions(y_proba, window_size):
    """Turn noisy per-sample probabilities into a cleaned-up binary mask.

    Steps:
      1. smooth ``y_proba`` with a second-order Savitzky-Golay filter,
      2. threshold the smoothed curve at 0.5,
      3. apply a binary closing with a flat structuring element, which fills
         gaps shorter than the window inside positive runs.

    Series shorter than ``window_size`` are supported: the smoothing window is
    shrunk to the largest odd length that fits the series (``savgol_filter``
    raises ``ValueError`` otherwise), and smoothing is skipped altogether when
    the window cannot exceed the polynomial order.

    Parameters
    ----------
    y_proba : array_like, 1-D
        Predicted probability per sample.
    window_size : int
        Length of the smoothing window and of the closing structure. An even
        value is bumped to the next odd number for the closing structure.

    Returns
    -------
    numpy.ndarray of bool
        Filtered binary prediction, same length as ``y_proba``.
    """
    polyorder = 2
    y_proba = np.asarray(y_proba)
    if y_proba.size == 0:
        # Nothing to filter; keep the return type consistent.
        return np.zeros(0, dtype=bool)

    smooth_window = window_size
    if smooth_window > len(y_proba):
        # Shrink to the largest odd window that fits the series.
        smooth_window = len(y_proba)
        if smooth_window % 2 == 0:
            smooth_window -= 1

    if smooth_window > polyorder:
        y_proba_smooth = savgol_filter(y_proba, window_length=smooth_window, polyorder=polyorder)
    else:
        # Too few samples to fit the polynomial: threshold the raw values.
        y_proba_smooth = y_proba

    y_pred = y_proba_smooth > 0.5
    # The closing structure is made odd so that it has a centre sample.
    closing_size = window_size + 1 if (window_size % 2) == 0 else window_size
    return binary_closing(y_pred, structure=np.ones(closing_size))

In [None]:
def score_sleep_periods(y_proba, onset_idxs, wakeup_idxs):
    """Score the onset and the wake-up of every detected sleep period.

    For each ``(onset, wakeup)`` index pair the probabilities in
    ``y_proba[onset:wakeup]`` are averaged with linearly varying weights:

    * the onset score weights the start of the period most (weights fall
      linearly from 1 to 0),
    * the wake-up score weights the end of the period most (weights rise
      linearly from 0 to 1).

    Periods with fewer than two samples have an all-zero linear ramp; for
    those the plain mean of the slice is used (0.0 for an empty slice) instead
    of dividing by zero and returning NaN.

    Parameters
    ----------
    y_proba : array_like, 1-D
        Predicted probability per sample.
    onset_idxs, wakeup_idxs : iterable of int
        Start and end positions of each period; paired element-wise.

    Returns
    -------
    tuple of numpy.ndarray
        ``(onset_scores, wakeup_scores)``, one entry per period.
    """
    onset_scores = []
    wakeup_scores = []
    for start_idx, end_idx in zip(onset_idxs, wakeup_idxs):
        yp = np.asarray(y_proba[start_idx:end_idx], dtype=float)
        lin_weight = np.linspace(0, 1, num=len(yp))
        weight_total = np.sum(lin_weight)
        if weight_total > 0:
            onset_score_ = np.sum(yp * lin_weight[::-1]) / weight_total
            wakeup_score_ = np.sum(yp * lin_weight) / weight_total
        else:
            # Zero or one sample: the ramp carries no weight, so fall back to
            # the unweighted mean rather than producing 0/0.
            fallback = float(yp.mean()) if yp.size else 0.0
            onset_score_ = wakeup_score_ = fallback
        onset_scores.append(onset_score_)
        wakeup_scores.append(wakeup_score_)
    return np.array(onset_scores), np.array(wakeup_scores)

In [None]:
def create_event_dataframe(feat_df, event_idxs, event_scores, event_type):
    """Build the event rows for one event type.

    Selects the ``step`` and ``series_id`` columns of the rows labelled
    ``event_idxs`` in ``feat_df`` and appends two columns: ``event`` (the
    constant ``event_type``) and ``score`` (``event_scores``, one per row).

    Parameters
    ----------
    feat_df : pandas.DataFrame
        Frame with ``step`` and ``series_id`` columns.
    event_idxs : array_like
        Index labels (looked up with ``.loc``) of the event rows.
    event_scores : array_like
        Score for each selected row.
    event_type : str
        Value written to the ``event`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``step``, ``series_id``, ``event``, ``score``.
    """
    selected_rows = feat_df.loc[event_idxs, ["step", "series_id"]]
    return selected_rows.assign(event=event_type, score=event_scores)

In [None]:
window_size = 31 # mins
plateau_size = 120 # mins
test_event_dfs = []
series_grps = ts_test_feat_df.groupby(by="series_id")
for series_id, test_feat_df in tqdm(series_grps):
    # Re-index from 0 so the positions returned by find_peaks can be used as
    # row labels of this series' frame.
    test_feat_df = test_feat_df.reset_index(drop=True)
    X_test = test_feat_df[feat_cols].copy()
    # Keep column 1 of the class probabilities
    # (presumably the "asleep" class -- TODO confirm class order).
    y_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = filter_predictions(y_proba, window_size)
    # Only flat runs of at least `plateau_size` samples are kept as sleep periods.
    peaks, plateaus = find_peaks(y_pred, plateau_size=plateau_size)

    # The left / right plateau edges are the onset and wake-up positions.
    onset_idxs, wakeup_idxs = plateaus["left_edges"], plateaus["right_edges"]

    # Score each sleep period and collect its onset and wake-up rows.
    onset_scores, wakeup_scores = score_sleep_periods(y_proba, onset_idxs, wakeup_idxs)
    onset_df = create_event_dataframe(test_feat_df, onset_idxs, onset_scores, event_type="onset")
    wakeup_df = create_event_dataframe(test_feat_df, wakeup_idxs, wakeup_scores, event_type="wakeup")
    event_df = pd.concat([onset_df, wakeup_df]).sort_values(by="step")
    test_event_dfs.append(event_df)

# Stack the events of all series and number the rows as "row_id".
submission_df = (
    pd.concat(test_event_dfs)
    .reset_index(drop=True)
    .reset_index(names="row_id")
)

In [None]:
# Stack the per-series events, number the rows as "row_id" and write the
# submission file.
submission_df = (
    pd.concat(test_event_dfs)
    .reset_index(drop=True)
    .reset_index(names="row_id")
)
submission_df.to_csv("submission.csv", index=False)