In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from itertools import groupby
import gc

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

np.random.seed(0)
pd.set_option('display.precision', 2)  
pd.set_option('display.float_format', '{:.2f}'.format) 

# Find Train Series

In [27]:
train_events = pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")

In [28]:
series_has_nan = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())
series_clean = series_has_nan[~series_has_nan].index.tolist()
series_clean[:4]

['08db4255286f', '0a96f4993bd7', '0cfc06c129cc', '1087d7b0ff2e']

We are only concerned with boundary data and obtain the first 60 data of each onset or wakeup for training.
test_pred_value[:1]

In [29]:
import torch.multiprocessing as mp
import polars as pl
import functools

# @functools.lru_cache(maxsize=None)
def get_one_series(series):
    """
    The default lru_cache method does not multi processing, The function @lru_cache uses an in memory cache. Each python process contains its own memory block.
    """
    print(f'fetch {series} \n')
    one_series = pd.read_parquet("/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet", filters=[('series_id','=',series)])
    return one_series

def get_multi_light_series(series_ids):
    print(f'fetch {series_ids} \n')
    multi_series = pd.read_parquet("/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet", filters=[('series_id','in',series_ids)])
    return multi_series


def get_train_series(series):
    one_series = get_one_series(series)
    one_events = train_events.query('series_id == @series')
    
    one_series.dropna(inplace=True)
    one_events.loc[:, "step"]  = one_events["step"].astype("int")

    train = pd.merge(one_series, one_events[['step','event']], on='step', how='left')
    event_idx = ~train['event'].isna()
    
    before_event_idx = ~train['event'].shift(-12*60*2).isna() # window_size 
    after_event_idx = ~train['event'].shift(12*60*2).isna() # window_size 

    event_windows = []
    for begin, center, end in zip(before_event_idx.index[before_event_idx],event_idx.index[event_idx], after_event_idx.index[after_event_idx]):
        one_window = train[begin:end].copy()
        last_evt = one_window['event'][center]
        one_window.loc[begin:center, 'event'] = 'awake' if last_evt == 'onset' else 'sleep' 
        one_window.loc[center:end, 'event'] = 'sleep' if last_evt == 'onset' else 'awake' 
        event_windows.append(one_window)

    event_windows = pd.concat(event_windows)
    event_windows =  add_features(event_windows)
    del one_series
    gc.collect()
    return event_windows

features = ["hour","anglez_times_enmo",
            "anglez","anglez_diff","anglez_mean","anglez_min","anglez_max","anglez_std","anglez_diff_rolling",
            "enmo","enmo_diff","enmo_mean","enmo_min","enmo_max","enmo_std","enmo_diff_rolling"]

def add_features(df):
    df["timestamp"] = pd.to_datetime(df["timestamp"],utc=True)
    df["hour"] = df["timestamp"].dt.hour

    df["anglez_times_enmo"] = abs(df["anglez"]) * df["enmo"]

    periods = 6 # 1/2 minute
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill")
    df["enmo_diff"]   = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill")
    
    df["anglez_mean"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    df["enmo_mean"]   = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill")
    
    return df

# Fetching series using multiple Threads
def get_multi_series(series_ids, n_workers=1, light=True):
    if light:
        multi_series = get_multi_light_series(series_ids)
        multi_series = add_features(multi_series)
        return multi_series
    n_workers = min(n_workers, len(series_ids)) # Sometimes series_ids length are smaller than n_workers
    if len(series_ids) == 1:
        return get_train_series(series_ids[0])
    print(f'start {n_workers} threads')
    with mp.Pool(n_workers) as ex:
        train_all = ex.map(get_train_series, series_ids)
    return pd.concat(train_all)

In [30]:
# %time train_all = get_train_series(series_clean[0])
%time train_all = get_multi_series(series_clean[:4])

fetch ['08db4255286f', '0a96f4993bd7', '0cfc06c129cc', '1087d7b0ff2e'] 

CPU times: user 14.7 s, sys: 1.69 s, total: 16.4 s
Wall time: 14.8 s


In [31]:
train_all.dtypes

series_id                         object
step                              uint32
timestamp            datetime64[ns, UTC]
anglez                           float32
enmo                             float32
awake                              int64
hour                               int32
anglez_times_enmo                float32
anglez_diff                      float32
enmo_diff                        float32
anglez_mean                      float64
enmo_mean                        float64
dtype: object

In [32]:
train_all.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,hour,anglez_times_enmo,anglez_diff,enmo_diff,anglez_mean,enmo_mean
0,08db4255286f,0,2018-11-05 14:00:00+00:00,-30.85,0.04,1,14,1.38,0.33,0.06,-33.75,0.06
1,08db4255286f,1,2018-11-05 14:00:05+00:00,-34.18,0.04,1,14,1.51,0.33,0.06,-33.75,0.06
2,08db4255286f,2,2018-11-05 14:00:10+00:00,-33.88,0.05,1,14,1.64,0.33,0.06,-33.75,0.06
3,08db4255286f,3,2018-11-05 14:00:15+00:00,-34.28,0.07,1,14,2.33,0.33,0.06,-33.75,0.06
4,08db4255286f,4,2018-11-05 14:00:20+00:00,-34.39,0.08,1,14,2.64,0.33,0.06,-33.69,0.07


In [33]:
ts = pd.Series(train_all.loc[:, 'anglez'].values, index=train_all['timestamp'])
ts

timestamp
2018-11-05 14:00:00+00:00   -30.85
2018-11-05 14:00:05+00:00   -34.18
2018-11-05 14:00:10+00:00   -33.88
2018-11-05 14:00:15+00:00   -34.28
2018-11-05 14:00:20+00:00   -34.39
                             ...  
2018-04-06 12:59:35+00:00   -61.07
2018-04-06 12:59:40+00:00   -60.99
2018-04-06 12:59:45+00:00   -61.09
2018-04-06 12:59:50+00:00   -61.02
2018-04-06 12:59:55+00:00   -60.85
Length: 1492740, dtype: float32

In [34]:
from pandas.tseries.offsets import Minute
ts.resample('T').sum()  # Resample by minute

timestamp
2018-03-13 18:15:00+00:00   -1053.60
2018-03-13 18:16:00+00:00   -1053.50
2018-03-13 18:17:00+00:00   -1053.48
2018-03-13 18:18:00+00:00   -1053.58
2018-03-13 18:19:00+00:00   -1053.50
                              ...   
2019-01-08 22:55:00+00:00      25.28
2019-01-08 22:56:00+00:00      10.91
2019-01-08 22:57:00+00:00      30.82
2019-01-08 22:58:00+00:00     -15.41
2019-01-08 22:59:00+00:00      -0.79
Freq: T, Length: 433725, dtype: float32

In [35]:
ts.resample('T').ohlc()

Unnamed: 0_level_0,open,high,low,close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-03-13 18:15:00+00:00,-87.81,-87.79,-87.81,-87.80
2018-03-13 18:16:00+00:00,-87.80,-87.77,-87.80,-87.80
2018-03-13 18:17:00+00:00,-87.77,-87.77,-87.80,-87.77
2018-03-13 18:18:00+00:00,-87.80,-87.78,-87.81,-87.78
2018-03-13 18:19:00+00:00,-87.78,-87.77,-87.81,-87.80
...,...,...,...,...
2019-01-08 22:55:00+00:00,6.90,7.12,-2.85,5.25
2019-01-08 22:56:00+00:00,7.12,7.12,-4.65,0.78
2019-01-08 22:57:00+00:00,2.68,11.02,-2.43,-0.51
2019-01-08 22:58:00+00:00,-1.74,8.45,-7.54,8.45


In [22]:
# ts2 = pd.DataFrame(train_all.loc[:, ['anglez', 'enmo']].values, index=train_all['timestamp'])
ts2 = train_all.loc[:, ['timestamp','anglez', 'enmo']].copy().set_index('timestamp')
ts2

Unnamed: 0_level_0,anglez,enmo
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-11-05 14:00:00+00:00,-30.85,0.04
2018-11-05 14:00:05+00:00,-34.18,0.04
2018-11-05 14:00:10+00:00,-33.88,0.05
2018-11-05 14:00:15+00:00,-34.28,0.07
2018-11-05 14:00:20+00:00,-34.39,0.08
...,...,...
2018-12-01 01:29:35+00:00,-30.11,0.00
2018-12-01 01:29:40+00:00,-31.86,0.00
2018-12-01 01:29:45+00:00,-30.58,0.00
2018-12-01 01:29:50+00:00,-31.43,0.01


In [44]:
ts2 = train_all.loc[:, ['timestamp','series_id','anglez', 'enmo']].copy()
time_key = pd.Grouper(freq='1min')
ts2 = ts2.set_index('timestamp').groupby(["series_id",time_key]).ohlc()
ts2

Unnamed: 0_level_0,Unnamed: 1_level_0,anglez,anglez,anglez,anglez,enmo,enmo,enmo,enmo
Unnamed: 0_level_1,Unnamed: 1_level_1,open,high,low,close,open,high,low,close
series_id,timestamp,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
08db4255286f,2018-11-05 14:00:00+00:00,-30.85,-30.51,-34.93,-31.30,0.04,0.11,0.04,0.07
08db4255286f,2018-11-05 14:01:00+00:00,-29.06,-24.54,-29.64,-24.81,0.06,0.07,0.04,0.04
08db4255286f,2018-11-05 14:02:00+00:00,-18.26,-18.26,-31.82,-25.42,0.05,0.07,0.04,0.05
08db4255286f,2018-11-05 14:03:00+00:00,-29.94,-26.34,-33.25,-29.93,0.05,0.20,0.05,0.20
08db4255286f,2018-11-05 14:04:00+00:00,-29.08,-23.55,-29.33,-24.45,0.14,0.17,0.05,0.05
...,...,...,...,...,...,...,...,...,...
1087d7b0ff2e,2018-04-06 12:55:00+00:00,-60.51,-60.51,-60.74,-60.53,0.01,0.01,0.01,0.01
1087d7b0ff2e,2018-04-06 12:56:00+00:00,-60.62,-60.50,-61.02,-61.02,0.01,0.01,0.01,0.01
1087d7b0ff2e,2018-04-06 12:57:00+00:00,-61.04,-60.91,-61.06,-61.04,0.01,0.01,0.01,0.01
1087d7b0ff2e,2018-04-06 12:58:00+00:00,-60.98,-60.98,-61.12,-61.09,0.01,0.01,0.01,0.01


# Prepare Data

The most memory-consuming operations

In [None]:
# features = ["anglez","enmo"]
feature_y = 'awake'

df_train_X = train_all[features]
df_train_y = train_all[feature_y]

scaler = StandardScaler()
labeled = LabelEncoder()

df_train_X_scaled = scaler.fit_transform(df_train_X)
df_train_y_values = labeled.fit_transform(df_train_y)

n_past = 12 * 2 # 2 minute
trainX = []
trainY = []

for i in range(n_past, len(df_train_X_scaled)):
    trainX.append(df_train_X_scaled[i - n_past:i, 0:df_train_X_scaled.shape[1]])
    trainY.append(df_train_y_values[i])
    
trainX, trainY = np.array(trainX), np.array(trainY)
trainX.shape, trainY.shape

In [None]:
labeled.classes_

## Split Data

In [None]:
from torch import tensor
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

def_device = 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'
def_device

In [None]:
split_index = int(len(trainX) * 0.8)

X_train = trainX[:split_index]
X_val = trainX[split_index:]

y_train = trainY[:split_index]
y_val = trainY[split_index:]

X_train.shape, y_train.shape, X_val.shape, y_val.shape

## Create DataLoader

In [None]:
X_train = tensor(X_train, dtype=torch.float32)
y_train = tensor(y_train, dtype=torch.long)

X_val = tensor(X_val, dtype=torch.float32)
y_val = tensor(y_val, dtype=torch.long)

X_train.shape, X_val.shape, y_train.shape, y_val.shape,X_train.dtype, y_train.dtype

The second memory-consuming operation

In [None]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X, self.y = X, y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, i):
        return self.X[i], self.y[i]
    
train_ds, val_ds = TimeSeriesDataset(X_train,y_train), TimeSeriesDataset(X_val, y_val)
batch_size = 12*60 # 1 hour

train_dl, val_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True), DataLoader(val_ds, batch_size=batch_size, shuffle=False)

# Define LSTM Model

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(def_device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(def_device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(self.relu(out[:, -1, :])) 
        return out

In [None]:
input_size = len(features)
hidden_size = 32 # like 1 2 4 32 64
num_layers = 2
output_size = 2

model = LSTM(input_size, hidden_size, num_layers, output_size)
opt = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = F.cross_entropy

model.to(def_device)
model

# Train

In [None]:
def accuracy(out, yb): 
    return (out.argmax(axis=1) == yb).float().mean() # Evaluate the accuracy of the model.

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train(True)
        for xb, yb in train_dl:
            xb, yb = xb.to(def_device), yb.to(def_device)
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
        
        model.eval()
        with torch.no_grad():
            tot_loss, tot_acc, count = 0., 0.,0
            for xb,yb in valid_dl:
                xb, yb = xb.to(def_device), yb.to(def_device)
                pred = model(xb)
                n = len(xb)
                count += n
                tot_loss += loss_func(pred, yb).item()*n
                tot_acc += accuracy(pred, yb).item()*n
        print(f'epoch:{epoch}, loss: {tot_loss/count:.2f}, accuracy: {tot_acc/count:.2f}')
    return tot_loss/count, tot_acc/count

def predict(x):
    with torch.no_grad():
        predicted = model(x).to('cpu')
    return F.log_softmax(predicted, -1).exp()

In [None]:
%time loss,acc = fit(1, model, loss_func, opt, train_dl, val_dl)

In [None]:
for _, batch in enumerate(train_dl):
    xb, yb = batch[0].to(def_device), batch[1].to(def_device)
    print(xb.shape, yb.shape)
    break

In [None]:
pred = predict(xb)

yb = yb.to('cpu')

print(f'accuracy: {accuracy(pred, yb)}')

pred_value = pred.argmax(axis=1)
act_value = yb

plt.figure(figsize=(37,7))
plt.plot(pred_value, label='Actual Awake')
plt.plot(act_value, label='Predicted Awake')
plt.xlabel('Day')
plt.ylabel('Awake')
plt.legend()
plt.show()

# Predict

In [None]:
import polars as pl
test = (pl.scan_parquet("/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet")
          .collect()
          .to_pandas()
)
test = add_features(test)

In [None]:
test.head()

In [None]:
def data_transform(df):
    df_scaler = scaler.transform(df[features])
    n_past = 12*2 # 2 minute
    testX = []
    for i in range(n_past, len(df_scaler)):
        testX.append(df_scaler[i - n_past:i, 0:df_scaler.shape[1]])
    return tensor(np.array(testX), dtype=torch.float32)
test_tsf = data_transform(test)

In [None]:
test_pred_value = predict(test_tsf.to(def_device))
test_pred_value.shape

In [None]:
test_p = test.iloc[n_past:,].copy() # remove first n_past rows
test_p['score'] = test_pred_value[:,:1]

test_p["not_awake"] = 1-test_p["score"]
smoothing_length = 12
test_p["smooth"] = test_p["not_awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
test_p["smooth"] = test_p["smooth"].round()

In [None]:
import seaborn as sns

plt.figure(figsize=(37, 15))
fig, axs = plt.subplots(1, 3, figsize=(37, 6))
for series_id,ax in zip(test_p['series_id'].unique().tolist(), axs):
    plot_data = test_p.loc[test_p['series_id'] == series_id, ['step','score','smooth']].copy().set_index('step')
    sns.lineplot(plot_data,  ax=ax)
    ax.set_title(f'score for: {series_id}')
plt.show()

In [None]:
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: (cv[0], cv[1] != 0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        print(c,v,llg)
        if v is False:
            lstPOI.extend([0] * llg)
        else:
            lstPOI.extend(['onset'] + (llg - 2) * [0] + ['wakeup'] if llg > 1 else [0])
    return lstPOI

test_p["event"] = get_event(test_p)

In [None]:
sample_submission = test_p.loc[test_p["event"] != 0]
sample_submission.loc[:, 'score'] = 0.9
sample_submission = sample_submission[["series_id", "step", "event", "score"]].copy()
sample_submission = sample_submission.reset_index(drop=True).reset_index(names="row_id")

sample_submission.head()

In [None]:
sample_submission.to_csv('submission.csv', index=False)