In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Install the `lightning` package; later cells import it as `lightning.pytorch`.
!python -m pip install lightning


Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# TODO: test with the 1-minute resampled dataset.
# The main idea has now been adapted to WaveNet: FPN1D + ResNet blocks are replaced with Wave_Blocks, and the classifier is replaced with GRU + FC.
# The dataloader shapes and the forward-function shapes may still need checking; the permutes may not be necessary.
# Note: the original WaveNet approach had 3 input channels, not 4 (this was done in preprocessing, possibly due to some NN preference).

In [None]:
# Data.py from Martin E's repo. Contains the data-preparation code; it is quite important to understand this.
# It is also important to understand the data size when modifying the model.

"""
Time Encoding
The function time_encoding is used to convert timestamps in a DataFrame to a numerical form that can be used by machine learning models.
It does this by mapping each minute of the day to a sinusoidal wave value, likely with the intent to capture cyclical patterns in the data
(such as circadian rhythms).


Dataset Sampling:
The CMITimeSeriesSampler class extends PyTorch's Dataset class.
It is designed to handle time-series data for a sleep detection dataset by sampling a fixed-size segment (sample_size) from a longer time series.
The sampling ensures that the segments are continuous in time by checking the difference in a step column.
This approach is useful for handling large time-series datasets that are too large to process in full
or when the model benefits from learning from shorter, contiguous segments.


DataModule:
The CMIDataModule class is a part of the PyTorch Lightning framework, which streamlines the process of organizing your data loading.
It defines a setup method that prepares the data for the model by reading from a parquet file, normalizing a feature, encoding timestamps,
and then creating a dataset object with the specified features and target.
It further splits the dataset into training and validation sets.
Data loaders for training and validation are also provided, which allow for easy batching and shuffling of the data for use in training loops.


"""

from typing import Tuple

import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset, DataLoader, random_split

import lightning.pytorch as pl


def time_encoding(ts_df: pd.DataFrame):
    """Encode the time of day of every row as two smooth "prior" features.

    The minute of the day (``hour * 60 + minute``, taken from the
    ``timestamp`` column) is looked up in two pre-computed curves of the form
    ``sin(t + shift * pi) ** 24`` with ``t`` sampled linearly over ``[0, pi]``
    at one point per minute.  The 24th power turns the sine wave into a narrow
    bump, so each feature is close to 1 only around one time of day and close
    to 0 elsewhere.  According to the original author, the two phase shifts
    were chosen so that the bumps sit on the most common sleep-onset and
    wake-up times.

    Args:
        ts_df: frame with a datetime-like ``timestamp`` column.

    Returns:
        A new DataFrame sharing ``ts_df``'s index with two ``float32``
        columns, ``onset_prior`` and ``awake_prior`` (in that order).
    """
    # One lookup entry per minute of the day. If the data is not at 1-minute
    # resolution this must be adapted to the time scale of the dataset.
    minutes_per_day = 24 * 60
    phase_grid = np.linspace(0, np.pi, minutes_per_day)

    def _prior_lookup(phase_shift):
        # Maps minute-of-day -> value of the sharpened sine curve.
        curve = np.sin(phase_grid + phase_shift * np.pi) ** 24
        return dict(enumerate(curve))

    # Both features are keyed by the same minute-of-day value.
    minute_of_day = ts_df.timestamp.dt.hour * 60 + ts_df.timestamp.dt.minute

    time_df = pd.DataFrame()
    for column, phase_shift in (("onset_prior", 0.555), ("awake_prior", 0.208)):
        time_df[column] = minute_of_day.map(_prior_lookup(phase_shift)).astype(
            np.float32
        )
    return time_df


class CMITimeSeriesSampler(Dataset):
    """Dataset that draws one random, gap-free window per series.

    The dataset has one item per distinct ``series_id``.  Every access picks a
    random window of ``sample_size`` consecutive rows from that series and
    re-draws until the ``step`` column of the window is evenly spaced, so the
    same index generally yields a different window on each call.

    Args:
        series_df: frame containing ``series_id``, ``step``, the feature
            columns and the target column.
        sample_size: number of rows in each sampled window.
        feat_cols: names of the feature columns, in the channel order wanted.
        target_col: name of the target column.
    """

    def __init__(
        self, series_df, sample_size: int, feat_cols: list, target_col: str
    ) -> None:
        super().__init__()
        self.feat_cols = feat_cols
        self.target_col = target_col
        self.series_df = series_df
        self.series_grps = series_df.groupby(by="series_id")
        self.series_ids = list(self.series_grps.groups.keys())
        self.sample_size = sample_size

    def check_timeseries_continuity(self, ts_df):
        """Return ``True`` when consecutive ``step`` values are evenly spaced.

        All step-to-step differences are compared with the first difference.
        A window with fewer than two rows has no differences and is treated
        as continuous.  The result is a plain Python ``bool``.
        """
        step_sizes = ts_df["step"].diff().iloc[1:].astype(int)
        if step_sizes.empty:
            return True
        return bool((step_sizes == step_sizes.iloc[0]).all())

    def __len__(self):
        # One item per series, not per window.
        return len(self.series_ids)

    def __getitem__(self, index):
        """Sample a continuous window from the series at position ``index``.

        Returns:
            ``(X_data, y_data)`` where ``X_data`` has shape
            ``(len(feat_cols), sample_size)`` and ``y_data`` is a ``float32``
            vector of length ``sample_size``.

        Raises:
            AssertionError: if the series is shorter than ``sample_size`` or a
                required column is missing.
        """
        sid = self.series_ids[index]
        sample_df = self.series_grps.get_group(sid).reset_index(drop=True)
        n_rows = len(sample_df)
        assert n_rows >= self.sample_size, (
            f"series {sid!r} has {n_rows} rows, fewer than sample_size={self.sample_size}"
        )
        # Validate against the configured columns rather than a hard-coded list.
        required_cols = [*self.feat_cols, self.target_col]
        assert all(col in sample_df for col in required_cols), "Required columns are missing in the dataframe!"

        # Rejection sampling: draw start positions until the window has no gaps.
        # NOTE(review): this never terminates for a series that contains no
        # continuous window of ``sample_size`` rows.
        n_starts = n_rows - self.sample_size + 1  # includes the last valid start
        is_cont_ts = False
        while not is_cont_ts:
            start_idx = np.random.randint(0, n_starts)
            end_idx = start_idx + self.sample_size
            ts_df = sample_df.iloc[start_idx:end_idx]
            is_cont_ts = self.check_timeseries_continuity(ts_df)

        # Transpose so that features become the leading (channel) axis.
        X_data = ts_df[self.feat_cols].T.to_numpy()
        y_data = ts_df[self.target_col].to_numpy().astype(np.float32)

        return X_data, y_data


class CMIDataModule(pl.LightningDataModule):
    """Lightning data module for the 1-minute resampled sleep series.

    Args:
        datapath: path of the parquet file holding all series.
        batch_size: batch size used by both data loaders.
        sample_size: number of rows in each sampled window.
    """

    def __init__(self, datapath: str, batch_size: int, sample_size: int):
        super().__init__()
        self.datapath = datapath
        self.batch_size = batch_size
        self.sample_size = sample_size

    def setup(self, stage: str):
        """Load the parquet file, build the features and split train/validation.

        ``anglez_1min_mean`` is divided by 90, the two time-of-day priors are
        appended as extra columns, and the resulting window sampler is split
        80/20 with a fixed seed so the split is the same on every run.
        """
        raw_df = pd.read_parquet(self.datapath)
        raw_df["anglez_1min_mean"] = raw_df["anglez_1min_mean"] / 90.0
        full_df = pd.concat([raw_df, time_encoding(raw_df)], axis=1)

        sampler = CMITimeSeriesSampler(
            full_df,
            sample_size=self.sample_size,
            feat_cols=[
                "anglez_1min_mean",
                "enmo_1min_mean",
                "onset_prior",
                "awake_prior",
            ],
            target_col="asleep",
        )

        split_rng = torch.Generator().manual_seed(42)
        self.train_dset, self.val_dset = random_split(
            sampler, [0.8, 0.2], generator=split_rng
        )

    def _make_loader(self, subset, shuffle):
        # Shared DataLoader construction for the train and validation subsets.
        return DataLoader(subset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        return self._make_loader(self.train_dset, True)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation subset."""
        return self._make_loader(self.val_dset, False)

In [None]:
# Model.py from Martin E's repo. Contains the model structure. This should be replaced for WaveNet Adaptation. Need to understand the data format and interface.

from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import lightning.pytorch as pl



##################################################################### The following is the WaveNet-LSTM approach
# TODO: replace FPN1D + ResNetBlocks with these blocks
# ResNetBlocks have the same input and output size
class Wave_Block(nn.Module):
    """WaveNet-style stack of gated, dilated 1-D convolutions.

    A 1x1 convolution first maps ``in_channels`` to ``out_channels``.  Then,
    for each dilation ``1, 2, 4, ...``, the signal passes through a gated
    unit ``tanh(filter(x)) * sigmoid(gate(x))`` followed by a 1x1 convolution,
    and the result is added to a running sum.  The block returns that sum, so
    the output has ``out_channels`` channels and the same length as the input.

    Args:
        in_channels: number of input channels.
        out_channels: number of channels used inside the block and returned.
        dilation_rates: number of gated layers; layer ``i`` uses dilation
            ``2 ** i``.
        kernel_size: kernel size of the filter and gate convolutions.
    """

    def __init__(self, in_channels, out_channels, dilation_rates, kernel_size):
        super().__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()

        # Entry projection; convs[i + 1] is the 1x1 conv that follows gated layer i.
        self.convs.append(nn.Conv1d(in_channels, out_channels, kernel_size=1))
        for dilation_rate in (2 ** i for i in range(dilation_rates)):
            # padding="same" keeps the sequence length for every kernel size.
            # For odd kernels it equals the symmetric dilation*(kernel_size-1)/2
            # padding; for even kernels that manual formula shortened the
            # sequence and broke the residual addition in forward().
            self.filter_convs.append(
                nn.Conv1d(
                    out_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    padding="same",
                    dilation=dilation_rate,
                )
            )
            self.gate_convs.append(
                nn.Conv1d(
                    out_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    padding="same",
                    dilation=dilation_rate,
                )
            )
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1))

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, in_channels, length)``."""
        x = self.convs[0](x)
        res = x
        for i, (filter_conv, gate_conv) in enumerate(
            zip(self.filter_convs, self.gate_convs)
        ):
            # Gated activation unit, then 1x1 mixing; accumulate into the sum.
            x = torch.tanh(filter_conv(x)) * torch.sigmoid(gate_conv(x))
            x = self.convs[i + 1](x)
            res = res + x
        return res



class ClassificationHead_WaveNet(nn.Module):
    """Per-time-step binary classification head.

    A single linear layer maps the last axis (size ``in_chs``) of the input to
    one logit, so an input of shape ``(batch, length, in_chs)`` yields logits
    and probabilities of shape ``(batch, length)``.

    Design notes kept from the original author:
      * An earlier convolution-based head reduced the channel axis to 1 with
        a Conv1d and squeezed the result to ``[batch_size, length]``.
      * That head can be replaced with fully connected layers, which should be
        faster to work with, compatible with the recurrent layer, and in line
        with the original WaveNet approach.
    """

    def __init__(self, in_chs) -> None:
        super().__init__()
        # One output unit (the original WaveNet head used 3).
        self.fc = nn.Linear(in_chs, 1)

    def forward(
        self, x: torch.FloatTensor
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        """Return ``(logits, probabilities)`` for every time step."""
        # Drop only the trailing singleton produced by the linear layer.  A bare
        # ``squeeze()`` would also drop a batch (or length) axis of size 1 and
        # make the logits disagree in shape with the targets in the loss.
        y_logits = self.fc(x).squeeze(-1)
        y_proba = torch.sigmoid(y_logits)
        return y_logits, y_proba



class EventDetectionCNN(nn.Module):
    """Wave_Block encoder, bidirectional GRU and per-step classification head.

    Args:
        in_chs: number of input feature channels.
        kernel_size: kernel size of the gated convolutions in every Wave_Block.
    """

    def __init__(self, in_chs: int, kernel_size=3) -> None:
        super().__init__()

        # History: the original model used an FPN1D input layer
        # (in_chs=4, feat_chs=32, kernel_size=31) followed by a stack of
        # ResNetBlock1D (kernel_size=11) whose input and output sizes match.
        # Both are replaced here by Wave_Blocks, which already provide dilation.
        self.wave_block1 = Wave_Block(in_chs, 32, 8, kernel_size)
        self.wave_block2 = Wave_Block(32, 64, 4, kernel_size)
        self.wave_block3 = Wave_Block(64, 128, 1, kernel_size)

        # NOTE: despite its name this attribute holds a GRU.  The name is kept
        # so that parameter names (and therefore saved checkpoints) stay valid.
        self.LSTM = nn.GRU(input_size=128, hidden_size=128, num_layers=4,
                           batch_first=True, bidirectional=True)

        # Bidirectional GRU -> 2 * hidden_size = 256 features per time step.
        self.cls_head = ClassificationHead_WaveNet(256)

    def forward(self, x):
        """Return ``(logits, probabilities)``, each of shape ``(batch, length)``.

        ``x`` is expected as ``(batch, in_chs, length)``.
        """
        x = x.float()  # the data loader may deliver float64
        x = self.wave_block1(x)
        x = self.wave_block2(x)
        x = self.wave_block3(x)
        # (batch, channels, length) -> (batch, length, channels) for the
        # batch_first recurrent layer.
        x = x.permute(0, 2, 1)
        x, _ = self.LSTM(x)
        y_logits, y_proba = self.cls_head(x)
        return y_logits, y_proba

    def predict_from_df(self, df, feat_cols):
        """Predict a probability for every row of ``df``.

        Args:
            df: frame containing the feature columns, one row per time step.
            feat_cols: feature column names, in the channel order used for
                training.

        Returns:
            1-D NumPy array with one probability per row of ``df``.
        """
        # Run on whichever device the model lives on, and return CPU data.
        device = next(self.parameters()).device
        with torch.no_grad():
            x = torch.tensor(df[feat_cols].T.to_numpy()).float().unsqueeze(0)
            _, y_proba = self.forward(x.to(device))
        # Remove the batch axis that was added above.
        return y_proba.squeeze(0).cpu().numpy()

# This is the main model module that gets called.
# Modified for WaveNet
class CMISleepDetectionWaveNetCNN(pl.LightningModule):
    """Lightning module that trains ``EventDetectionCNN`` as a per-step classifier.

    Args:
        in_chs: number of input feature channels (4 in this notebook).
    """

    def __init__(self, in_chs: int):
        super().__init__()
        self.model = EventDetectionCNN(in_chs)
        # Operates on raw logits, so the model's probabilities are not used for the loss.
        self.bce_loss = nn.BCEWithLogitsLoss()

    def forward(self, x):
        """Delegate to the wrapped model; returns ``(logits, probabilities)``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        features, labels = batch
        logits = self(features)[0]
        train_loss = self.bce_loss(logits, labels)
        self.log("train/loss", train_loss, prog_bar=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss and accuracy for one batch."""
        features, labels = batch
        logits, probabilities = self(features)
        val_loss = self.bce_loss(logits, labels)
        self.log("val/loss", val_loss)

        # Accuracy: threshold predictions and targets at 0.5 and count matches.
        predicted_positive = probabilities > 0.5
        actual_positive = labels > 0.5
        n_correct = torch.sum(predicted_positive == actual_positive)
        val_acc = n_correct / actual_positive.nelement()
        self.log("val/acc", val_acc)

        return val_loss, val_acc

    def configure_optimizers(self):
        """Adam over the wrapped model's parameters with a fixed learning rate."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=2e-4)
        return optimizer

In [None]:
# https://en.wikipedia.org/wiki/PyTorch_Lightning

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint




# Parquet file with the resampled training series (Kaggle input dataset).
datapath = "/kaggle/input/cmi-train-1min-resampled/train_series_1min.parquet"
datamodule = CMIDataModule(
    datapath=datapath,
    batch_size=128,
    sample_size=12 * 60,  # 720 rows per window (12 hours if rows are 1 minute apart)
)
# The input has 4 channels because there are 4 feature columns: ["anglez_1min_mean", "enmo_1min_mean", "onset_prior", "awake_prior"].
# The original WaveNet has 3 channels, created by a rolling window over a single wav file; that may be worth trying here.

#Input size: torch.Size([128, 4, 720])
#Target size: torch.Size([128, 720])



In [None]:
# Seed Python, NumPy and PyTorch before anything random happens: weight
# initialisation uses the torch RNG and the window sampler uses the global
# NumPy RNG, so without this every run trains on different data and weights.
pl.seed_everything(42, workers=True)

model = CMISleepDetectionWaveNetCNN(in_chs=4)

# Keep only the checkpoint with the best validation accuracy.
model_ckpt_callback = ModelCheckpoint(save_top_k=1, monitor="val/acc", mode="max")
# Stop once validation accuracy has not improved for 10 validation runs
# (validation only runs every 5 epochs, see check_val_every_n_epoch below).
early_stop_callback = EarlyStopping(monitor="val/acc", mode="max", patience=10)
trainer = pl.Trainer(
    log_every_n_steps=1,
    check_val_every_n_epoch=5,
    max_epochs=100,
    accelerator="cuda",  # requires a GPU runtime
    callbacks=[early_stop_callback, model_ckpt_callback],
)
trainer.fit(model, datamodule)  # Train.py

In [None]:
# Load (or reload) the TensorBoard notebook extension that provides the %tensorboard magic.
%reload_ext tensorboard

In [None]:
# Open TensorBoard on the log directory (presumably where the Trainer above writes its logs — verify the path).
%tensorboard --logdir /kaggle/working/lightning_logs/