In [1]:
import numpy as np
#import cudf
#%reload_ext cudf.pandas
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.signal import get_window
from tqdm.auto import tqdm

# Environment check: list the visible GPU(s), the CUDA compiler version and the
# installed PyTorch version, then report whether PyTorch can use CUDA.
!nvidia-smi
!nvcc --version
print(torch.__version__)
torch.cuda.is_available()

#print(torch.__version__)



Sun Dec 17 08:11:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

True

In [2]:
# Load the training accelerometer series; the printed frame shows the columns
# series_id, step, timestamp, anglez and enmo.
raw_train_series = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet')
print(raw_train_series)

              series_id    step                 timestamp     anglez    enmo
0          038441c925bb       0  2018-08-14T15:30:00-0400   2.636700  0.0217
1          038441c925bb       1  2018-08-14T15:30:05-0400   2.636800  0.0215
2          038441c925bb       2  2018-08-14T15:30:10-0400   2.637000  0.0216
3          038441c925bb       3  2018-08-14T15:30:15-0400   2.636800  0.0213
4          038441c925bb       4  2018-08-14T15:30:20-0400   2.636800  0.0215
...                 ...     ...                       ...        ...     ...
127946335  fe90110788d2  592375  2017-09-08T00:14:35-0400 -27.277500  0.0204
127946336  fe90110788d2  592376  2017-09-08T00:14:40-0400 -27.032499  0.0233
127946337  fe90110788d2  592377  2017-09-08T00:14:45-0400 -26.841200  0.0202
127946338  fe90110788d2  592378  2017-09-08T00:14:50-0400 -26.723900  0.0199
127946339  fe90110788d2  592379  2017-09-08T00:14:55-0400 -31.521601  0.0205

[127946340 rows x 5 columns]


In [3]:
# Load the annotated events (series_id, night, event, step, timestamp) and the
# test series.  The printed events contain NaN steps for nights without a
# usable annotation.
raw_train_events = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
raw_test_series = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')
print(raw_train_events)
print(raw_test_series)

          series_id  night   event      step                 timestamp
0      038441c925bb      1   onset    4992.0  2018-08-14T22:26:00-0400
1      038441c925bb      1  wakeup   10932.0  2018-08-15T06:41:00-0400
2      038441c925bb      2   onset   20244.0  2018-08-15T19:37:00-0400
3      038441c925bb      2  wakeup   27492.0  2018-08-16T05:41:00-0400
4      038441c925bb      3   onset   39996.0  2018-08-16T23:03:00-0400
...             ...    ...     ...       ...                       ...
14503  fe90110788d2     33  wakeup  560604.0  2017-09-06T04:07:00-0400
14504  fe90110788d2     34   onset  574620.0  2017-09-06T23:35:00-0400
14505  fe90110788d2     34  wakeup  581604.0  2017-09-07T09:17:00-0400
14506  fe90110788d2     35   onset       NaN                       NaN
14507  fe90110788d2     35  wakeup       NaN                       NaN

[14508 rows x 5 columns]
        series_id  step                 timestamp     anglez    enmo
0    038441c925bb     0  2018-08-14T15:30:00-0400   2

In [None]:
# All steps that can be done easier with dataframes done here
def filter_id(raw_data, id_list):
    """Return the rows of ``raw_data`` whose ``series_id`` is in ``id_list``.

    Rows containing any missing value are dropped from the result.
    """
    wanted = raw_data['series_id'].isin(id_list)
    selected = raw_data.loc[wanted]
    return selected.dropna()

def common_ppro(raw_data):
    """Preprocessing shared by series and event frames.

    Drops rows with missing values, removes the ``timestamp`` column (the
    ``step`` column is kept; each step = 5 seconds), replaces ``series_id``
    with integer codes numbered by order of first appearance, and casts every
    column to float.  The input frame is not modified.
    """
    cleaned = raw_data.dropna().drop(columns=['timestamp'])
    # series_id -> 0, 1, 2, ... in order of first appearance
    codes, _uniques = pd.factorize(cleaned['series_id'])
    cleaned['series_id'] = codes
    return cleaned.astype(float)

def train_val_split(data, id_list, val_ratio):
    """Split ``data`` into train and validation rows by numeric ``series_id``.

    The first ``int(len(id_list) * (1 - val_ratio))`` series codes go to the
    training set, the remaining codes to the validation set.

    Returns ``(train_data, val_data)``.
    """
    cutoff = int(len(id_list) * (1 - val_ratio))
    sid = data['series_id']
    return data[sid.lt(cutoff)], data[sid.ge(cutoff)]

def dloader(raw_series, raw_events):
    """Build per-night train/validation frames from the raw series and events.

    Steps: keep only series that have annotations, tag every sample with a
    night index, keep only nights that have an annotated onset, one-hot encode
    the event type, run ``common_ppro`` on both frames and split them by
    series code.

    Parameters
    ----------
    raw_series : pandas.DataFrame
        Raw samples; the ``series_id``, ``step`` and ``timestamp`` columns are used.
    raw_events : pandas.DataFrame
        Raw annotations; ``series_id``, ``night``, ``event`` and ``timestamp`` are used.

    Returns
    -------
    tuple
        ``(train_series, train_events, val_series, val_events)``.
    """
    # Annotated ids in a deterministic order (only membership and length are used).
    id_list = raw_events['series_id'].unique().tolist()
    # filter_id already drops incomplete rows, so no second dropna is needed.
    series, events = filter_id(raw_series, id_list), filter_id(raw_events, id_list)

    ## split series into series_id & nights column
    ## -> training will occur with each night as input
    night_secs = int(3600 * 24 / 5) # 5 secs per step
    # Vectorised equivalent of int(step / night_secs) + 1 for non-negative steps;
    # a per-row Python lambda is extremely slow on the full series.
    night = (series['step'] // night_secs + 1).astype('int64')
    series.insert(loc = series.columns.get_loc('series_id') + 1, column = 'night', value = night)

    ## filter series for nights where an event occurs
    events_single = events[events['event'] == 'onset']
    series = pd.merge(series, events_single[['series_id', 'night']], on = ['series_id', 'night'], how = 'inner')

    # common_ppro numbers the ids of each frame independently, by order of
    # first appearance.  Keeping exactly the same id set in both frames is
    # required for the two numberings to agree (this additionally assumes both
    # inputs list the series in the same order -- TODO confirm).
    events = events[events['series_id'].isin(series['series_id'].unique())]

    ## events to numerical columns
    events = pd.get_dummies(events, columns = ['event'])

    ## common preprocessing
    series, events = common_ppro(series), common_ppro(events)

    ## train_val_split
    val_ratio = 0.2
    train_series, val_series = train_val_split(series, id_list, val_ratio)
    train_events, val_events = train_val_split(events, id_list, val_ratio)

    return train_series, train_events, val_series, val_events

In [None]:
# Run the dataframe pipeline once: per-night train/validation frames for both
# the series and the events.
train_series, train_events, val_series, val_events = dloader(raw_train_series, raw_train_events)

def series_to_tensor(series, device = 'cuda'):
    """Convert a frame into one float32 tensor per ``(series_id, night)`` group.

    Parameters
    ----------
    series : pandas.DataFrame
        Frame with ``series_id`` and ``night`` columns.  The first two columns
        are dropped by position, so those two are expected to come first.
    device : str or torch.device, optional
        Device the tensors are created on (default ``'cuda'``, as before).

    Returns
    -------
    dict
        ``{(series_id, night): tensor}`` where each tensor holds the remaining
        columns of that group.
    """
    return {
        (sid, night): torch.tensor(group.iloc[:, 2:].values, dtype = torch.float32, device = device)
        for (sid, night), group in series.groupby(['series_id', 'night'])
    }
    
def df_to_tensor(df, device = 'cuda'):
    """Return the values of ``df`` as a single float32 tensor.

    ``device`` selects where the tensor is created (default ``'cuda'``, as before).
    """
    return torch.tensor(df.values, dtype = torch.float32, device = device)

# One tensor per (series_id, night) for the series and for the events.
train_t, val_t = series_to_tensor(train_series), series_to_tensor(val_series)
train_e, val_e = series_to_tensor(train_events), series_to_tensor(val_events)

# common_ppro drops incomplete rows and then numbers the series ids by order of
# first appearance.  Build the id list the same way so that test_id_list[i] is
# the original id of series code i; a set has no guaranteed order and could
# attach predictions to the wrong series_id.
test_id_list = raw_test_series.dropna()['series_id'].unique().tolist()
test_series = common_ppro(raw_test_series)
test_t = df_to_tensor(test_series)
# test_t columns : series_id, step, anglez, enmo

In [None]:
# preprocessing : every step below is applied separately to each (series_id, night) pair
# access a single night's tensor with train_t[(series_id, night)]

## frequency space : short time fourier transform

def label(step, event, device = 'cuda'):
    """Mark which steps fall inside the annotated sleep interval.

    Parameters
    ----------
    step : torch.Tensor
        Step value of every window.
    event : torch.Tensor
        Event rows for one night; column 0 is the step and column 1 is the
        onset flag.  With two rows, row 0 is taken as the onset and row 1 as
        the wakeup.  Otherwise only row 0 is read: an onset leaves the interval
        open to the right, anything else leaves it open to the left.
    device : str or torch.device, optional
        Device of the returned tensor (default ``'cuda'``, as before).

    Returns
    -------
    torch.Tensor
        float32 tensor shaped like ``step``; 1.0 where
        ``onset <= step <= wakeup`` and 0.0 elsewhere.
    """
    if event.size(0) == 2:
        onset_s, wakeup_s = event[0][0], event[1][0]
    elif event[0][1] == 1.0: # event_onset only
        onset_s, wakeup_s = event[0][0], float('inf')
    else: # wakeup only
        onset_s, wakeup_s = float('-inf'), event[0][0]

    label_b = (step >= onset_s) & (step <= wakeup_s)
    return label_b.to(dtype = torch.float32, device = device)

def segment(data, wdw_size, hop_size):
    """Cut a time series into overlapping windows without copying the data.

    Parameters
    ----------
    data : torch.Tensor
        ``N x C`` tensor; dim 0 is time, dim 1 the per-step columns
        (step, anglez, enmo in this notebook).
    wdw_size : int
        Number of time steps per window.
    hop_size : int
        Offset in time steps between the starts of consecutive windows.

    Returns
    -------
    torch.Tensor
        ``wdw_cnt x wdw_size x C`` strided view of ``data``; window ``k``
        covers rows ``k * hop_size`` to ``k * hop_size + wdw_size - 1``.
        Empty (``0 x wdw_size x C``) when ``N < wdw_size``.
    """
    n_rows, n_cols = data.size(0), data.size(1)
    if n_rows < wdw_size:
        # Not even one full window fits; a strided view would read out of bounds.
        return data.new_empty((0, wdw_size, n_cols))

    # Number of full windows.  The previous "- 1" dropped the last window
    # whenever (N - wdw_size) was an exact multiple of hop_size.
    wdw_cnt = (n_rows - wdw_size) // hop_size + 1

    # Use the tensor's own strides instead of assuming 3 contiguous columns.
    row_stride, col_stride = data.stride()
    shape = (wdw_cnt, wdw_size, n_cols)
    stride = (hop_size * row_stride, row_stride, col_stride)
    return torch.as_strided(data, shape, stride)

def slidevar(tensor, window_size = 4, device = 'cuda'):
    """Block-wise variance along dim 0.

    Rows are processed in blocks of ``window_size``.  For a block starting at
    row ``i`` the population variance (``unbiased=False``) is taken over rows
    ``i`` to ``end_idx + window_size - 2`` and written to rows ``i`` to
    ``end_idx - 1``, where ``end_idx = min(i + window_size, N - window_size + 1)``.
    Rows that no block reaches stay zero.

    Parameters
    ----------
    tensor : torch.Tensor
        Input whose first dimension is the one the variance runs along.
    window_size : int, optional
        Block length (default 4).
    device : str or torch.device, optional
        Device of the result (default ``'cuda'``, as before).

    Returns
    -------
    torch.Tensor
        float32 tensor with the same shape as ``tensor``.
    """
    var = torch.zeros(tensor.size(), device = device, dtype = torch.float32)
    last_start = tensor.size(0) - window_size + 1

    for i in range(0, last_start, window_size):
        end_idx = min(i + window_size, last_start)
        window = tensor[i:end_idx + window_size - 1]
        # the per-column variance is broadcast over all rows of the block
        var[i:end_idx] = torch.var(window, dim = 0, unbiased = False)

    return var

def stft(seg_data):
    """Turn windowed samples into per-window frequency features.

    ``seg_data`` is indexed as (window, time step, column) with columns
    step, anglez, enmo.

    Returns a tensor with one row per window: column 0 is the mean step of
    the window, followed by ``slidevar`` of the anglez FFT magnitudes and then
    ``slidevar`` of the enmo FFT magnitudes.
    """
    # representative step of each window, as a column vector
    step_col = seg_data[:, :, 0].mean(dim = 1).view(-1, 1)

    # magnitude spectrum of each window, one row per window
    anglez_mag = torch.fft.fft(seg_data[:, :, 1], dim = 1).abs()
    enmo_mag = torch.fft.fft(seg_data[:, :, 2], dim = 1).abs()

    features = torch.cat((step_col, slidevar(anglez_mag), slidevar(enmo_mag)), dim = 1)

    # Scale by the column mean, separately for every dataset passed in.
    # NOTE(review): only columns 1 and 2 are scaled, i.e. the first two anglez
    # columns -- confirm whether every feature column was meant to be scaled.
    for col in (1, 2):
        features[:, col] = features[:, col] / torch.mean(features[:, col])

    return features

def train_f(datadict, event):
    """Build features and labels for every ``(series_id, night)`` entry.

    ``datadict`` maps ``(sid, night)`` to the raw tensor of that night and
    ``event`` maps the same keys to the event rows, expected in the form
    ((step, 1, 0), (step, 0, 1)).

    Returns ``(tensors, labels)``: two dicts with the keys of ``datadict``
    holding the ``stft`` features and the per-window labels (one column).
    """
    wdw_size = 8  # empirical average of movement time : 8 steps
    hop_size = 6  # differs from wdw_size so that windows overlap at the boundaries
    tensors, labels = {}, {}

    for sid, night in tqdm(datadict, desc = "Processing"):
        key = (sid, night)
        windows = segment(datadict[key].contiguous(), wdw_size, hop_size)
        feats = stft(windows)
        # feats[:, 0] is the column of steps
        window_labels = label(feats[:, 0], event[key]).view(-1, 1)
        tensors[key] = feats
        labels[key] = window_labels

    return tensors, labels

# Windowed frequency features (x) and per-window sleep labels (y) for every
# (series_id, night) of the training and validation splits.
x_train_seg, y_train_seg = train_f(train_t, train_e)
x_val_seg, y_val_seg = train_f(val_t, val_e)

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

# training
## hyperparameters (module-level; read by train_model and the DataLoader below)
epoch_n = 20 # passes over the training loader
eta_p = 0.1 # learning rate
batch_size = 100 # nights per batch
C = 0 #0.001 -- weight of the L2 penalty on the linear layer (0 disables it)

class Data(Dataset):
    """Dataset of per-night feature tensors paired with their label tensors.

    Parameters
    ----------
    series_data : dict
        Maps a key (here ``(series_id, night)``) to a feature tensor.
    event_data : dict
        Maps the same keys to the matching label tensor.

    Only keys present in both dicts are served, in ``event_data`` order, so
    ``len()`` and indexing always refer to the same set of items.
    """

    def __init__(self, series_data, event_data):
        self.series_data = series_data
        self.event_data = event_data
        # Counting series_data while indexing event_data keys could go out of
        # range or hit a missing key when the two dicts differ.
        self.keys = [key for key in event_data.keys() if key in series_data]

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(series, event, length)`` with ``length = series.size(0)``."""
        key = self.keys[idx]
        series = self.series_data[key]
        event = self.event_data[key]
        return series, event, series.size(0)

def collate(batch, device = 'cuda'):
    """Collate ``(series, event, length)`` items into zero-padded batch tensors.

    Parameters
    ----------
    batch : list of tuple
        Items as returned by ``Data.__getitem__``.
    device : str or torch.device, optional
        Device for the ``lengths`` tensor (default ``'cuda'``, as before).

    Returns
    -------
    tuple
        ``(series_padded, events_padded, lengths)``; the first two are padded
        with zeros to the longest sequence (batch first) and ``lengths`` holds
        the original lengths as float32.
    """
    series, events, lengths = zip(*batch)
    series_padded = pad_sequence(series, batch_first = True)
    events_padded = pad_sequence(events, batch_first = True)

    lengths = torch.tensor(lengths, dtype = torch.float32, device = device)

    return series_padded, events_padded, lengths

# Batches of whole nights; collate zero-pads every night to the longest one in the batch.
train_dataset = Data(x_train_seg, y_train_seg)
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate)

# The string below is disabled code kept for reference (an output-variance
# heuristic); as a bare string expression it has no effect when the cell runs.
"""# heuristic : variance / frequency space of output is relevant
def variance(tensor, window_size = 5):
    var = torch.zeros_like(tensor).to(dtype = torch.float32, device = 'cuda')

    # Compute variance for each window
    for i in range(tensor.size(0) - window_size):
        window = tensor[i:i + window_size]
        var[i] = torch.var(window, unbiased = False)  # Set unbiased to False for sample variance

    return var

reg = 50000 # uniform regularisation of outputs; replace with regularisation of initial frequency data?
thres = 1e-4"""

class SVM(nn.Module):
    """Linear scorer: a single ``nn.Linear(input_dim, 1)`` layer.

    Parameters
    ----------
    input_dim : int
        Number of input features per window.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.lin = nn.Linear(input_dim, 1) # 1 class

    def forward(self, x):
        """Return the raw linear score, shape ``(..., 1)``.

        The result stays on the device of the layer and its input; the former
        forced move to ``'cuda'`` was a no-op for a GPU model and made the
        module unusable on a machine without a GPU.
        """
        return self.lin(x)
    
def train_model(model, optimiser, loader):
    """Train ``model`` for ``epoch_n`` epochs and return it.

    Reads the module-level ``epoch_n`` (number of epochs) and ``C`` (weight of
    the L2 penalty on ``model.lin.weight``).

    Parameters
    ----------
    model : nn.Module
        Model with a ``lin`` linear layer (see ``SVM``).
    optimiser : torch.optim.Optimizer
        Optimiser built over ``model.parameters()``.
    loader : iterable
        Yields ``(data, target, lengths)`` batches; column 0 of ``data`` (the
        step) is dropped before the forward pass.

    Returns
    -------
    nn.Module
        The same ``model`` instance, trained in place.
    """
    # Train on whichever device the model lives on instead of assuming 'cuda'.
    device = next(model.parameters()).device
    model.train()
    for epoch in range(epoch_n):
        total_loss = 0.0
        for data, target, _lengths in loader:
            data, target = data.to(device = device)[:,:,1:], target.to(device = device)
            optimiser.zero_grad()
            output = model(data)

            # NOTE(review): wherever target is 0 (awake labels and zero padding)
            # this term is the constant 1 and gives no gradient; a standard
            # hinge loss expects targets in {-1, +1}.  Left unchanged because
            # the decision threshold used downstream was tuned against it.
            hinge_loss = torch.mean(torch.clamp(1 - output * target, min=0))
            loss = hinge_loss + C * torch.norm(model.lin.weight, 2)

            loss.backward()
            optimiser.step()

            # detach so the running total does not keep autograd history alive
            total_loss += loss.detach()
        #print(f'Epoch [{epoch+1}/{epoch_n}], Loss: {total_loss/len(loader)}')

    return model

In [None]:
def classifier(data):
    """Binarise model scores: 1.0 where the score is below the module-level
    ``thres``, 0.0 elsewhere (float32 tensor on the GPU)."""
    # open question kept from the original: widen detected sleep by the
    # variance window, i.e. treat "any value below threshold within +-2" as sleep
    below = data < thres
    return below.int().to(dtype = torch.float32, device = 'cuda')

def intervals_f(data):
    """Return the runs of ones in a 1-D 0/1 tensor.

    Parameters
    ----------
    data : torch.Tensor
        1-D tensor of zeros and ones.

    Returns
    -------
    list of tuple
        ``(start, end)`` index pairs, both inclusive, one per run of ones, in
        order of appearance.  Empty when there are no ones or no data.
    """
    if data.numel() == 0:
        return []

    # Padding with the edge values makes the first and last difference zero,
    # so runs touching either border are added explicitly below.
    diff = torch.diff(data, prepend = data[0:1], append = data[-1:])
    starts = (diff == 1).nonzero(as_tuple=True)[0]
    # diff == -1 marks the first zero after a run; step back one position so
    # the end is inclusive, like the end of a run that reaches the last element.
    ends = (diff == -1).nonzero(as_tuple=True)[0] - 1

    if data[0] == 1:
        first = torch.zeros(1, dtype = starts.dtype, device = starts.device)
        starts = torch.cat((first, starts))
    if data[-1] == 1:
        last = torch.tensor([len(data) - 1], dtype = ends.dtype, device = ends.device)
        ends = torch.cat((ends, last))

    return list(zip(starts.tolist(), ends.tolist()))

def rm_dstrb(data, max_len_dstrb): # remove disturbances
    """Keep only the longest sleep interval after bridging short wake gaps.

    Neighbouring intervals of ones whose distance (next start minus current
    end) is below ``max_len_dstrb`` are merged; the longest merged interval is
    kept and everything else is zeroed.  On a tie the earliest interval wins.

    Parameters
    ----------
    data : torch.Tensor
        1-D 0/1 tensor.
    max_len_dstrb : float
        Gaps shorter than this are bridged.

    Returns
    -------
    torch.Tensor
        float32 tensor shaped like ``data``, on the same device, with ones
        over the kept interval only.  All zeros when ``data`` has no ones.
    """
    new_data = torch.zeros(data.shape, dtype = torch.float32, device = data.device)

    intervals = intervals_f(data)
    if not intervals:
        return new_data

    merged = []
    cur_start, cur_end = intervals[0]
    for start, end in intervals[1:]:
        if start - cur_end < max_len_dstrb:
            # bridge the gap: extend to the END of the next interval
            cur_end = end
        else:
            merged.append((cur_start, cur_end))
            cur_start, cur_end = start, end
    # the interval still open when the loop ends competes as well
    merged.append((cur_start, cur_end))

    best_start, best_end = max(merged, key = lambda iv: iv[1] - iv[0])
    new_data[best_start:best_end + 1] = 1

    return new_data

#def sleepinterval(data):
    # obtain sleep interval from raw data, in order to obtain confidence interval
    # consider size of window used in variance above; add 2 to each ends of sleep intervals

def confuse(sid, night, true, pred):
    """Confusion-matrix rates for one night.

    ``sid`` and ``night`` are accepted only to identify the night and are not
    used in the computation.

    Returns ``(t_pos, f_pos, f_neg, t_neg, accuracy)``: the four rates are
    tensors holding fractions of ``true.size(0)`` and ``accuracy`` is
    ``t_pos + t_neg`` as a Python float rounded to two decimals.
    """
    total = true.size(0)
    is_pos, is_neg = (true == 1), (true == 0)
    said_pos, said_neg = (pred == 1), (pred == 0)

    t_pos = torch.sum(is_pos & said_pos) / total
    f_pos = torch.sum(is_neg & said_pos) / total
    f_neg = torch.sum(is_pos & said_neg) / total
    t_neg = torch.sum(is_neg & said_neg) / total

    accuracy = round((t_pos + t_neg).item(), 2)

    return t_pos, f_pos, f_neg, t_neg, accuracy
    
def val_model(model, predictor, label):
    """Sum the per-night accuracy of ``model`` after disturbance removal.

    Reads the module-level ``dstrb_ratio``: the gap length handed to
    ``rm_dstrb`` is ``dstrb_ratio`` times the number of windows of the night.

    Parameters
    ----------
    model : nn.Module
        Trained scorer; it receives the features without the step column.
    predictor : dict
        ``{(sid, night): features}`` with the step in column 0.
    label : dict
        ``{(sid, night): labels}`` for the same keys.

    Returns
    -------
    float
        Sum, not mean, of the per-night accuracies; divide by
        ``len(predictor)`` for the average.
    """
    total_acc = 0
    with torch.no_grad():  # evaluation only, no autograd bookkeeping needed
        for sid, night in predictor:
            x = predictor[(sid, night)]
            # reshape(-1) rather than squeeze, so a one-window night stays 1-D
            y = label[(sid, night)].reshape(-1)

            output = classifier(model(x[:, 1:])).reshape(-1)
            # pad after flattening so the padding runs along the time axis
            output = torch.nn.functional.pad(output, (0, len(y) - len(output)), 'constant', 0)
            doutput = rm_dstrb(output, x.size(0) * dstrb_ratio)
            _, _, _, _, b_acc = confuse(sid, night, y, doutput)

            total_acc += b_acc

    return total_acc

# Disabled code kept for reference: a batched validation loader.  As a bare
# string expression it has no effect when the cell runs.
"""val_dataset = Data(x_val_seg, y_val_seg)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True, collate_fn = collate)"""

In [None]:
# 16 input features: the 17 columns produced by stft minus the step column.
model = SVM(16).to('cuda')
optimiser = optim.SGD(model.parameters(), lr = eta_p)
model = train_model(model, optimiser, train_loader)

# Module-level settings read by val_model / test_model (dstrb_ratio) and by
# classifier (thres).  The strings below are disabled parameter sweeps kept
# for reference; they have no effect when the cell runs.
dstrb_ratio = 0.044 # 30 minutes or less of wake between sleeps can be inverted
"""for i in range(10, 20, 1):
    thres = i/10
    try:
        total_acc = val_model(model, x_val_seg, y_val_seg)
    except:
        print(i, 'oops')
    print(i/10, total_acc/len(x_val_seg))"""

thres = 1.1
"""for i in range(40, 60, 1):
    dstrb_ratio = i / 1000
    total_acc = val_model(model, x_val_seg, y_val_seg)
    print(i/1000, total_acc/len(x_val_seg))"""

# Mean per-night validation accuracy (val_model returns the sum).
total_acc = val_model(model, x_val_seg, y_val_seg)
print(total_acc/len(x_val_seg))

In [None]:
def test_model(model, series):
    """Predict one (onset, wakeup) step pair for every series in ``series``.

    Reads the module-level ``dstrb_ratio``.

    Parameters
    ----------
    model : nn.Module
        Trained scorer.
    series : torch.Tensor
        Rows of (series code, step, anglez, enmo).

    Returns
    -------
    dict
        ``{series code (int): (onset_step, wakeup_step)}``, both rounded to
        whole steps.

    Raises
    ------
    IndexError
        If no sleep interval is detected for a series.
    """
    results = {}
    ## empirical average of movement time : 8 steps
    wdw_size = 8
    ## hop between time segments; not equal to wdw_size to prevent boundary issues
    hop_size = 6

    with torch.no_grad():  # inference only
        # Every series present in the data (sorted codes), not a fixed count of 3.
        for sid in torch.unique(series[:, 0]).tolist():
            data = series[series[:, 0] == sid][:, 1:]
            x = stft(segment(data.contiguous(), wdw_size, hop_size))
            output = classifier(model(x[:, 1:])).reshape(-1)
            doutput = rm_dstrb(output, x.size(0) * dstrb_ratio)

            # Use the cleaned prediction; previously it was computed and then
            # ignored in favour of the first raw interval.
            intv = intervals_f(doutput)[0]
            results[int(sid)] = (round(x[intv[0]][0].item(), 0), round(x[intv[1]][0].item(), 0))

    return results

# Train `itern` freshly initialised models and collect the test predictions of
# each one; the spread across runs is turned into a score in the next cell.
itern = 40

total_results = []

for _ in range(itern):
    model = SVM(16).to('cuda')
    optimiser = optim.SGD(model.parameters(), lr = eta_p)
    model = train_model(model, optimiser, train_loader)
    
    results = test_model(model, test_t)
    
    total_results.append(results)

In [None]:
# Aggregate the per-run predictions into one onset row and one wakeup row per
# series: step = floor of the mean over runs, score = a value in (0, 1] that
# shrinks as the predictions of the runs spread out.
sizsiz = len(total_results[0])
run_cnt = len(total_results)

# steps[r, i, 0] / steps[r, i, 1] : onset / wakeup step of series i in run r
steps = np.array([[run[i] for i in range(sizsiz)] for run in total_results], dtype = float)
steps = steps.reshape(run_cnt, sizsiz, 2)

step_sum = steps.sum(axis = 0)
# Squared deviations from the final mean over all runs.  The previous running
# mean depended on run order and always counted the first run as zero deviation.
sq_dev = ((steps - step_sum / run_cnt) ** 2).sum(axis = 0)

# Flatten to the row order: series 0 onset, series 0 wakeup, series 1 onset, ...
ids = [test_id_list[i] for i in range(sizsiz) for _ in range(2)]
event = ['onset', 'wakeup'] * sizsiz
avg = step_sum.reshape(-1) // run_cnt
err = 1 - np.arctan(np.sqrt(sq_dev.reshape(-1)) / run_cnt)/(np.pi / 2)

data = {
    'series_id' : ids,
    'step' : avg,
    'event' : event,
    'score' : err
}

finalframe = pd.DataFrame(data)
finalframe.index.name = 'row_id'

print(finalframe)

finalframe.to_csv('/kaggle/working/submission.csv')