In [1]:
import gc
import numpy as np
import pandas as pd
import polars as pl
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from scipy.signal import find_peaks

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchinfo import summary
import os

In [2]:
# --- Notebook configuration -------------------------------------------------
# Directory that holds train_series.parquet and train_events.csv.
INPUT_DIR = "../"
# Name of the model run; checkpoints and df_mask.parquet are read from MODEL_PATH.
MODELS = "1layer_cosine"
MODEL_PATH = MODELS + "/"
# MODEL_PATH = "/kaggle/input/cmi-classifier/"
# Number of folds: one checkpoint per (seed index, fold) is loaded at inference time.
n_splits = 5
# NOTE(review): n_epochs is not referenced by any cell visible in this notebook.
n_epochs = 50
batch_size = 16
# Feature columns produced by data_process() and stacked as model input channels.
features = ["log_anglez_std", 
            "log_enmo", 
            "valid_flag", 
            "min_mod_15_plus_1",
#             "hour_sin",
#             "hour_cos",
#             "day_of_week"
            ]
# Output channels per input feature for the first (grouped) convolution.
filters = 24
initial_channels_num = len(features) * filters
# Used as a COUNT of seed-indexed checkpoints (range(seed)), not as an RNG seed.
seed = 2
# Selects the stem of UNet1d: 1 = single grouped conv, 2 = grouped conv + dense conv.
layer = 1

In [3]:
# Read only a sample of rows: it is used solely to enumerate the distinct
# time-of-day labels, which become the columns of the empty template below.
df = pl.read_parquet(INPUT_DIR + "train_series.parquet", n_rows=200000)
df = (
    df.with_columns(
        pl.col("timestamp").str.to_datetime(),
    ).with_columns(
        pl.col("timestamp").dt.date().cast(str).alias("date"),
        pl.col("timestamp").dt.time().cast(str).alias("time"),
    )
).to_pandas()

# Zero-row frame with one column per observed time label; data_process() concatenates
# it with per-series pivots so that every pivot ends up with the full column set.
df_dummy_1 = pd.DataFrame(columns=sorted(df["time"].unique()), dtype="float32")
display(df_dummy_1)

# Zero-row template for the 1-minute mask, taken from the mask saved next to the models.
try:
    df_dummy_2 = pd.read_parquet(MODEL_PATH + "df_mask.parquet").iloc[:0]
except OSError:
    # The saved mask is missing/unreadable. Previously a bare `except: pass` left
    # df_dummy_2 undefined and the display() below failed with a NameError.
    # Build an equivalent empty template instead: (series_id, date) index and one
    # "HH:MM:00" column per minute of the day.
    # NOTE(review): layout mirrors the saved mask as displayed by this notebook;
    # confirm against the notebook that writes df_mask.parquet.
    df_dummy_2 = pd.DataFrame(
        index=pd.MultiIndex.from_arrays([[], []], names=["series_id", "date"]),
        columns=pd.Index(
            [f"{hour:02d}:{minute:02d}:00" for hour in range(24) for minute in range(60)],
            name="time",
        ),
        dtype="float32",
    )

display(df_dummy_2)

Unnamed: 0,00:00:00,00:00:05,00:00:10,00:00:15,00:00:20,00:00:25,00:00:30,00:00:35,00:00:40,00:00:45,...,23:59:10,23:59:15,23:59:20,23:59:25,23:59:30,23:59:35,23:59:40,23:59:45,23:59:50,23:59:55


Unnamed: 0_level_0,time,00:00:00,00:01:00,00:02:00,00:03:00,00:04:00,00:05:00,00:06:00,00:07:00,00:08:00,00:09:00,...,23:50:00,23:51:00,23:52:00,23:53:00,23:54:00,23:55:00,23:56:00,23:57:00,23:58:00,23:59:00
series_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1


In [4]:
# Load every row of the series file. Despite the "_test" suffix this reads
# train_series.parquet, i.e. inference is run over the training series.
df_test = pl.read_parquet(f"{INPUT_DIR}train_series.parquet")

In [5]:
def data_process(df_series):
    """Build per-day feature arrays and 1-minute bookkeeping frames for every series.

    Relies on the notebook-level names ``features`` (feature columns to export) and
    ``df_dummy_1`` (zero-row frame carrying the full set of time-of-day columns).

    Parameters
    ----------
    df_series : polars.DataFrame
        Rows with at least ``series_id``, ``step``, ``timestamp`` (string),
        ``anglez`` and ``enmo`` columns.

    Returns
    -------
    tuple
        ``(list_df_1min, df_1min, list_feature_array, dict_valid_ratio)`` where
        ``list_df_1min`` holds one 1-minute-resampled pandas frame per series,
        ``df_1min`` is the frame of the last processed series (``None`` when the
        input contains no series), ``list_feature_array`` holds one array per
        series stacked as (day, feature, time column), and ``dict_valid_ratio``
        maps each series id to the mean of its ``valid_flag``.
    """
    df_groupby = df_series.group_by("series_id", maintain_order=True)
    # Number of time columns borrowed from the previous/next day on each side.
    margin = 180 * 12

    dict_valid_ratio = dict()
    list_feature_array = []
    list_df_1min = []
    df_1min = None  # stays None when df_series contains no series at all
    for series_id, df in tqdm(df_groupby, total=df_series.get_column("series_id").n_unique()):
        # Depending on the polars version the group key is a plain value or a
        # 1-tuple; normalise it so dict_valid_ratio is always keyed by the id itself.
        if isinstance(series_id, tuple):
            series_id = series_id[0]

        df = (
            df.with_columns(
                pl.col("timestamp").str.to_datetime(),
            ).with_columns(
                pl.col("timestamp").dt.date().cast(str).alias("date"),
                pl.col("timestamp").dt.time().cast(str).alias("time"),
            )
        ).to_pandas()

        df["timestamp"] = df["timestamp"].dt.tz_localize(None)
        # A row is flagged valid when its (anglez, enmo, time) combination occurs
        # exactly once within the series; repeated combinations get flag 0.
        dup_count = df.groupby(["anglez", "enmo", "time"])["step"].transform("count")
        df["valid_flag"] = (dup_count == 1).astype("float32")
        dict_valid_ratio[series_id] = df["valid_flag"].mean()

        # feature engineering
        list_feature_array_tmp = []
        # add 1 so the logarithm never sees values close to 0
        df["log_anglez_std"] = np.log(df["anglez"].rolling(25, min_periods=1, center=True).std() + 1).astype("float32")
        # add 0.01 so the logarithm never sees values close to 0
        df["log_enmo"] = np.log(df["enmo"] + 0.01).astype("float32")

        df["min_mod_15_plus_1"] = (df["timestamp"].dt.minute % 15 + 1).astype("float32")
        df["day_of_week"] = (df["timestamp"].dt.dayofweek + 1).astype("float32")

        # One row per (series_id, date) and one column per time label. Each day is
        # extended with the tail of the previous day and the head of the next day.
        for feature in features:
            df_pivot = df.pivot(index=["series_id", "date"], columns="time", values=feature)
            if df_pivot.shape[1] != df_dummy_1.shape[1]:
                # Some time labels never occur in this series: align to the full column set.
                df_pivot = pd.concat([df_dummy_1, df_pivot])
            feature_array = df_pivot.fillna(0).values
            feature_array_1day_before = df_pivot.shift(1).fillna(0).values
            feature_array_1day_after = df_pivot.shift(-1).fillna(0).values
            feature_array = np.concatenate(
                [feature_array_1day_before[:, -margin:], feature_array, feature_array_1day_after[:, :margin]],
                axis=1,
            )
            list_feature_array_tmp.append(feature_array)
        list_feature_array.append(np.stack(list_feature_array_tmp, axis=1))

        # Down-sample to one row per minute: first id/date/time, mean step, max valid flag.
        dict_agg = {"series_id": "first", "date": "first", "time": "first", "step": "mean", "valid_flag": "max"}
        df_1min = df.resample("1min", on="timestamp").agg(dict_agg).reset_index()
        df_1min["step"] = df_1min["step"].astype("int32")
        list_df_1min.append(df_1min)

    return list_df_1min, df_1min, list_feature_array, dict_valid_ratio

In [6]:
# Feature extraction over every series in df_test (one progress-bar tick per series).
list_df_1min_test, df_1min_test, list_feature_array_test, dict_valid_ratio_test = data_process(df_test)

  0%|          | 0/277 [00:00<?, ?it/s]

In [7]:
# Stack the per-series arrays along the first axis (all days of all series).
X_test = np.concatenate(list_feature_array_test)
# Min-max scale each feature channel (axis 1 is kept) using the statistics of this data itself.
# NOTE(review): a channel that is constant over the whole array gives max == min and
# therefore a division by zero here — confirm that cannot happen for the chosen features.
X_test = (X_test - X_test.min(axis=(0, 2), keepdims=True)) / (X_test.max(axis=(0, 2), keepdims=True) - X_test.min(axis=(0, 2), keepdims=True))

# One row per (series_id, date), one column per 1-minute time label, holding valid_flag.
df_1min_test = pd.concat(list_df_1min_test)
df_mask_test = df_1min_test.pivot(index=["series_id", "date"], columns="time", values="valid_flag").fillna(0)
if df_mask_test.shape[1] != df_dummy_2.shape[1]:
    # Align to the column set of the saved mask template; missing minutes become 0.
    df_mask_test = pd.concat([df_dummy_2, df_mask_test]).fillna(0)
    
# Free the large intermediates; `df` is the sample frame created when building df_dummy_1.
del list_df_1min_test, list_feature_array_test, df
gc.collect()

23

In [8]:
class MyDataset(Dataset):
    """Tensor-backed dataset yielding ``(X, Y, flag)`` triples.

    Parameters
    ----------
    X : array-like
        Model inputs; indexed along the first axis.
    Y : array-like or None
        Targets. When ``None`` no ``Y`` attribute is created and every item
        carries an empty tensor in the target slot.
    flag : array-like
        Per-item mask returned alongside the inputs.
    """

    def __init__(self, X, Y, flag):
        self.X = torch.FloatTensor(X)
        if Y is not None:
            self.Y = torch.FloatTensor(Y)
        self.flag = torch.FloatTensor(flag)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # hasattr() is a single attribute lookup; the previous `"Y" in dir(self)`
        # rebuilt and sorted the full attribute list for every item fetched.
        if hasattr(self, "Y"):
            return (self.X[idx], self.Y[idx], self.flag[idx])
        # Unlabelled data: keep the 3-tuple shape with an empty placeholder target.
        return (self.X[idx], torch.Tensor(), self.flag[idx])
        
class EarlyStopping:
    """Track a validation loss and raise a flag after too many non-improving calls.

    A call counts as an improvement unless ``val_loss`` is greater than
    ``best_loss - min_delta``. After ``patience`` non-improving calls since the
    last improvement, ``early_stop`` becomes ``True``.
    """

    def __init__(self, patience=20, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        # First observation only establishes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        required = self.best_loss - self.min_delta
        if val_loss > required:
            # Not good enough: one more strike, possibly the last one.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # Improvement: new baseline, strikes cleared.
        self.best_loss = val_loss
        self.counter = 0

In [9]:
class ConvBNReLU(nn.Module):
    """1-D convolution followed by batch normalisation and ReLU.

    With ``stride == 1`` the convolution uses ``"same"`` padding; for any other
    stride the padding is ``(kernel_size - stride) // 2``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1):
        super().__init__()

        padding = "same" if stride == 1 else (kernel_size - stride) // 2
        conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
        )
        norm = nn.BatchNorm1d(out_channels)
        # Attribute name and module order define the state_dict keys of saved checkpoints.
        self.layers = nn.Sequential(conv, norm, nn.ReLU())

    def forward(self, x):
        return self.layers(x)


class SEBlock(nn.Module):
    """Channel gate: global average pool, two 1x1 convolutions, sigmoid, then rescale.

    The hidden width of the gate is ``n_channels // se_ratio``.
    """

    def __init__(self, n_channels, se_ratio):
        super().__init__()

        hidden = n_channels // se_ratio
        self.layers = nn.Sequential(
            nn.AdaptiveAvgPool1d(output_size=1),  # global average pooling
            nn.Conv1d(n_channels, hidden, kernel_size=1),
            nn.ReLU(),
            nn.Conv1d(hidden, n_channels, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        # One gate value per channel, broadcast over the length axis.
        gate = self.layers(x)
        return torch.mul(x, gate)


class ResBlock(nn.Module):
    """Residual block: two stride-1 ConvBNReLU stages and an SEBlock, added to the input."""

    def __init__(self, n_channels, kernel_size, se_ratio):
        super().__init__()

        first_conv = ConvBNReLU(n_channels, n_channels, kernel_size, stride=1)
        second_conv = ConvBNReLU(n_channels, n_channels, kernel_size, stride=1)
        channel_gate = SEBlock(n_channels, se_ratio)
        # Attribute name and module order define the state_dict keys of saved checkpoints.
        self.layers = nn.Sequential(first_conv, second_conv, channel_gate)

    def forward(self, x):
        # Identity shortcut around the convolutional branch.
        return x + self.layers(x)
    

class UNet1d(nn.Module):
    """1-D U-Net style network with strided down stages and interpolated up stages.

    The stem variant is chosen by the notebook-level ``layer`` setting
    (1: one grouped convolution, 2: grouped convolution followed by a dense one).

    Parameters
    ----------
    input_channels : int
        Channels of the raw input; also the group count of the stem convolution.
    initial_channels : int
        Output channels of the stem.
    initial_kernel_size : int
        Kernel size of the stem convolution(s).
    down_channels, down_kernel_size, down_stride : sequences
        Per-stage output channels, kernel size and stride of the down path.
    res_depth : int
        Number of ResBlocks appended to each down stage.
    res_kernel_size : int
        Kernel size inside the ResBlocks.
    se_ratio : int
        Reduction ratio of the SEBlocks.
    out_kernel_size : int
        Kernel size of the final single-channel convolution.

    Raises
    ------
    ValueError
        If the notebook-level ``layer`` is neither 1 nor 2.
    """

    def __init__(self, input_channels, initial_channels, initial_kernel_size,
                 down_channels, down_kernel_size, down_stride, res_depth, res_kernel_size, se_ratio, out_kernel_size):
        super().__init__()
        self.down_kernel_size = down_kernel_size
        self.down_stride = down_stride

        if layer == 1:
            self.initial_layers = ConvBNReLU(input_channels, initial_channels, initial_kernel_size, stride=1, groups=input_channels)
        elif layer == 2:
            self.initial_layers = nn.Sequential(
                ConvBNReLU(input_channels, initial_channels, initial_kernel_size, stride=1, groups=input_channels),
                ConvBNReLU(initial_channels, initial_channels, initial_kernel_size, stride=1, groups=1)
            )
        else:
            # Fail at construction time instead of with an AttributeError inside forward().
            raise ValueError(f"unsupported layer setting: {layer!r} (expected 1 or 2)")

        self.down_layers = nn.ModuleList()
        for i in range(len(down_channels)):
            if i == 0:
                in_channels = initial_channels
            else:
                # forward() concatenates the pooled raw input onto the previous stage output.
                in_channels = down_channels[i-1] + input_channels
            out_channels = down_channels[i]
            kernel_size = down_kernel_size[i]
            stride = down_stride[i]

            block = []
            block.append(ConvBNReLU(in_channels, out_channels, kernel_size, stride))
            for j in range(res_depth):
                block.append(ResBlock(out_channels, res_kernel_size, se_ratio))
            self.down_layers.append(nn.Sequential(*block))

        self.up_layers = nn.ModuleList()
        for i in range(len(down_channels)-1, 0, -1):
            # The skip tensor joined in forward() is the output of down stage i-1, so its
            # channel count is down_channels[i-1]. The previous down_channels[i] only
            # worked because all stages shared one width; shapes are unchanged in that case.
            in_channels = out_channels + down_channels[i-1]
            out_channels = down_channels[i]
            kernel_size = down_kernel_size[i]
            self.up_layers.append(ConvBNReLU(in_channels, out_channels, kernel_size, stride=1))

        # The last up stage (i == 1) emits down_channels[1] channels.
        self.out_layers = nn.Conv1d(down_channels[1], 1, out_kernel_size, padding="same")

    def forward(self, x):
        """Return the single output channel with 180 positions cropped from each end."""
        outs = []
        x_avg = x
        x = self.initial_layers(x)

        for i in range(len(self.down_layers)):
            x_out = self.down_layers[i](x)
            if i == len(self.down_layers) - 1:
                x = x_out
            else:
                outs.append(x_out)
                # Pool the raw input with the same kernel/stride/padding as this stage
                # so it can be concatenated with the stage output.
                kernel_size = self.down_kernel_size[i]
                stride = self.down_stride[i]
                padding = (kernel_size - stride) // 2
                x_avg = F.avg_pool1d(x_avg, kernel_size, stride, padding)
                x = torch.cat([x_out, x_avg], dim=1)

        for i in range(len(self.up_layers)):
            # Undo the strides in reverse order and merge the matching skip tensor.
            scale_factor = self.down_stride[-i-1]
            x = F.interpolate(x, scale_factor=scale_factor, mode="linear")
            x = torch.cat([x, outs[-i-1]], dim=1)
            x = self.up_layers[i](x)

        x_out = self.out_layers(x)
        x_out = x_out[:, 0, 180:-180]

        return x_out

In [10]:
def train(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is indexed as ``(inputs, targets, mask)``; the model output is
    multiplied by the mask before the loss is computed. Returns ``None``.
    """
    model.train()

    for batch in data_loader:
        inputs, targets, mask = (batch[k].to(device) for k in range(3))

        masked_output = model(inputs) * mask
        batch_loss = criterion(masked_output, targets)

        # Standard step: clear stale gradients, back-propagate, update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()


def evaluate(model, data_loader, criterion, device):
    """Return the sample-weighted mean of ``criterion`` over ``data_loader``.

    The model is switched to eval mode and the forward pass runs without
    gradient tracking; outputs are multiplied by the batch mask before the loss.
    """
    model.eval()

    loss_sum = 0.0
    sample_count = 0
    for batch in data_loader:
        inputs, targets, mask = (batch[k].to(device) for k in range(3))

        with torch.no_grad():
            masked_output = model(inputs) * mask

        rows = inputs.shape[0]
        # Weight each batch loss by its size so a short final batch is not over-counted.
        loss_sum += criterion(masked_output, targets).item() * rows
        sample_count += rows

    return loss_sum / sample_count


def predict(model, data_loader, device):
    """Return the masked model outputs for every batch, concatenated as one NumPy array.

    Only the inputs (position 0) and the mask (position 2) of each batch are used.
    """
    model.eval()

    chunks = []
    for batch in data_loader:
        inputs = batch[0].to(device)
        mask = batch[2].to(device)

        with torch.no_grad():
            masked_output = model(inputs) * mask
        chunks.append(masked_output.cpu().numpy())

    return np.concatenate(chunks)

In [11]:
# Accumulators with the same shape as the mask: one per seed-indexed checkpoint group.
preds_test1 = np.zeros_like(df_mask_test.values)
preds_test2 = np.zeros_like(df_mask_test.values)

ds_test = MyDataset(X_test, None, df_mask_test.values)  # No labels (Y) for test dataset
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=False)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# The architecture arguments must match the ones used when the checkpoints were saved,
# otherwise load_state_dict() below fails.
model = UNet1d(
        input_channels=X_test.shape[1],
        initial_channels=initial_channels_num,
        initial_kernel_size=15,
        down_channels=(initial_channels_num, initial_channels_num, initial_channels_num),
        down_kernel_size=(12, 15, 15),
        down_stride=(12, 9, 5),  # first element must be 12
        res_depth=3,
        res_kernel_size=15,
        se_ratio=8,
        out_kernel_size=21,
    )

model.to(device)

# Average the fold checkpoints separately per seed index: k == 0 goes to preds_test1,
# every other k goes to preds_test2 (so the split is only clean when seed == 2).
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
for i in range(n_splits):
    for k in range(seed):
        with torch.no_grad():
            model.load_state_dict(torch.load(f"{MODEL_PATH}model_{k}_{i}.pth", map_location=device))
            if k == 0:
                preds_test1 += predict(model, dl_test, device) / n_splits
            else:
                preds_test2 += predict(model, dl_test, device) / n_splits

In [12]:
# Mean of the two per-seed fold averages (divides by `seed`, i.e. the number of seeds).
preds_test = (preds_test1 + preds_test2) / seed

def make_testset(preds_test):
    """Turn a (day x minute) prediction matrix into a long per-minute score table.

    The matrix is labelled with the index/columns of the notebook-level
    ``df_mask_test``, melted to one row per (series_id, date, time) and
    inner-joined onto ``df_1min_test`` to attach ``step``. The result is
    displayed and returned.
    """
    join_keys = ["series_id", "date", "time"]

    wide_scores = pd.DataFrame(preds_test, index=df_mask_test.index, columns=df_mask_test.columns)
    long_scores = wide_scores.stack().reset_index(name="score")

    minute_rows = df_1min_test[join_keys + ["step"]]
    df_pred_test = minute_rows.merge(long_scores, on=join_keys, how="inner")

    display(df_pred_test)
    return df_pred_test

# Long-format score tables for each per-seed fold average (each call also displays its table).
df_pred_test1 = make_testset(preds_test1)
df_pred_test2 = make_testset(preds_test2)

Unnamed: 0,series_id,date,time,step,score
0,038441c925bb,2018-08-14,19:30:00,5,-0.032158
1,038441c925bb,2018-08-14,19:31:00,17,-0.035108
2,038441c925bb,2018-08-14,19:32:00,29,-0.032920
3,038441c925bb,2018-08-14,19:33:00,41,-0.013051
4,038441c925bb,2018-08-14,19:34:00,53,-0.006413
...,...,...,...,...,...
10662190,fe90110788d2,2017-09-08,04:10:00,592325,0.010233
10662191,fe90110788d2,2017-09-08,04:11:00,592337,-0.001389
10662192,fe90110788d2,2017-09-08,04:12:00,592349,-0.016908
10662193,fe90110788d2,2017-09-08,04:13:00,592361,-0.036923


Unnamed: 0,series_id,date,time,step,score
0,038441c925bb,2018-08-14,19:30:00,5,0.019585
1,038441c925bb,2018-08-14,19:31:00,17,0.011272
2,038441c925bb,2018-08-14,19:32:00,29,0.021437
3,038441c925bb,2018-08-14,19:33:00,41,0.027391
4,038441c925bb,2018-08-14,19:34:00,53,0.027633
...,...,...,...,...,...
10662190,fe90110788d2,2017-09-08,04:10:00,592325,0.078752
10662191,fe90110788d2,2017-09-08,04:11:00,592337,0.063644
10662192,fe90110788d2,2017-09-08,04:12:00,592349,0.048547
10662193,fe90110788d2,2017-09-08,04:13:00,592361,0.046995


In [13]:
# without pp
def sub_without_pp(df_pred_test):
    """Extract onset/wakeup candidates from raw scores by peak picking, without post-processing.

    For each series, peaks of the negated score are reported as ``onset`` and
    peaks of the score itself as ``wakeup`` (height >= 0, at least 8 rows apart).
    The 100000 highest-scoring rows per event are kept, ordered by series and
    step, numbered with ``row_id``, displayed and returned.
    """
    peak_frames = []
    for series_id, series_rows in tqdm(df_pred_test.groupby("series_id")):
        steps = series_rows["step"].values
        for event in ("onset", "wakeup"):
            raw_scores = series_rows["score"].values
            # Onsets show up as troughs, so flip the sign to search them as peaks.
            signed_scores = -raw_scores if event == "onset" else raw_scores

            peak_positions = find_peaks(signed_scores, height=0.0, distance=8)[0]
            df_peak = pd.DataFrame(steps[peak_positions], columns=["step"])
            df_peak["series_id"] = series_id
            df_peak["event"] = event
            df_peak["score"] = signed_scores[peak_positions]
            peak_frames.append(df_peak)

    df_sub = (
        pd.concat(peak_frames)
        .sort_values("score", ascending=False)
        .groupby("event")
        .head(100000)  # avoid Submission Scoring Error
        .sort_values(["series_id", "step"])
        .reset_index(drop=True)
    )
    df_sub = df_sub[["series_id", "step", "event", "score"]].reset_index(names="row_id")
    display(df_sub)
    return df_sub
    
# Candidate events per seed-indexed prediction set (each call also displays its table).
df_sub1 = sub_without_pp(df_pred_test1)
df_sub2 = sub_without_pp(df_pred_test2)

  0%|          | 0/277 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,17,onset,0.035108
1,1,038441c925bb,113,onset,0.010218
2,2,038441c925bb,221,onset,0.010334
3,3,038441c925bb,389,onset,0.002909
4,4,038441c925bb,485,onset,0.001053
...,...,...,...,...,...
199995,199995,fe90110788d2,587477,wakeup,0.001409
199996,199996,fe90110788d2,590693,onset,0.001000
199997,199997,fe90110788d2,591557,onset,0.001777
199998,199998,fe90110788d2,591881,onset,0.002373


  0%|          | 0/277 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,53,wakeup,0.027633
1,1,038441c925bb,185,wakeup,0.016955
2,2,038441c925bb,329,wakeup,0.015118
3,3,038441c925bb,485,wakeup,0.010956
4,4,038441c925bb,665,wakeup,0.012644
...,...,...,...,...,...
199995,199995,fe90110788d2,591725,wakeup,0.003184
199996,199996,fe90110788d2,591881,wakeup,0.004404
199997,199997,fe90110788d2,592097,wakeup,0.022978
199998,199998,fe90110788d2,592193,wakeup,0.045387


In [14]:
# with pp

def sub_with_pp(df_pred_test):
    """Extract onset/wakeup candidates after re-weighting scores with event-time priors.

    Priors are derived from ``train_events.csv`` (read from ``INPUT_DIR``): the
    relative frequency of each event per time of day (smoothed with a wrapped
    60-row rolling mean) and per ``minute % 15``. Each series' scores are blended
    with that series' own per-time mean score and multiplied by both priors, then
    two peak searches are run per event: widely spaced "measure" peaks, whose
    score is boosted and scaled by the series' valid ratio from the notebook-level
    ``dict_valid_ratio_test``, and densely spaced "minor" peaks.

    Note: a ``minute_mod15`` column is added to ``df_pred_test`` in place.

    Parameters
    ----------
    df_pred_test : pandas.DataFrame
        Long score table with ``series_id``, ``time``, ``step`` and ``score``.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id, series_id, step, event, score``; at most 100000 rows
        per event, ordered by series and step. The frame is also displayed.
    """
    # pp setup
    df_events = pd.read_csv(INPUT_DIR + "train_events.csv").dropna()
    df_events["timestamp"] = pd.to_datetime(df_events["timestamp"], utc=True).dt.tz_localize(None)
    df_events["time"] = df_events["timestamp"].dt.time.astype(str)
    df_events["minute_mod15"] = df_events["timestamp"].dt.minute % 15

    # Time-of-day prior: rate is scaled so that a uniform distribution gives 1.0.
    df_agg = df_events.groupby(["time", "event"], as_index=False).size()
    df_agg["rate"] = df_agg["size"] / df_agg.groupby("event")["size"].transform("sum") * (60*24)
    df_time = df_agg.pivot(index="time", columns="event", values="rate").fillna(0).reset_index()
    df_time = df_time.merge(df_pred_test[["time"]].drop_duplicates(), how="right").fillna(0)
    # Tile three copies so the centred rolling mean wraps around midnight,
    # then keep only the middle copy.
    df_time = pd.concat([df_time]*3, ignore_index=True)
    df_time["onset"] = df_time["onset"].rolling(60, center=True).mean()
    df_time["wakeup"] = df_time["wakeup"].rolling(60, center=True).mean()
    df_time = df_time.iloc[60*24:-60*24].reset_index(drop=True)

    # Minute-within-quarter-hour prior (this aggregation used to be computed twice verbatim).
    df_agg = df_events.groupby(["minute_mod15", "event"], as_index=False).size()
    df_agg["rate"] = df_agg["size"] / df_agg.groupby("event")["size"].transform("sum") * 15
    df_minute = df_agg.pivot(index="minute_mod15", columns="event", values="rate").reset_index()

    # Clip and flatten both priors so they only nudge the scores.
    df_time[["onset", "wakeup"]] = df_time[["onset", "wakeup"]].clip(0.1, 1.1) ** 0.13
    df_minute[["onset", "wakeup"]] = df_minute[["onset", "wakeup"]].clip(0.5, 1.3) ** 0.06

    # "HH:MM:SS" -> minute component modulo 15 (mutates the caller's frame).
    df_pred_test["minute_mod15"] = df_pred_test["time"].str[3:5].astype(int) % 15

    list_df = []
    for series_id, df in tqdm(df_pred_test.groupby("series_id")):
        # After both merges the time prior columns carry suffix _x and the minute prior _y.
        df = df.merge(df_time, how="left", on="time")
        df = df.merge(df_minute, how="left", on="minute_mod15")

        # Per-time mean score of this series (zeros ignored), smoothed with a
        # wrapped 90-row rolling mean.
        df_tmp = df.copy()
        df_tmp["score"] = df_tmp["score"].replace(0.0, np.nan)
        df_tmp = df_tmp.groupby("time")["score"].mean()
        df_tmp = pd.concat([df_tmp]*3).rolling(90, center=True, min_periods=1).mean()
        df_tmp = df_tmp.iloc[60*24:-60*24].reset_index().rename({"score": "score_mean"}, axis=1)
        df = df.merge(df_tmp, on="time", how="left")

        df["score"] = 0.9*df["score"] + 0.1*df["score_mean"]
        # Positive scores are wakeup evidence, negative ones onset evidence.
        df["score"] *= np.where(df["score"]>0, df["wakeup_x"], df["onset_x"])
        df["score"] *= np.where(df["score"]>0, df["wakeup_y"], df["onset_y"])
        valid_ratio = dict_valid_ratio_test[series_id]

        values_step = df["step"].values
        for event in ["onset", "wakeup"]:
            if event == "onset":
                values_score = -df["score"].values
            else:
                values_score = df["score"].values

            # measure peaks
            peak_idx = find_peaks(values_score, height=0.04, distance=60*16)[0]  # at least 16 hours interval
            df_measure_peak = pd.DataFrame(values_step[peak_idx], columns=["step"])
            df_measure_peak["series_id"] = series_id
            df_measure_peak["event"] = event
            df_measure_peak["score"] = values_score[peak_idx] * 4 * valid_ratio**0.15

            # minor peaks
            peak_idx = find_peaks(values_score, height=0.0, distance=6)[0]
            df_minor_peak = pd.DataFrame(values_step[peak_idx], columns=["step"])
            df_minor_peak["series_id"] = series_id
            df_minor_peak["event"] = event
            df_minor_peak["score"] = values_score[peak_idx]

            # Measure peaks come first, so they win when the same step appears in both sets.
            df_peak = pd.concat([df_measure_peak, df_minor_peak]).drop_duplicates(subset=["step"])
            list_df.append(df_peak)

    df_sub = pd.concat(list_df)
    df_sub = df_sub.sort_values("score", ascending=False).groupby("event").head(100000)  # avoid Submission Scoring Error
    df_sub = df_sub.sort_values(["series_id", "step"]).reset_index(drop=True)
    df_sub = df_sub[["series_id", "step", "event", "score"]].reset_index(names="row_id")
    display(df_sub)

    return df_sub

# df_sub1 = sub_with_pp(df_pred_test1)
# df_sub2 = sub_with_pp(df_pred_test2)

In [15]:
# df_result = pd.read_csv("df_sub_with_pp.csv")
# Ground-truth events; rows with any missing value are dropped before `step` is cast to int.
df_true = pd.read_csv(INPUT_DIR + "train_events.csv").dropna()
df_true['step'] = df_true['step'].astype(int)
df_true = df_true[['series_id', 'night', 'event', 'step']].copy()
# Candidate events from sub_without_pp(), one frame per seed-indexed prediction set.
df_result1 = df_sub1[['series_id', 'event', 'step', 'score']].copy()
df_result2 = df_sub2[['series_id', 'event', 'step', 'score']].copy()

In [16]:
# Error thresholds (in steps), ascending, and the correctness value paired with each
# threshold when the list is walked from the LARGEST tolerance down to the smallest.
tolerances = [0, 12, 36, 60, 90, 120, 150, 180, 240, 300]
correctness = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

def categorize_error(error, tolerances):
    """Map an absolute step error to ``(label, correctness)``.

    The tolerances are scanned from largest to smallest and the first one that
    ``error`` strictly exceeds determines the bucket, e.g. 235 -> ('>180 steps', 0.3).
    An error that does not exceed the smallest tolerance (an exact hit when that
    tolerance is 0) falls into the best bucket with a '<=' label; previously this
    case returned ``None`` and the caller crashed when subscripting the result.

    Returns ``None`` only when ``error`` cannot be compared against the
    tolerances at all (e.g. NaN) or ``tolerances`` is empty.
    """
    descending = list(reversed(tolerances))
    for idx, tolerance in enumerate(descending):
        if error > tolerance:
            return (f'>{tolerance} steps', correctness[idx])

    if descending and error <= descending[-1]:
        return (f'<={descending[-1]} steps', correctness[len(descending) - 1])
    return None
        
def get_grouped_dfs(df):
    """Split ``df`` by ``series_id`` and return the pieces as a list of sorted frames.

    Each piece is ordered by night (ascending), event (ascending, so "onset"
    precedes "wakeup") and score (descending). Original row labels are kept.
    """
    sort_columns = ['night', 'event', 'score']
    sort_ascending = [True, True, False]
    return [
        series_rows.sort_values(sort_columns, ascending=sort_ascending)
        for _, series_rows in df.groupby('series_id')
    ]

def calculate_top_6_score_and_target(df):
    """Keep the six highest-scoring rows per (night, event) and add running-best columns.

    ``top_6_score`` is the cumulative maximum of ``correctness`` within each
    (night, event) group when rows are visited from highest to lowest score;
    ``target`` is 1 where that running maximum exceeds 0.5, else 0.
    """
    ordered = df.sort_values(by=['night', 'event', 'score'], ascending=[True, True, False])
    by_night_event = ordered.groupby(['night', 'event'])

    # Running best correctness in score order; aligned back onto the kept rows by index label.
    running_best = by_night_event['correctness'].cummax()

    top_6_df = by_night_event.head(6).copy()
    top_6_df['top_6_score'] = running_best
    top_6_df['target'] = (top_6_df['top_6_score'] > 0.5).astype(int)

    return top_6_df

def make_dataset(df_true, df_result):
    """Pair every ground-truth event with the predicted events within +/-360 steps.

    Parameters
    ----------
    df_true : pandas.DataFrame
        Ground truth with ``series_id``, ``night``, ``event`` and ``step``.
        The frame is only read; it is no longer modified in place.
    df_result : pandas.DataFrame
        Predicted events with ``series_id``, ``event``, ``step`` and ``score``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per series holding the top-6 matches per (night, event) with
        ``error``, ``error_category``, ``correctness``, ``top_6_score``,
        ``target``, ``daily_step`` and ``top_6_score_diff`` columns. Empty list
        when no prediction falls inside any window.
    """
    window = 360  # half-width of the matching window, in steps

    # Split the predictions once per (series_id, event) so each truth row only
    # inspects its own candidates instead of scanning the whole result frame.
    preds_by_key = {
        key: rows for key, rows in df_result.groupby(['series_id', 'event'], sort=False)
    }

    matched_results = []
    truth_rows = zip(df_true['series_id'], df_true['event'], df_true['step'], df_true['night'])
    for series_id, event, step_true, night in tqdm(truth_rows, total=len(df_true)):
        candidates = preds_by_key.get((series_id, event))
        if candidates is None:
            continue

        # Inclusive on both ends, same as step_min <= step <= step_max.
        in_window = candidates['step'].between(step_true - window, step_true + window)
        hits = candidates.loc[in_window]
        for step_pred, score in zip(hits['step'], hits['score']):
            matched_results.append({
                'series_id': series_id,
                'event': event,
                'step_true': step_true,
                'step_pred': step_pred,
                'score': score,
                'night': night
            })

    if not matched_results:
        # Nothing matched: the column arithmetic below would fail on an empty frame.
        return []

    df_matched_results = pd.DataFrame(matched_results)
    df_matched_results['error'] = (df_matched_results['step_pred'] - df_matched_results['step_true']).abs()
    # Categorise each error once and split the (label, correctness) pairs into two columns.
    categorized = [categorize_error(error, tolerances) for error in df_matched_results['error']]
    df_matched_results['error_category'] = [pair[0] for pair in categorized]
    df_matched_results['correctness'] = [pair[1] for pair in categorized]

    grouped_dfs = get_grouped_dfs(df_matched_results)

    final_dfs = []
    for df in grouped_dfs:
        df_new = calculate_top_6_score_and_target(df)
        # Position of the predicted step within a day of 12*60*24 steps, scaled to [0, 1).
        df_new['daily_step'] = df_new['step_pred'] % (12*60*24) / (12*60*24)
        # Gain in running-best correctness contributed by each row; the first row of
        # each true event keeps its own top_6_score.
        df_new['top_6_score_diff'] = df_new.groupby('step_true')['top_6_score'].diff()
        df_new['top_6_score_diff'] = df_new['top_6_score_diff'].fillna(df_new['top_6_score'])
        final_dfs.append(df_new.reset_index(drop=True))

    return final_dfs

# Matched truth/prediction tables (one frame per series) for each prediction set.
final_dfs1 = make_dataset(df_true, df_result1)
final_dfs2 = make_dataset(df_true, df_result2)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [17]:
# Window lengths (in steps) used for the before/after score averages.
lengthlist = [12, 24, 60, 120, 240, 360, 720]
score_keys = ['score']

def calculate_window_features(group, df_pred):
    """Add mean-score features for the windows before and after each predicted step.

    For every window length in ``lengthlist`` two columns are written to
    ``group`` in place: ``before_states_feat_<length>`` is the mean of
    ``df_pred['score']`` over ``[step - length, step)`` and
    ``after_states_feat_<length>`` the mean over ``(step, step + length]``,
    where ``step`` is the row's ``step_pred``. Returns the same ``group`` object.
    """
    anchors = group['step_pred'].values

    for length in lengthlist:
        preceding = [
            df_pred.loc[(df_pred['step'] < anchor) & (df_pred['step'] >= anchor - length), 'score'].mean()
            for anchor in anchors
        ]
        following = [
            df_pred.loc[(df_pred['step'] > anchor) & (df_pred['step'] <= anchor + length), 'score'].mean()
            for anchor in anchors
        ]

        group[f'before_states_feat_{length}'] = preceding
        group[f'after_states_feat_{length}'] = following

    return group

def df_before(final_dfs, df_pred_test):
    """Attach before/after window features to every per-series frame and stack them.

    Parameters
    ----------
    final_dfs : list of pandas.DataFrame
        Per-series frames; each must contain ``series_id`` and ``step_pred``.
        They are extended in place by ``calculate_window_features``.
    df_pred_test : pandas.DataFrame
        Long score table with ``series_id``, ``step`` and ``score``.

    Returns
    -------
    pandas.DataFrame
        All enriched frames concatenated with a fresh index (an empty frame
        when ``final_dfs`` is empty). The frame is also displayed.
    """
    enriched = []
    for group in tqdm(final_dfs):
        # Every row of a frame belongs to the same series, so the first id is enough.
        series_id = group['series_id'].iloc[0]
        matched_df_pred = df_pred_test[df_pred_test['series_id'] == series_id]
        enriched.append(calculate_window_features(group, matched_df_pred))

    # Concatenate once at the end: growing the frame inside the loop re-copied
    # all previously accumulated rows on every iteration.
    result_df = pd.concat(enriched, axis=0) if enriched else pd.DataFrame()

    result_df = result_df.reset_index(drop=True)
    display(result_df)
    return result_df

# Window features for each prediction set (each call also displays its table).
result_df1 = df_before(final_dfs1, df_pred_test1)
result_df2 = df_before(final_dfs2, df_pred_test2)

  0%|          | 0/268 [00:00<?, ?it/s]

Unnamed: 0,series_id,event,step_true,step_pred,score,night,error,error_category,correctness,top_6_score,...,before_states_feat_60,after_states_feat_60,before_states_feat_120,after_states_feat_120,before_states_feat_240,after_states_feat_240,before_states_feat_360,after_states_feat_360,before_states_feat_720,after_states_feat_720
0,038441c925bb,onset,4992,4997,0.769066,1,5,>0 steps,1.0,1.0,...,-0.521444,-0.386452,-0.318329,-0.227040,-0.164331,-0.114996,-0.109655,-0.076079,-0.054925,-0.038640
1,038441c925bb,onset,4992,4757,0.003897,1,235,>180 steps,0.3,1.0,...,0.000383,-0.002390,-0.000304,-0.014413,-0.000198,-0.202589,-0.000112,-0.210739,0.000145,-0.105794
2,038441c925bb,wakeup,10932,10925,0.842525,1,7,>0 steps,1.0,1.0,...,0.394206,0.491188,0.240096,0.288312,0.126811,0.148402,0.084508,0.100649,0.042589,0.051945
3,038441c925bb,wakeup,10932,11285,0.013798,1,353,>300 steps,0.1,1.0,...,0.007017,0.008946,0.003831,0.005388,0.007570,0.002801,0.128274,0.003242,0.106391,0.002539
4,038441c925bb,wakeup,10932,11105,0.011673,1,173,>150 steps,0.4,1.0,...,0.017926,0.002490,0.072459,0.001659,0.287873,0.005431,0.209956,0.003972,0.105665,0.003220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32641,fe90110788d2,onset,574620,574457,0.022969,34,163,>150 steps,0.4,1.0,...,-0.007172,-0.030351,-0.002560,-0.100532,0.000303,-0.244834,0.000411,-0.182024,-0.001595,-0.096546
32642,fe90110788d2,wakeup,581604,581597,0.703304,34,7,>0 steps,1.0,1.0,...,0.355643,0.399337,0.220630,0.235548,0.115968,0.140884,0.077888,0.099171,0.038919,0.049673
32643,fe90110788d2,wakeup,581604,581765,0.061467,34,161,>150 steps,0.4,1.0,...,0.044661,0.047564,0.090918,0.035434,0.255340,0.021384,0.183419,0.014223,0.092084,0.007159
32644,fe90110788d2,wakeup,581604,581897,0.019712,34,293,>240 steps,0.2,1.0,...,0.023304,0.010003,0.035434,0.005153,0.054252,0.002673,0.179564,0.001890,0.099143,0.001010


  0%|          | 0/268 [00:00<?, ?it/s]

Unnamed: 0,series_id,event,step_true,step_pred,score,night,error,error_category,correctness,top_6_score,...,before_states_feat_60,after_states_feat_60,before_states_feat_120,after_states_feat_120,before_states_feat_240,after_states_feat_240,before_states_feat_360,after_states_feat_360,before_states_feat_720,after_states_feat_720
0,038441c925bb,onset,4992,4997,0.788360,1,5,>0 steps,1.0,1.0,...,-0.548552,-0.423773,-0.337613,-0.250828,-0.175935,-0.131120,-0.116636,-0.089170,-0.057733,-0.045584
1,038441c925bb,onset,4992,4793,0.013220,1,199,>180 steps,0.3,1.0,...,-0.001206,-0.015643,0.000414,-0.051599,0.000010,-0.295131,0.000537,-0.227788,0.001166,-0.116366
2,038441c925bb,onset,4992,5201,0.010572,1,209,>180 steps,0.3,1.0,...,-0.010849,-0.008577,-0.032260,-0.006988,-0.268215,-0.004939,-0.227635,-0.003895,-0.114387,-0.002791
3,038441c925bb,onset,4992,5321,0.005658,1,329,>300 steps,0.1,1.0,...,-0.005658,-0.003265,-0.007479,-0.002890,-0.019869,-0.002348,-0.181303,-0.002129,-0.115820,-0.001856
4,038441c925bb,wakeup,10932,10925,0.763843,1,7,>0 steps,1.0,1.0,...,0.395359,0.431007,0.234070,0.249460,0.118691,0.127336,0.077894,0.084487,0.038033,0.045149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33704,fe90110788d2,onset,574620,574925,0.043047,34,305,>300 steps,0.1,1.0,...,-0.045063,-0.030511,-0.045165,-0.023477,-0.064831,-0.014467,-0.182505,-0.010427,-0.101595,-0.006041
33705,fe90110788d2,onset,574620,574457,0.034788,34,163,>150 steps,0.4,1.0,...,-0.017458,-0.036429,-0.008511,-0.100992,-0.002933,-0.249960,-0.002609,-0.187880,-0.002283,-0.105616
33706,fe90110788d2,wakeup,581604,581597,0.675653,34,7,>0 steps,1.0,1.0,...,0.362087,0.371103,0.224349,0.231791,0.113017,0.139787,0.074637,0.100341,0.036128,0.051260
33707,fe90110788d2,wakeup,581604,581801,0.053962,34,197,>180 steps,0.3,1.0,...,0.050065,0.034504,0.063823,0.028303,0.234879,0.018066,0.186076,0.013080,0.091436,0.006665


In [18]:
def make_features(result_df):
    """
    Build the second-stage feature table from a frame of candidate events.

    The input is copied before any column is added, so the caller's frame is
    left untouched and the function can be called repeatedly on the same input.

    Parameters
    ----------
    result_df : pd.DataFrame
        Needs the columns ``event`` ("onset"/"wakeup" strings, or already 0/1),
        ``daily_step`` and ``before_states_feat_{n}`` / ``after_states_feat_{n}``
        for n in 12, 24, 60, 120, 240, 360, 720.  The score-key sections
        (disabled while ``score_keys`` is empty) also read ``series_id`` and
        ``night``.

    Returns
    -------
    pd.DataFrame
        Copy of ``result_df`` with ``event`` encoded as 1 for wakeup and 0
        otherwise, plus ``daily_step_sleep`` and one ``state_diff_{n}`` column
        per window length.

    Idea, not implemented: some of these steps might be better applied only to
    the top-scoring rows.
    """
    # Copy first: the original aliased the argument and silently rewrote the
    # caller's frame (event strings -> ints, extra columns).
    df = result_df.copy()

    # Score-like columns to derive per-night statistics from.  Empty by default,
    # which turns every `for key in score_keys` section below into a no-op.
    # score_keys = ['score']
    score_keys = []
    sne_keys = ["series_id", "night", "event"]

    # The `== 1` branch keeps frames that were already encoded unchanged.
    df["event"] = ((df["event"] == "wakeup") | (df["event"] == 1)).astype(int)
    # daily_step for night: shifted by 0.5 and wrapped back into [0, 1)
    df["daily_step_sleep"] = (df["daily_step"] + 0.5) % 1

    # change of state between before and after
    lengthlist = [12, 24, 60, 120, 240, 360, 720]
    for length in lengthlist:
        df[f"state_diff_{length}"] = df[f"before_states_feat_{length}"] - df[f"after_states_feat_{length}"]
        # df[f"nan_diff_{length}"] = df[f"before_nan_feat_{length}"] - df[f"after_nan_feat_{length}"]

    # largest score per (series, night, event)
    for key in score_keys:
        df[f"max_{key}_sne"] = df.groupby(sne_keys)[key].transform("max")
        df[f"max_{key}_sne_diff"] = df[f"max_{key}_sne"] - df[key]
        df[f"max_{key}_sne_is_peak"] = (df[f"max_{key}_sne_diff"] == 0).astype(int)
        df[f"sum_{key}_sne"] = df.groupby(sne_keys)[key].transform("sum")
        df[f"mean_{key}_sne"] = df.groupby(sne_keys)[key].transform("mean")

    # largest score per (series, night), regardless of event
    for key in score_keys:
        df[f"max_{key}_sn"] = df.groupby(["series_id", "night"])[key].transform("max")
        df[f"max_{key}_sn_diff"] = df[f"max_{key}_sn"] - df[key]
        # df[f"max_{key}_sn_rel"] = df[f"max_{key}_sn_diff"] / df[f"max_{key}_sn"]

    # score expressed relative to the distribution of nightly peaks of the series
    for key in score_keys:
        df_peak = df[df[f"max_{key}_sne_is_peak"] == 1]

        df_peak = df_peak.groupby(["series_id", "event"])[f"max_{key}_sne"].agg(["mean", "std"]).reset_index()
        df_peak.columns = ["series_id", "event", f"max_{key}_sne_mean", f"max_{key}_sne_std"]
        df = df.merge(df_peak, on=["series_id", "event"], how="left")
        # normalize
        df[f"{key}_relative_to_peak"] = (df[key] - df[f"max_{key}_sne_mean"]) / df[f"max_{key}_sne_std"]

    # daily_step at peak
    for key in score_keys:
        df_peak = df[df[f"max_{key}_sne_is_peak"] == 1].copy()
        df_peak = df_peak.rename(columns={"daily_step": f"peak_daily_step_{key}", "daily_step_sleep": f"peak_daily_step_sleep_{key}"})
        df_peak[f"peak_daily_step_{key}_mean"] = df_peak.groupby(["series_id", "event"])[f"peak_daily_step_{key}"].transform("mean")
        # Restricting this mean to high-score peaks only might be worth trying.
        df_peak[f"peak_daily_step_sleep_{key}_mean"] = df_peak.groupby(["series_id", "event"])[f"peak_daily_step_sleep_{key}"].transform("mean")
        # To be done after the flip step below:
        # df_peak[f"peak_daily_step_{key}_mean_sleep"] = df_peak[f"peak_daily_step_{key}_mean"] + 0.5 - df_peak[f"peak_daily_step_sleep_{key}_mean"]

        df = df.merge(df_peak[["series_id", "night", "event", f"peak_daily_step_{key}", f"peak_daily_step_sleep_{key}", f"peak_daily_step_{key}_mean", f"peak_daily_step_sleep_{key}_mean"]], on=["series_id", "night", "event"], how="left")
        df[f"step_dist_from_peak_{key}"] = df["daily_step"] - df[f"peak_daily_step_{key}"]
        df[f"step_dist_from_peak_sleep_{key}"] = df["daily_step_sleep"] - df[f"peak_daily_step_sleep_{key}"]
        df[f"step_dist_from_peak_{key}_mean"] = df["daily_step"] - df[f"peak_daily_step_{key}_mean"]
        df[f"step_dist_from_peak_sleep_{key}_mean"] = df["daily_step_sleep"] - df[f"peak_daily_step_sleep_{key}_mean"]

    # opposite event at same night (disabled experiment)
    # df_flip = df.copy()
    # flip_columns = [f"max_{key}_sne" for key in score_keys]+ [f"sum_{key}_sne" for key in score_keys] + [f"peak_daily_step_{key}" for key in score_keys] + [f"peak_daily_step_sleep_{key}" for key in score_keys]
    # df_flip["event"] = 1 - df_flip["event"] #.apply(lambda x: "onset" if x == "wakeup" else "wakeup")
    # df_flip = df_flip.groupby(["series_id", "event", "night"])[flip_columns].max().reset_index()
    # df_flip.columns = ["series_id", "event", "night"] + [f"{c}_flip" for c in flip_columns]
    # df = df.merge(df_flip, on=["series_id", "night", "event"], how="left")

    # for key in score_keys:
    #     df[f"peak_daily_step_{key}_sleep_duration_01"] = df["daily_step"] + 0.5 - df[f"peak_daily_step_sleep_{key}_flip"]
    #     df[f"peak_daily_step_{key}_sleep_duration_10"] = df[f"peak_daily_step_{key}_flip"] + 0.5 - df["daily_step_sleep"]

    for key in score_keys:
        df[f"rank_{key}_sne"] = df.groupby(sne_keys)[key].transform("rank")

    # sort by score -> accumulated score (disabled experiment)
    ###
    # df = df.sort_values(["series_id", "night", "event", "score"], ascending=False).reset_index(drop=True)
    # for key in score_keys:
    #     df[f"cumsum_{key}_sne"] = df.groupby(["series_id", "night", "event"])[key].transform("cumsum")
    ###
        # score diff
        # df[f"diff_{key}_sne"] = df.groupby(["series_id", "night", "event"])[key].diff()

        # Also compute a cumulative product, in the spirit of an opacity
        # (non-transmittance) value.
        # max_val = df[key].max()
        # max_val = df.groupby(["series_id", "night", "event"])[key].transform("max")
    #     df["notpass"] = np.clip(df[key]/df[f"max_{key}_se"], 0, 1)
    #     df["pass"] = 1- df["notpass"]
    #     df[f"cumprod_{key}_sne_max"] = df.groupby(["series_id", "night", "event"])["pass"].transform("cumprod")
    #     df[f"cumprod_{key}_sne_max"] = - df.groupby(["series_id", "night", "event"])[f"cumprod_{key}_sne_max"].diff()
    #     df[f"cumprod_{key}_sne_max"] = df[f"cumprod_{key}_sne_max"].fillna(df["notpass"])

    # df = df.drop(columns=["notpass", "pass"])

    return df

In [19]:
# Second-stage feature tables, one per candidate frame (built in order: 1, then 2).
df_final1, df_final2 = (make_features(frame) for frame in (result_df1, result_df2))

In [20]:
# Columns that are removed before the training phase.
# Left in on purpose for now: 'daily_step' (and, when the score-key features are
# enabled, 'max_score_sne_std' / 'score_relative_to_peak').
drop_cols = [
    'series_id', 'step_true', 'step_pred',
    'night', 'error', 'error_category',
    'top_6_score', 'target', 'top_6_score_diff',
]

df_final1, df_final2 = (
    frame.drop(columns=drop_cols) for frame in (df_final1, df_final2)
)

In [21]:
# Show which columns remain in the first training table.
df_final1.columns

Index(['event', 'score', 'correctness', 'daily_step', 'before_states_feat_12',
       'after_states_feat_12', 'before_states_feat_24', 'after_states_feat_24',
       'before_states_feat_60', 'after_states_feat_60',
       'before_states_feat_120', 'after_states_feat_120',
       'before_states_feat_240', 'after_states_feat_240',
       'before_states_feat_360', 'after_states_feat_360',
       'before_states_feat_720', 'after_states_feat_720', 'daily_step_sleep',
       'state_diff_12', 'state_diff_24', 'state_diff_60', 'state_diff_120',
       'state_diff_240', 'state_diff_360', 'state_diff_720'],
      dtype='object')

In [22]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder



# LightGBM settings shared by every fold.  No learning_rate entry here: the
# training loop sets it per iteration through a reset_parameter callback
# (a fixed 'learning_rate': 0.001 was tried and left disabled).
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbosity=-1,
    seed=42,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

def training(df_final, version, n_splits=10):
    """Fit one LightGBM regressor per K-fold split and return out-of-fold predictions.

    Parameters
    ----------
    df_final : pd.DataFrame
        Feature table holding the ``correctness`` target and an ``event``
        column; every other column is used as a float32 feature.
    version : int or str
        Tag inserted into the saved model file names
        (``gbm_{version}_{fold}.bin``).
    n_splits : int, default 10
        Number of shuffled K-fold splits (``random_state=42``).

    Returns
    -------
    np.ndarray
        float64 out-of-fold prediction for every row of ``df_final``, in row
        order.

    Side effects
    ------------
    Prints per-fold and overall RMSE and writes one model file per fold to the
    current working directory.
    """
    X = df_final.drop(['correctness'], axis=1)
    y = df_final['correctness']
    # Explicit float buffer: np.zeros_like(y) would inherit the label dtype and
    # silently truncate the predictions if `correctness` were ever integer-typed.
    y_pred = np.zeros(len(y), dtype=np.float64)

    X = X.astype('float32')
    # Cast after the float32 conversion so LightGBM treats `event` as categorical.
    X['event'] = X['event'].astype('category')

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
        print(f"Fold: {fold}")
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        callbacks = [
            lgb.log_evaluation(period=50),
            lgb.early_stopping(stopping_rounds=100),
            # Learning rate starts at 0.01 and decays exponentially per iteration.
            lgb.reset_parameter(learning_rate=lambda i: 0.01 * (np.exp(-0.0001*i))),
        ]

        # The validation fold drives early stopping and is also the fold that is
        # scored below, so the reported RMSE is measured on the early-stopping set.
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=2000,
                        valid_sets=[lgb_eval],
                        callbacks=callbacks,
                        )

        y_pred[idx_valid] = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
        rmse_fold = np.sqrt(mean_squared_error(y_valid, y_pred[idx_valid]))
        gbm.save_model(f'gbm_{version}_{fold}.bin')
        print(f"Fold RMSE: {rmse_fold}")
        print()

    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print(f"Final RMSE: {rmse}")

    return y_pred

In [23]:
# Out-of-fold predictions of the second-stage regressor for each training table.
# The second argument is the version tag used in the saved model file names.
y_pred1 = training(df_final1, 1)
y_pred2 = training(df_final2, 2)

Fold: 0
[LightGBM] [Info] Start training from score 0.469630
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.23278
[100]	valid_0's rmse: 0.181179
[150]	valid_0's rmse: 0.156668
[200]	valid_0's rmse: 0.14566
[250]	valid_0's rmse: 0.140909
[300]	valid_0's rmse: 0.138642
[350]	valid_0's rmse: 0.137728
[400]	valid_0's rmse: 0.137246
[450]	valid_0's rmse: 0.136927
[500]	valid_0's rmse: 0.136681
[550]	valid_0's rmse: 0.136538
[600]	valid_0's rmse: 0.136459
[650]	valid_0's rmse: 0.13641
[700]	valid_0's rmse: 0.13638
[750]	valid_0's rmse: 0.136336
[800]	valid_0's rmse: 0.136356
[850]	valid_0's rmse: 0.136312
[900]	valid_0's rmse: 0.136294
[950]	valid_0's rmse: 0.136311
Early stopping, best iteration is:
[878]	valid_0's rmse: 0.13627
Fold RMSE: 0.136270268169213

Fold: 1
[LightGBM] [Info] Start training from score 0.470859
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.232878
[100]	valid_0's rmse: 0.183768
[150]	valid_0'

In [24]:
# Quick look at both out-of-fold prediction vectors, one per line.
print(y_pred1, y_pred2, sep="\n")

[0.93965877 0.27261073 0.97643974 ... 0.40261806 0.2124038  0.1168746 ]
[0.94702848 0.32435837 0.29447565 ... 0.93203921 0.37443832 0.2215861 ]


In [30]:
def comb_with_stage1model(result_df, y_pred):
    """Rescale first-stage scores by the second-stage predictions, keeping ``night``.

    Runs ``make_features`` on ``result_df``, keeps series_id / event / night /
    step_pred / score, turns the 0/1 event code back into 'onset'/'wakeup',
    multiplies ``score`` by ``y_pred`` and returns the frame with ``step_pred``
    renamed to an int64 ``step`` column.
    """
    feature_frame = make_features(result_df)
    kept_columns = ['series_id', 'event', 'night', 'step_pred', 'score']
    combined = feature_frame[kept_columns].rename(columns={"step_pred": "step"})
    combined['event'] = combined['event'].map(lambda code: 'wakeup' if code == 1 else 'onset')
    combined['score'] = combined['score'] * y_pred
    combined['step'] = combined['step'].astype('int64')
    return combined

def comb_with_stage1model_without_night(result_df, y_pred):
    """Rescale first-stage scores by the second-stage predictions, without ``night``.

    Runs ``make_features`` on ``result_df``, keeps series_id / event /
    step_pred / score, turns the 0/1 event code back into 'onset'/'wakeup',
    multiplies ``score`` by ``y_pred``, renames ``step_pred`` to an int64
    ``step`` column, displays the result and returns it.
    """
    feature_frame = make_features(result_df)
    kept_columns = ['series_id', 'event', 'step_pred', 'score']
    combined = feature_frame[kept_columns].rename(columns={"step_pred": "step"})
    combined['event'] = combined['event'].map(lambda code: 'wakeup' if code == 1 else 'onset')
    combined['score'] = combined['score'] * y_pred
    combined['step'] = combined['step'].astype('int64')
    display(combined)
    return combined

# Night-aware alternative, currently disabled:
# df1 = comb_with_stage1model(result_df1, y_pred1)
# df2 = comb_with_stage1model(result_df2, y_pred2)
df1, df2 = (
    comb_with_stage1model_without_night(candidates, stage2_pred)
    for candidates, stage2_pred in ((result_df1, y_pred1), (result_df2, y_pred2))
)

Unnamed: 0,series_id,event,step,score
0,038441c925bb,onset,4997,0.722660
1,038441c925bb,onset,4757,0.001062
2,038441c925bb,wakeup,10925,0.822675
3,038441c925bb,wakeup,11285,0.001636
4,038441c925bb,wakeup,11105,0.004229
...,...,...,...,...
32641,fe90110788d2,onset,574457,0.008976
32642,fe90110788d2,wakeup,581597,0.660669
32643,fe90110788d2,wakeup,581765,0.024748
32644,fe90110788d2,wakeup,581897,0.004187


Unnamed: 0,series_id,event,step,score
0,038441c925bb,onset,4997,0.746600
1,038441c925bb,onset,4793,0.004288
2,038441c925bb,onset,5201,0.003113
3,038441c925bb,onset,5321,0.000644
4,038441c925bb,wakeup,10925,0.748880
...,...,...,...,...
33704,fe90110788d2,onset,574925,0.008282
33705,fe90110788d2,onset,574457,0.013647
33706,fe90110788d2,wakeup,581597,0.629735
33707,fe90110788d2,wakeup,581801,0.020205


In [26]:
# df1/df2 are produced by comb_with_stage1model_without_night and therefore have
# no 'night' column, so sorting (and later grouping) them by 'night' raises
# KeyError when the notebook is run top to bottom.  Build the night-aware
# frames here instead.
df_combined = pd.concat(
    [
        comb_with_stage1model(result_df1, y_pred1),
        comb_with_stage1model(result_df2, y_pred2),
    ]
).sort_values(by=['series_id', 'step', 'night'])

def aggregate_scores(group):
    """Reduce one group to a single-entry Series holding its highest ``score``."""
    return pd.Series({'score': group['score'].max()})

# Ensemble the two candidate sets: keep the highest score for every
# (series_id, step, event, night).  A direct groupby-max yields the same frame
# as .apply(aggregate_scores).reset_index() without calling a Python function
# per group and without the groupby.apply deprecation warning.
df_ensembled = df_combined.groupby(['series_id', 'step', 'event', 'night'], as_index=False)['score'].max()
# df_ensembled.to_csv('submission.csv')
df_ensembled

  df_ensembled = df_combined.groupby(['series_id', 'step', 'event', 'night']).apply(aggregate_scores).reset_index()


Unnamed: 0,series_id,step,event,night,score
0,038441c925bb,4757,onset,1,0.001062
1,038441c925bb,4793,onset,1,0.004288
2,038441c925bb,4997,onset,1,0.746600
3,038441c925bb,5201,onset,1,0.003113
4,038441c925bb,5321,onset,1,0.000644
...,...,...,...,...,...
53090,fe90110788d2,581273,wakeup,34,0.000373
53091,fe90110788d2,581597,wakeup,34,0.660669
53092,fe90110788d2,581765,wakeup,34,0.024748
53093,fe90110788d2,581801,wakeup,34,0.020205


In [31]:
df_combined_test = pd.concat([df1, df2]).sort_values(by=['series_id', 'step'])
# Highest score per (series_id, step, event).  A direct groupby-max replaces
# .apply(aggregate_scores): same result, no per-group Python call and no
# groupby.apply deprecation warning.  The fresh RangeIndex then becomes row_id.
df_ensembled_test = (
    df_combined_test
    .groupby(['series_id', 'step', 'event'], as_index=False)['score'].max()
    .reset_index()
    .head(100000)
    .rename({"index": "row_id"}, axis=1)
)
# df_ensembled_test.to_csv('submission.csv')
df_ensembled_test

  df_ensembled_test = df_combined_test.groupby(['series_id', 'step', 'event']).apply(aggregate_scores).reset_index().reset_index().head(100000).rename({"index":"row_id"}, axis=1)


Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,4757,onset,0.001062
1,1,038441c925bb,4793,onset,0.004288
2,2,038441c925bb,4997,onset,0.746600
3,3,038441c925bb,5201,onset,0.003113
4,4,038441c925bb,5321,onset,0.000644
...,...,...,...,...,...
53090,53090,fe90110788d2,581273,wakeup,0.000373
53091,53091,fe90110788d2,581597,wakeup,0.660669
53092,53092,fe90110788d2,581765,wakeup,0.024748
53093,53093,fe90110788d2,581801,wakeup,0.020205


In [27]:
# Order candidates so that, within every (series_id, night, event), the
# highest-scoring one comes first.
df_ensembled_sorted = df_ensembled.sort_values(by=['series_id', 'night', 'event', 'score'], ascending=[True, True, True, False])

# Keep only that best candidate per (series_id, night, event), then sort for
# readability and drop the helper 'night' column.
df_final_sub = (
    df_ensembled_sorted
    .drop_duplicates(subset=['series_id', 'night', 'event'], keep='first')
    .sort_values(by=['series_id', 'night', 'step'])
    .reset_index(drop=True)
    .drop(['night'], axis=1)
)
# Name the index column in the file: without index_label the CSV gets an empty
# header that reads back as 'Unnamed: 0', whereas the other submission frames in
# this notebook carry an explicit 'row_id'.
df_final_sub.to_csv('submission.csv', index_label='row_id')
df_final_sub

Unnamed: 0,series_id,step,event,score
0,038441c925bb,4997,onset,0.746600
1,038441c925bb,10925,wakeup,0.822675
2,038441c925bb,20249,onset,0.472503
3,038441c925bb,27437,wakeup,0.248565
4,038441c925bb,39989,onset,0.703679
...,...,...,...,...
9536,fe90110788d2,547145,wakeup,0.384191
9537,fe90110788d2,556577,onset,0.656204
9538,fe90110788d2,560873,wakeup,0.381279
9539,fe90110788d2,574613,onset,0.591885


---

# Scoring

In [28]:
from bisect import bisect_left
from typing import Dict, List, Tuple


class ParticipantVisibleError(Exception):
    """Raised for submission-format problems the metric reports back to the submitter."""


# Module-level defaults for the metric; score() overwrites the column names and
# the interval flag with whatever it is called with.
series_id_column_name, time_column_name = "series_id", "step"
event_column_name, score_column_name = "event", "score"
use_scoring_intervals = False
# Same tolerance ladder (in steps) for both event classes, one list per class.
tolerances = {
    event_class: [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    for event_class in ("onset", "wakeup")
}


def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    tolerances: Dict[str, List[float]],
    series_id_column_name: str,
    time_column_name: str,
    event_column_name: str,
    score_column_name: str,
    use_scoring_intervals: bool = False,
    verbose: bool = True,
) -> Tuple[float, pd.DataFrame, pd.DataFrame]:
    """Validate the inputs, publish the column names as globals and compute the metric.

    Asserts that ``tolerances`` is non-empty, that its keys equal the event
    labels found in ``solution`` (ignoring "start"/"end") and that the solution
    time column is numeric.  Raises ``ParticipantVisibleError`` when the
    submission lacks one of the four named columns or when its time or score
    column is not numeric.  Returns whatever ``event_detection_ap`` returns.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # --- metric parameters -------------------------------------------------
    assert len(tolerances) > 0, "Events must have defined tolerances."
    assert set(tolerances.keys()) == set(solution[event_column_name]).difference(
        {"start", "end"}
    ), (
        f"Solution column {event_column_name} must contain the same events "
        "as defined in tolerances."
    )
    assert is_numeric(
        solution[time_column_name]
    ), f"Solution column {time_column_name} must be of numeric type."

    # --- submission format -------------------------------------------------
    required_columns = (
        series_id_column_name,
        time_column_name,
        event_column_name,
        score_column_name,
    )
    for column_name in required_columns:
        if column_name not in submission.columns:
            raise ParticipantVisibleError(
                f"Submission must have column '{column_name}'."
            )

    for numeric_column in (time_column_name, score_column_name):
        if not is_numeric(submission[numeric_column]):
            raise ParticipantVisibleError(
                f"Submission column '{numeric_column}' must be of numeric type."
            )

    # The helper functions read these names from module scope instead of taking
    # them as arguments, so publish the caller's choices there.
    globals().update(
        series_id_column_name=series_id_column_name,
        time_column_name=time_column_name,
        event_column_name=event_column_name,
        score_column_name=score_column_name,
        use_scoring_intervals=use_scoring_intervals,
    )

    return event_detection_ap(solution, submission, tolerances, verbose=verbose)


def find_nearest(xs: np.ndarray, value):
    """
    Locate the entry of the ascending array ``xs`` that is closest to ``value``.

    Only the insertion point and its direct neighbours are inspected.  Returns
    ``(index, absolute error, signed difference xs[index] - value)``; for an
    empty ``xs`` that is ``(None, inf, inf)``.  On a tie the lower index wins.
    """
    insert_at = np.searchsorted(xs, value, side="left")
    first = max(0, insert_at - 1)
    stop = min(len(xs), insert_at + 2)

    nearest = (None, float("inf"), float("inf"))
    # Candidates: one before the insertion point, the point itself, one after.
    for pos in range(first, stop):
        signed = xs[pos] - value
        distance = abs(signed)
        if distance < nearest[1]:
            nearest = (pos, distance, signed)

    return nearest


def find_nearest_time_idx(sorted_gt_times, det_time, excluded_indices: set):
    """
    Find the not-yet-excluded ground-truth time closest to ``det_time``.

    ``sorted_gt_times`` must be in ascending order.  Returns
    ``(index into sorted_gt_times, absolute error, signed difference)``; the
    index is ``None`` when every ground-truth entry is excluded.
    """
    gt_times = np.asarray(sorted_gt_times)
    # Positions still open for matching, in ascending order.
    remaining = sorted(set(range(len(gt_times))) - excluded_indices)
    remaining = np.asarray(remaining, dtype=int)

    local_idx, error, diff = find_nearest(gt_times[remaining], det_time)
    if local_idx is None:
        return None, error, diff
    # Translate the position within the filtered array back to the full array.
    return remaining[local_idx], error, diff


def match_detections(
    tolerance: float, ground_truths: pd.DataFrame, detections: pd.DataFrame
) -> pd.DataFrame:
    """Greedily pair detections with ground-truth events, best score first.

    Detections are ranked by descending score (rows containing NaN are dropped).
    Each one claims the nearest still-unclaimed ground-truth time if it lies
    strictly within ``tolerance``.  Returns the ranked detections with a boolean
    ``matched`` column and a ``diff`` column (signed distance to the claimed
    ground truth, ``inf`` when unmatched).
    """
    ranked = detections.sort_values(score_column_name, ascending=False).dropna()
    event_column = ranked[event_column_name]
    matched_flags = np.full_like(event_column, False, dtype=bool)
    signed_diffs = np.full_like(event_column, float("inf"), dtype=float)

    gt_times = ground_truths.sort_values(time_column_name)[time_column_name].to_list()
    claimed_gt = set()

    for row_no, detection in enumerate(ranked.itertuples(index=False)):
        nearest_gt, distance, signed = find_nearest_time_idx(
            gt_times, getattr(detection, time_column_name), claimed_gt
        )
        if nearest_gt is None or not (distance < tolerance):
            continue
        matched_flags[row_no] = True
        signed_diffs[row_no] = signed
        claimed_gt.add(nearest_gt)

    ranked["matched"] = matched_flags
    ranked["diff"] = signed_diffs
    return ranked


def precision_recall_curve(
    matches: np.ndarray, scores: np.ndarray, p: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Precision/recall pairs over every distinct score threshold.

    ``matches`` flags which detections hit a ground truth, ``scores`` are their
    confidences and ``p`` is the number of ground-truth events.  The outputs are
    ordered so recall is non-increasing and end with the (precision 1, recall 0)
    point.  With no detections the result is ``([1], [0], [])``.
    """
    if len(matches) == 0:
        return [1], [0], []

    # Rank by decreasing confidence (stable sort, then reversed).
    order = np.argsort(scores, kind="stable")[::-1]
    ranked_scores = scores[order]
    ranked_matches = matches[order]

    # Last position of every run of equal scores, plus the final position.
    change_points = np.where(np.diff(ranked_scores))[0]
    cut_idxs = np.r_[change_points, ranked_matches.size - 1]
    thresholds = ranked_scores[cut_idxs]

    # Lowering the threshold turns matches into TPs and non-matches into FPs.
    tps = np.cumsum(ranked_matches)[cut_idxs]
    fps = np.cumsum(~ranked_matches)[cut_idxs]

    precision = tps / (tps + fps)
    precision[np.isnan(precision)] = 0
    # p may differ from the number of matches, so recall need not reach 1.
    recall = tps / p

    # Cut at the first threshold reaching the final TP count, then walk backwards.
    full_recall_at = tps.searchsorted(tps[-1])
    backwards = slice(full_recall_at, None, -1)

    return (
        np.r_[precision[backwards], 1],
        np.r_[recall[backwards], 0],
        thresholds[backwards],
    )


def average_precision_score(matches: np.ndarray, scores: np.ndarray, p: int) -> float:
    """Area under the precision-recall curve, summed as a step function."""
    precision, recall, _ = precision_recall_curve(matches, scores, p)
    recall_steps = np.diff(recall)
    step_heights = np.array(precision)[:-1]
    # Recall is non-increasing along the curve, hence the sign flip.
    return -np.sum(recall_steps * step_heights)


def event_detection_ap(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    tolerances: Dict[str, List[float]] = tolerances,
    progress_bar: bool = True,
    verbose: bool = True,
) -> Tuple[float, pd.DataFrame, pd.DataFrame]:
    """Mean average precision of detected events over event classes and tolerances.

    Column names and the scoring-interval flag are read from module-level
    globals (``series_id_column_name``, ``time_column_name``,
    ``event_column_name``, ``score_column_name``, ``use_scoring_intervals``).
    The default for ``tolerances`` is the module-level dict as it existed when
    this function was defined.

    Returns ``(mean_ap, ap_table, detections_matched)``: the overall score, the
    tolerance x event table of average precisions, and every detection with the
    ``matched`` / ``diff`` columns produced by ``match_detections`` (one copy
    per tolerance).

    Raises ``NotImplementedError`` when ``use_scoring_intervals`` is true.
    """
    # Ensure solution and submission are sorted properly
    solution = solution.sort_values([series_id_column_name, time_column_name])
    submission = submission.sort_values([series_id_column_name, time_column_name])

    # Extract scoring intervals.
    if use_scoring_intervals:
        raise NotImplementedError("Scoring intervals not implemented.")

    # Extract ground-truth events.
    ground_truths = solution.query("event not in ['start', 'end']").reset_index(
        drop=True
    )

    # Map each event class to its prevalence (needed for recall calculation)
    class_counts = ground_truths.value_counts(event_column_name).to_dict()

    # Create table for detections with a column indicating a match to a ground-truth event
    detections = submission.assign(matched=False)

    # Remove detections outside of scoring intervals
    if use_scoring_intervals:
        raise NotImplementedError("Scoring intervals not implemented.")
    else:
        detections_filtered = detections

    # Create table of event-class x tolerance x series_id values
    # (series ids are taken from the ground truth only)
    aggregation_keys = pd.DataFrame(
        [
            (ev, tol, vid)
            for ev in tolerances.keys()
            for tol in tolerances[ev]
            for vid in ground_truths[series_id_column_name].unique()
        ],
        columns=[event_column_name, "tolerance", series_id_column_name],
    )

    # Create match evaluation groups: event-class x tolerance x series_id
    # The left merges keep every key even when it has no detections / ground
    # truths, so get_group() below finds a group for each key.
    detections_grouped = aggregation_keys.merge(
        detections_filtered, on=[event_column_name, series_id_column_name], how="left"
    ).groupby([event_column_name, "tolerance", series_id_column_name])
    ground_truths_grouped = aggregation_keys.merge(
        ground_truths, on=[event_column_name, series_id_column_name], how="left"
    ).groupby([event_column_name, "tolerance", series_id_column_name])

    # Match detections to ground truth events by evaluation group
    pbars = aggregation_keys.itertuples(index=False)
    if progress_bar:
        pbars = tqdm(pbars, total=len(aggregation_keys), desc="Matching detections")
    detections_matched = []
    for key in pbars:
        dets = detections_grouped.get_group(key)
        gts = ground_truths_grouped.get_group(key)
        # Every row of the group carries the same tolerance, so read it from the first.
        detections_matched.append(
            match_detections(dets["tolerance"].iloc[0], gts, dets)
        )
    detections_matched = pd.concat(detections_matched)

    # Compute AP per event x tolerance group
    event_classes = ground_truths[event_column_name].unique()
    ap_table = (
        detections_matched.query("event in @event_classes")
        .groupby([event_column_name, "tolerance"])
        .apply(
            lambda group: average_precision_score(
                group["matched"].to_numpy(),
                group[score_column_name].to_numpy(),
                class_counts[group[event_column_name].iat[0]],
            )
        )
        .reset_index()
        .pivot(index="tolerance", columns="event", values=0)
    )
    if verbose:
        display(ap_table)
    # Average over tolerances, then over event classes
    mean_ap = ap_table.mean().mean()

    return mean_ap, ap_table, detections_matched

In [32]:
# Ground-truth events with incomplete rows removed.
df_solution = pd.read_csv(f"{INPUT_DIR}train_events.csv").dropna()
# The written submission is loaded as well, but the night-free ensemble
# (df_ensembled_test) is what gets scored; pass df_sub instead to score the file.
df_sub = pd.read_csv("submission.csv")
# Alternative solution, restricted to the series that were loaded:
# df_events[df_events['series_id'].isin(list(df_series['series_id'].unique()))].reset_index()
score_all, df_score, df_result = score(
    df_solution,
    df_ensembled_test,
    tolerances,
    series_id_column_name,
    time_column_name,
    event_column_name,
    score_column_name,
)
print(score_all)

Matching detections:   0%|          | 0/5380 [00:00<?, ?it/s]

  detections_matched.query("event in @event_classes")


event,onset,wakeup
tolerance,Unnamed: 1_level_1,Unnamed: 2_level_1
12,0.530569,0.567454
36,0.738542,0.744962
60,0.812082,0.836289
90,0.855104,0.866044
120,0.873586,0.877023
150,0.880985,0.885567
180,0.888735,0.891021
240,0.895745,0.901368
300,0.899375,0.908914
360,0.903972,0.914476


0.8335906054438189


---

# Make Prediction

In [99]:
# Load the test series and run them through the shared preprocessing.
df_test = pl.read_parquet(INPUT_DIR + "test_series.parquet")
(
    list_df_1min_test,
    df_1min_test,
    list_feature_array_test,
    dict_valid_ratio_test,
) = data_process(df_test)

  0%|          | 0/3 [00:00<?, ?it/s]

In [100]:
X_test = np.concatenate(list_feature_array_test)
# Min-max scale each slice along axis 1, with statistics taken over axes 0 and 2.
channel_min = X_test.min(axis=(0, 2), keepdims=True)
channel_range = X_test.max(axis=(0, 2), keepdims=True) - channel_min
# A slice that is constant over the whole test set has zero range; dividing by
# it would turn the slice into NaN (0/0).  Divide by 1 instead so it maps to 0.
channel_range[channel_range == 0] = 1
X_test = (X_test - channel_min) / channel_range

df_1min_test = pd.concat(list_df_1min_test)
# One row per (series_id, date), one column per time of day, 0 where no data exists.
df_mask_test = df_1min_test.pivot(index=["series_id", "date"], columns="time", values="valid_flag").fillna(0)
# Prepending the empty template frame adds any time-of-day column missing from
# the test data, so the mask has the same columns as the template.
if df_mask_test.shape[1] != df_dummy_2.shape[1]:
    df_mask_test = pd.concat([df_dummy_2, df_mask_test]).fillna(0)

gc.collect()

145

In [102]:
# Accumulators for the fold-averaged predictions of each seed.
preds_test1 = np.zeros_like(df_mask_test.values)
preds_test2 = np.zeros_like(df_mask_test.values)

ds_test = MyDataset(X_test, None, df_mask_test.values)  # No labels (Y) for test dataset
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True, drop_last=False)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = UNet1d(
        input_channels=X_test.shape[1],
        initial_channels=initial_channels_num,
        initial_kernel_size=15,
        down_channels=(initial_channels_num, initial_channels_num, initial_channels_num),
        down_kernel_size=(12, 15, 15),
        down_stride=(12, 9, 5),  # first element must be 12
        res_depth=3,
        res_kernel_size=15,
        se_ratio=8,
        out_kernel_size=21,
    )

model.to(device)
# A freshly built module starts in training mode, and nothing in this cell
# switched it.  Set eval mode explicitly so inference does not rely on predict()
# (defined elsewhere) doing it.
model.eval()

# Seed index k == 0 feeds preds_test1 and every other k feeds preds_test2, each
# divided by n_splits.  That is a per-seed fold average only while `seed` is 2.
for i in range(n_splits):
    for k in range(seed):
        with torch.no_grad():
            model.load_state_dict(torch.load(f"{MODEL_PATH}model_{k}_{i}.pth", map_location=device))
            if k == 0:
                preds_test1 += predict(model, dl_test, device) / n_splits
            else:
                preds_test2 += predict(model, dl_test, device) / n_splits

In [103]:
# Long-format prediction tables, one per seed (built in order: 1, then 2).
df_pred_test1, df_pred_test2 = (
    make_testset(seed_preds) for seed_preds in (preds_test1, preds_test2)
)

Unnamed: 0,series_id,date,time,step,score
0,038441c925bb,2018-08-14,19:30:00,5,-0.153482
1,038441c925bb,2018-08-14,19:31:00,17,-0.169492
2,038441c925bb,2018-08-14,19:32:00,29,-0.170599
3,038441c925bb,2018-08-14,19:33:00,41,-0.154519
4,038441c925bb,2018-08-14,19:34:00,53,-0.151681
5,038441c925bb,2018-08-14,19:35:00,65,-0.149816
6,038441c925bb,2018-08-14,19:36:00,77,-0.156551
7,038441c925bb,2018-08-14,19:37:00,89,-0.162371
8,038441c925bb,2018-08-14,19:38:00,101,-0.180748
9,038441c925bb,2018-08-14,19:39:00,113,-0.215714


Unnamed: 0,series_id,date,time,step,score
0,038441c925bb,2018-08-14,19:30:00,5,0.574293
1,038441c925bb,2018-08-14,19:31:00,17,0.555117
2,038441c925bb,2018-08-14,19:32:00,29,0.545039
3,038441c925bb,2018-08-14,19:33:00,41,0.538126
4,038441c925bb,2018-08-14,19:34:00,53,0.552727
5,038441c925bb,2018-08-14,19:35:00,65,0.534041
6,038441c925bb,2018-08-14,19:36:00,77,0.529779
7,038441c925bb,2018-08-14,19:37:00,89,0.533703
8,038441c925bb,2018-08-14,19:38:00,101,0.54242
9,038441c925bb,2018-08-14,19:39:00,113,0.550099


In [104]:
# Turn each prediction table into candidate event rows; the displayed output has
# the columns row_id, series_id, step, event and score.
# NOTE(review): sub_without_pp is defined outside this section; "without pp"
# presumably means "without post-processing". Confirm against its definition.
df_sub_test1 = sub_without_pp(df_pred_test1)
df_sub_test2 = sub_without_pp(df_pred_test2)

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,29,onset,0.170599


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,53,wakeup,0.552727
1,1,03d92c9f6f8a,53,wakeup,0.551017


In [105]:
def get_grouped_dfs_without_night(df):
    """Split `df` into one frame per series_id, ordered for feature building.

    Each returned frame is sorted by `event` ascending, then `score`
    descending, has its `step` column renamed to `step_pred`, and gains a
    `daily_step` column equal to `step_pred % (12*60*24) / (12*60*24)`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `series_id`, `event`, `score` and `step` columns.

    Returns
    -------
    list of pandas.DataFrame
        One frame per series_id, in groupby (sorted key) order.
    """
    steps_per_cycle = 12 * 60 * 24

    return [
        series_rows
        .sort_values(['event', 'score'], ascending=[True, False])
        .rename(columns={"step": "step_pred"})
        .assign(
            daily_step=lambda frame: frame['step_pred'] % steps_per_cycle / steps_per_cycle
        )
        for _, series_rows in df.groupby('series_id')
    ]

# Per-series, sorted candidate-event frames for each model's events.
final_dfs_test1, final_dfs_test2 = (
    get_grouped_dfs_without_night(sub_frame)
    for sub_frame in (df_sub_test1, df_sub_test2)
)

In [106]:
# Pair each list of per-series frames with its prediction frame and run
# df_before; the displayed output carries before/after_states_feat_* columns.
result_df_test1, result_df_test2 = (
    df_before(grouped, pred_frame)
    for grouped, pred_frame in (
        (final_dfs_test1, df_pred_test1),
        (final_dfs_test2, df_pred_test2),
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step_pred,event,score,daily_step,before_states_feat_12,after_states_feat_12,before_states_feat_24,after_states_feat_24,before_states_feat_60,after_states_feat_60,before_states_feat_120,after_states_feat_120,before_states_feat_240,after_states_feat_240,before_states_feat_360,after_states_feat_360,before_states_feat_720,after_states_feat_720
0,0,038441c925bb,29,onset,0.170599,0.001678,-0.169492,-0.154519,-0.161487,-0.1531,-0.161487,-0.154988,-0.161487,-0.204333,-0.161487,-0.204333,-0.161487,-0.204333,-0.161487,-0.204333


  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step_pred,event,score,daily_step,before_states_feat_12,after_states_feat_12,before_states_feat_24,after_states_feat_24,before_states_feat_60,after_states_feat_60,before_states_feat_120,after_states_feat_120,before_states_feat_240,after_states_feat_240,before_states_feat_360,after_states_feat_360,before_states_feat_720,after_states_feat_720
0,0,038441c925bb,53,wakeup,0.552727,0.003067,0.538126,0.534041,0.541582,0.53191,0.553144,0.538008,0.553144,0.559641,0.553144,0.559641,0.553144,0.559641,0.553144,0.559641
1,1,03d92c9f6f8a,53,wakeup,0.551017,0.003067,0.534232,0.538757,0.539568,0.533631,0.547232,0.538401,0.547232,0.555804,0.547232,0.555804,0.547232,0.555804,0.547232,0.555804


In [107]:
# Build the second-stage feature frames for both result frames.
df_final_test1, df_final_test2 = (
    make_features(result_frame)
    for result_frame in (result_df_test1, result_df_test2)
)

In [109]:
# Column list of df_final1 (defined earlier in the notebook), shown for
# comparison with the columns of df_final_test1.
df_final1.columns

Index(['event', 'score', 'correctness', 'daily_step', 'before_states_feat_12',
       'after_states_feat_12', 'before_states_feat_24', 'after_states_feat_24',
       'before_states_feat_60', 'after_states_feat_60',
       'before_states_feat_120', 'after_states_feat_120',
       'before_states_feat_240', 'after_states_feat_240',
       'before_states_feat_360', 'after_states_feat_360',
       'before_states_feat_720', 'after_states_feat_720', 'daily_step_sleep',
       'state_diff_12', 'state_diff_24', 'state_diff_60', 'state_diff_120',
       'state_diff_240', 'state_diff_360', 'state_diff_720'],
      dtype='object')

In [108]:
# Column list of the test-time feature frame; it still carries row_id,
# series_id and step_pred, which inference() drops before predicting.
df_final_test1.columns

Index(['row_id', 'series_id', 'step_pred', 'event', 'score', 'daily_step',
       'before_states_feat_12', 'after_states_feat_12',
       'before_states_feat_24', 'after_states_feat_24',
       'before_states_feat_60', 'after_states_feat_60',
       'before_states_feat_120', 'after_states_feat_120',
       'before_states_feat_240', 'after_states_feat_240',
       'before_states_feat_360', 'after_states_feat_360',
       'before_states_feat_720', 'after_states_feat_720', 'daily_step_sleep',
       'state_diff_12', 'state_diff_24', 'state_diff_60', 'state_diff_120',
       'state_diff_240', 'state_diff_360', 'state_diff_720'],
      dtype='object')

In [113]:
def inference(df_final, version, n_splits=10):
    """Average the predictions of the saved per-fold boosters for one version.

    Parameters
    ----------
    df_final : pandas.DataFrame
        Feature frame containing `row_id`, `series_id`, `step_pred` (all
        dropped before prediction) plus the model features. Every remaining
        column is cast to float32, so `event` must already be numeric; it is
        then re-cast to a categorical column.
    version : int or str
        Interpolated into the model file name `gbm_{version}_{fold}.bin`,
        which is resolved relative to the current working directory.
    n_splits : int, default 10
        Number of fold models to load and average. This parameter is
        independent of any notebook-level `n_splits` variable.

    Returns
    -------
    numpy.ndarray
        float32 array with one averaged prediction per row of `df_final`.
    """
    identifier_cols = ['row_id', 'series_id', 'step_pred']

    X_test = df_final.drop(columns=identifier_cols).astype('float32')
    X_test['event'] = X_test['event'].astype('category')

    y_pred_test = np.zeros(len(df_final), dtype='float32')
    for fold in range(n_splits):
        # `lgb` is expected to be imported elsewhere in the notebook
        # (presumably lightgbm — confirm).
        booster = lgb.Booster(model_file=f'gbm_{version}_{fold}.bin')
        fold_pred = booster.predict(X_test, num_iteration=booster.best_iteration)
        y_pred_test += fold_pred / n_splits

    return y_pred_test

In [114]:
# Second-stage predictions: version 1 models for the first feature frame,
# version 2 models for the second.
y_pred_test1, y_pred_test2 = (
    inference(feature_frame, version)
    for version, feature_frame in ((1, df_final_test1), (2, df_final_test2))
)

In [118]:
def comb_with_stage1model_without_night(result_df, y_pred):
    """Rescale the stage-one event scores by the second-stage predictions.

    Runs `make_features` on `result_df`, keeps the series_id / event /
    step_pred / score columns, maps event code 1 to 'wakeup' and any other
    value to 'onset', multiplies `score` by `y_pred`, renames `step_pred` to
    `step` and casts it to int64. The resulting frame is displayed and
    returned.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Input accepted by `make_features`.
    y_pred : array-like
        Multiplier applied element-wise to the `score` column.

    Returns
    -------
    pandas.DataFrame
        Columns: series_id, event, step, score.
    """
    def _event_label(code):
        return 'wakeup' if code == 1 else 'onset'

    feature_frame = make_features(result_df)
    df_output = (
        feature_frame[['series_id', 'event', 'step_pred', 'score']]
        .assign(
            event=lambda frame: frame['event'].apply(_event_label),
            score=lambda frame: frame['score'] * y_pred,
        )
        .rename(columns={"step_pred": "step"})
        .astype({'step': 'int64'})
    )
    display(df_output)
    return df_output

# Combine each model's stage-one scores with its second-stage predictions.
df_test1, df_test2 = (
    comb_with_stage1model_without_night(result_frame, stage2_pred)
    for result_frame, stage2_pred in (
        (result_df_test1, y_pred_test1),
        (result_df_test2, y_pred_test2),
    )
)

Unnamed: 0,series_id,event,step,score
0,038441c925bb,onset,29,0.103116


Unnamed: 0,series_id,event,step,score
0,038441c925bb,wakeup,53,0.406885
1,03d92c9f6f8a,wakeup,53,0.407625


In [121]:
# Stack the two models' event predictions, ordered by series and step.
df_combined_test = pd.concat([df_test1, df_test2]).sort_values(by=['series_id', 'step'])

def aggregate_scores(group):
    """Collapse one (series_id, step, event) group to its highest score.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain a `score` column.

    Returns
    -------
    pandas.Series
        Single entry `score` holding the group's maximum score.
    """
    return pd.Series({'score': group['score'].max()})

# When both models emit the same (series_id, step, event), keep the higher score.
# - Selecting [['score']] means the helper only ever receives the score column,
#   not the grouping columns.
# - reset_index() (without drop=True) restores series_id / step / event as
#   columns; dropping them would leave a submission containing only `score`.
df_ensembled_test = (
    df_combined_test
    .groupby(['series_id', 'step', 'event'])[['score']]
    .apply(aggregate_scores)
    .reset_index()
    .head(100000)
)
# Write the positional index under the `row_id` header, the same column name
# the earlier submission frames use.
df_ensembled_test.to_csv('submission.csv', index_label='row_id')
df_ensembled_test

  df_ensembled_test = df_combined_test.groupby(['series_id', 'step', 'event']).apply(aggregate_scores).reset_index()


Unnamed: 0,series_id,step,event,score
0,038441c925bb,29,onset,0.103116
1,038441c925bb,53,wakeup,0.406885
2,03d92c9f6f8a,53,wakeup,0.407625
