In [1]:
import itertools
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import yaml
from tqdm.notebook import tqdm

2024-04-01 13:36:40.988570: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-01 13:36:41.014441: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Names of the six vote columns, and the matching ensemble-prediction columns.
TARGETS = [
    "seizure_vote",
    "lpd_vote",
    "gpd_vote",
    "lrda_vote",
    "grda_vote",
    "other_vote",
]
PREDS = [f"pred_{target}" for target in TARGETS]

In [3]:
def add_score(data):
    """Attach a per-row KL-divergence column named 'score' to `data`.

    The label columns listed in TARGETS (shifted by 1e-5) are compared with
    the prediction columns listed in PREDS using Keras' KLDivergence loss with
    reduction='none', i.e. one value per row rather than a batch average.

    Note that `data` is modified in place and also returned.
    """
    kl_per_row = tf.keras.losses.KLDivergence(reduction='none')
    y_true = data[TARGETS].values + 1e-5  # small offset added to every label
    y_pred = data[PREDS].values
    data['score'] = kl_per_row(y_true, y_pred)
    return data

# make train df

In [4]:
# Out-of-fold (OOF) prediction CSVs, one per model, that are candidates for
# the mean ensemble.
# Order matters: entry i is paired positionally with entry i of USE_COLS_LIST
# when both are passed to compute_oof_score, and the LAST entry is the file
# whose eeg_ids are used to restrict `train` before scoring.
oof_list = [
    "/kaggle/working/exp004_resnet34d/resnet34d_oof_df_exp004_resnet34d_stage1.csv",
    "/kaggle/working/exp004_effnet_b0_ns/oof_with_score.csv",
    "/kaggle/working/exp003_resnet34d/oof_with_score.csv",
    "/kaggle/working/maxvit/oof_df.csv",
    "/kaggle/working/tf2_13_models/K/oof.csv",
    "/kaggle/working/tf2_13_models/E/oof.csv",
    "/kaggle/working/tf2_13_models/R/oof.csv",
    "/kaggle/working/tf2_13_models/KE/oof.csv",
    "/kaggle/working/tf2_13_models/KR/oof.csv",
    "/kaggle/working/tf2_13_models/ER/oof.csv",
    "/kaggle/working/tf2_13_models/KER/oof.csv",
    "/kaggle/working/Resnet1d_GRU_torch/oof.csv",
]

In [5]:
# Column layout used by most OOF files: the key column followed by
# 'pred_<target>' columns.
pred_cols = [
    'eeg_id',
    'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote',
    'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote',
]

# Alternative layout where the prediction columns are named '<target>_pred'.
_pred_cols = [
    'eeg_id',
    'seizure_vote_pred', 'lpd_vote_pred', 'gpd_vote_pred',
    'lrda_vote_pred', 'grda_vote_pred', 'other_vote_pred',
]

# One layout per OOF file, in the same order as oof_list: only the fourth
# file (position 3) uses the alternative '<target>_pred' naming.
USE_COLS_LIST = [pred_cols] * 3 + [_pred_cols] + [pred_cols] * 8

In [6]:
# Load the competition's training metadata.
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
# Rebind TARGETS to the labels of the last six columns. This is a pandas Index
# (not a plain list) and relies on the vote columns being the final six columns
# of train.csv; the print below lets the reader confirm that.
TARGETS = df.columns[-6:]
print(TARGETS)

Index(['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote',
       'other_vote'],
      dtype='object')


In [7]:
# Number of rows in the raw training metadata (before grouping by eeg_id).
print(len(df))

106800


In [8]:
# Collapse the rows of `df` into a single row per eeg_id.
by_eeg = df.groupby('eeg_id')

# First spectrogram id, min/max label offset and first patient id per EEG.
train = by_eeg.agg(
    spec_id=('spectrogram_id', 'first'),
    min=('spectrogram_label_offset_seconds', 'min'),
    max=('spectrogram_label_offset_seconds', 'max'),
    patient_id=('patient_id', 'first'),
)

# Sum the votes over all rows of an EEG, then normalise each row so the six
# vote columns add up to 1.
vote_totals = by_eeg[TARGETS].agg('sum').values
vote_fractions = vote_totals / vote_totals.sum(axis=1, keepdims=True)
for position, vote_col in enumerate(TARGETS):
    train[vote_col] = vote_fractions[:, position]

# Keep the first expert_consensus value of each EEG as a categorical target.
train['target'] = by_eeg['expert_consensus'].agg('first')

train = train.reset_index()
print('Train non-overlapp eeg_id shape:', train.shape )
train.head()

Train non-overlapp eeg_id shape: (17089, 12)


Unnamed: 0,eeg_id,spec_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


In [9]:
def filter_data_and_renamecol(df_, use_cols_, pred_idx_):
    """Select one model's prediction columns, rename them, and average per key.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing every column named in `use_cols_`.
    use_cols_ : sequence of str
        `use_cols_[0]` is the key column to group by; the remaining entries
        are the prediction columns, named either 'pred_<target>' or
        '<target>_pred'.
    pred_idx_ : int
        Index of the model; appended to each prediction column as a
        '_<pred_idx_>' suffix so several models can live in one frame.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key value. The columns are the renamed prediction
        columns ('pred_<target>_<pred_idx_>', averaged over rows sharing a key)
        followed by the key column, with a fresh RangeIndex.

    The input frame is not modified.
    """
    key_col = use_cols_[0]
    suffix = "_" + str(pred_idx_)

    # Keep only the needed columns and give them 'pred_<target>_<idx>' names.
    rename_dict = {}
    for orig_col in use_cols_[1:]:
        if orig_col.startswith("pred_"):
            rename_dict[orig_col] = orig_col + suffix
        else:
            # '<target>_pred' -> 'pred_<target>'. The '_pred' suffix is removed
            # from the original name, so the result does not depend on how many
            # digits pred_idx_ has (a fixed-width slice breaks for idx >= 10).
            base = orig_col[:-len("_pred")] if orig_col.endswith("_pred") else orig_col
            rename_dict[orig_col] = "pred_" + base + suffix
    rename_cols = list(rename_dict.values())

    df_ = df_.rename(columns=rename_dict)
    df_ = df_[[key_col] + rename_cols]
    # Several rows may share a key; collapse them to their mean prediction.
    df_ = df_.groupby(key_col)[rename_cols].mean()
    df_[key_col] = df_.index
    return df_.reset_index(drop=True)

In [10]:
def compute_oof_score(df_, oof_list, USE_COLS_LIST_):
    """Score the equal-weight mean ensemble of several OOF prediction files.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Ground-truth frame with an 'eeg_id' column and the vote columns
        (columns whose names end in 'vote'). It is copied, not modified.
    oof_list : sequence of str
        Paths of the OOF prediction CSVs to ensemble.
    USE_COLS_LIST_ : sequence of sequence of str
        For each path in `oof_list` (same order), the key column followed by
        that file's prediction columns, as expected by
        `filter_data_and_renamecol`.

    Returns
    -------
    (score, frame)
        `score` is the mean of the per-row 'score' column produced by
        `add_score`; `frame` is the merged frame holding the labels, every
        model's 'pred_<target>_<i>' columns, the averaged 'pred_<target>'
        columns and 'score'.

    Raises
    ------
    ValueError
        If `oof_list` and `USE_COLS_LIST_` differ in length. Without this
        check `zip` would silently ignore the surplus entries and score a
        different ensemble than the caller asked for.
    """
    if len(oof_list) != len(USE_COLS_LIST_):
        raise ValueError(
            "oof_list and USE_COLS_LIST_ must have the same length, got "
            f"{len(oof_list)} and {len(USE_COLS_LIST_)}"
        )

    df_ = df_.copy()
    for idx, (oof_path, cols) in enumerate(zip(oof_list, USE_COLS_LIST_)):
        oof_ = pd.read_csv(oof_path)
        oof_ = filter_data_and_renamecol(oof_, cols, idx)
        # The OOF files appear to contain fewer eeg_ids than the label frame,
        # so use an inner join and keep only the ids every model predicted.
        df_ = pd.merge(df_, oof_, on="eeg_id", how="inner")

    # Average the per-model columns of each target into 'pred_<target>'.
    vote_cols = [col for col in df_.columns if col.endswith("vote")]
    for vote_col in vote_cols:
        member_cols = [col for col in df_.columns if col.startswith("pred_"+vote_col+"_")]
        df_[f"pred_{vote_col}"] = df_[member_cols].mean(axis=1)

    df_ = add_score(df_)
    score = df_["score"].mean()

    return score, df_
        

In [11]:
# Restrict `train` to the eeg_ids that appear in the last OOF file of
# oof_list, so that every later ensemble is scored on rows that file covers.
oof = pd.read_csv(oof_list[-1])
use_eeg_id = oof["eeg_id"].values
train = train[train["eeg_id"].isin(use_eeg_id)]

In [12]:
# Baseline: mean ensemble of every model in oof_list.
score,_ = compute_oof_score(train, oof_list, USE_COLS_LIST)
print("OOF Score:", score)

OOF Score: 0.6448943234697333


2024-04-01 13:36:42.760397: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-01 13:36:42.776759: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-01 13:36:42.776870: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [13]:
# Ensemble without the first model. The merged frame is not needed here, so it
# is discarded instead of being bound to `df`, which holds the raw train.csv
# frame that earlier cells depend on when re-run.
score, _ = compute_oof_score(train, oof_list[1:], USE_COLS_LIST[1:])
print("OOF Score:", score)

OOF Score: 0.6397006619422886


In [14]:
# Ensemble without the first two models. The merged frame is not needed here,
# so it is discarded instead of being bound to `df`, which holds the raw
# train.csv frame that earlier cells depend on when re-run.
score, _ = compute_oof_score(train, oof_list[2:], USE_COLS_LIST[2:])
print("OOF Score:", score)

OOF Score: 0.6383730678066079


In [15]:
# Try removing one entry at a time from oof_list and keep the subset with the
# lowest OOF score.
best_score = 100  # sentinel above any score expected here
for drop_pos in range(len(oof_list)):
    tmp_oof_list = [path for pos, path in enumerate(oof_list) if pos != drop_pos]
    tmp_USE_COLS_LIST = [cols for pos, cols in enumerate(USE_COLS_LIST) if pos != drop_pos]
    score, _ = compute_oof_score(train, tmp_oof_list, tmp_USE_COLS_LIST)
    if not score < best_score:
        continue
    best_score, best_oof_list, best_USE_COLS_LIST = score, tmp_oof_list, tmp_USE_COLS_LIST

print("Best OOF Score:", best_score)
print("Best oof_list:", best_oof_list)
print("Best USE_COLS_LIST:", best_USE_COLS_LIST)

Best OOF Score: 0.6344940332130253
Best oof_list: ['/kaggle/working/exp004_resnet34d/resnet34d_oof_df_exp004_resnet34d_stage1.csv', '/kaggle/working/exp004_effnet_b0_ns/oof_with_score.csv', '/kaggle/working/exp003_resnet34d/oof_with_score.csv', '/kaggle/working/maxvit/oof_df.csv', '/kaggle/working/tf2_13_models/K/oof.csv', '/kaggle/working/tf2_13_models/E/oof.csv', '/kaggle/working/tf2_13_models/R/oof.csv', '/kaggle/working/tf2_13_models/KE/oof.csv', '/kaggle/working/tf2_13_models/KR/oof.csv', '/kaggle/working/tf2_13_models/ER/oof.csv', '/kaggle/working/tf2_13_models/KER/oof.csv']
Best USE_COLS_LIST: [['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'seizure_vote_

In [16]:
# Try removing two entries at a time from oof_list and keep the subset with
# the lowest OOF score.
# itertools.combinations yields the index pairs in the same lexicographic
# order as nested `for i` / `for j in range(i+1, ...)` loops, so ties are
# still resolved in favour of the first subset found.
best_score = 100  # sentinel above any score expected here
for dropped in itertools.combinations(range(len(oof_list)), 2):
    tmp_oof_list = [path for pos, path in enumerate(oof_list) if pos not in dropped]
    tmp_USE_COLS_LIST = [cols for pos, cols in enumerate(USE_COLS_LIST) if pos not in dropped]
    score, _ = compute_oof_score(train, tmp_oof_list, tmp_USE_COLS_LIST)
    if score < best_score:
        best_score = score
        best_oof_list = tmp_oof_list
        best_USE_COLS_LIST = tmp_USE_COLS_LIST

print("Best OOF Score:", best_score)
print("Best oof_list:", best_oof_list)
print("Best USE_COLS_LIST:", best_USE_COLS_LIST)


Best OOF Score: 0.6272298963434412
Best oof_list: ['/kaggle/working/exp004_resnet34d/resnet34d_oof_df_exp004_resnet34d_stage1.csv', '/kaggle/working/exp004_effnet_b0_ns/oof_with_score.csv', '/kaggle/working/exp003_resnet34d/oof_with_score.csv', '/kaggle/working/maxvit/oof_df.csv', '/kaggle/working/tf2_13_models/K/oof.csv', '/kaggle/working/tf2_13_models/E/oof.csv', '/kaggle/working/tf2_13_models/KE/oof.csv', '/kaggle/working/tf2_13_models/KR/oof.csv', '/kaggle/working/tf2_13_models/ER/oof.csv', '/kaggle/working/tf2_13_models/KER/oof.csv']
Best USE_COLS_LIST: [['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'seizure_vote_pred', 'lpd_vote_pred', 'gpd_vote_pred', 'l

In [17]:
# Try removing three entries at a time from oof_list and keep the subset with
# the lowest OOF score.
# itertools.combinations yields the index triples in the same lexicographic
# order as three nested index loops, so ties are still resolved in favour of
# the first subset found.
best_score = 100  # sentinel above any score expected here
for dropped in itertools.combinations(range(len(oof_list)), 3):
    tmp_oof_list = [path for pos, path in enumerate(oof_list) if pos not in dropped]
    tmp_USE_COLS_LIST = [cols for pos, cols in enumerate(USE_COLS_LIST) if pos not in dropped]
    score, _ = compute_oof_score(train, tmp_oof_list, tmp_USE_COLS_LIST)
    if score < best_score:
        best_score = score
        best_oof_list = tmp_oof_list
        best_USE_COLS_LIST = tmp_USE_COLS_LIST

print("Best OOF Score:", best_score)
print("Best oof_list:", best_oof_list)
print("Best USE_COLS_LIST:", best_USE_COLS_LIST)


Best OOF Score: 0.6186082200625801
Best oof_list: ['/kaggle/working/exp004_resnet34d/resnet34d_oof_df_exp004_resnet34d_stage1.csv', '/kaggle/working/exp004_effnet_b0_ns/oof_with_score.csv', '/kaggle/working/maxvit/oof_df.csv', '/kaggle/working/tf2_13_models/K/oof.csv', '/kaggle/working/tf2_13_models/E/oof.csv', '/kaggle/working/tf2_13_models/KE/oof.csv', '/kaggle/working/tf2_13_models/KR/oof.csv', '/kaggle/working/tf2_13_models/ER/oof.csv', '/kaggle/working/tf2_13_models/KER/oof.csv']
Best USE_COLS_LIST: [['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'seizure_vote_pred', 'lpd_vote_pred', 'gpd_vote_pred', 'lrda_vote_pred', 'grda_vote_pred', 'other_vote_pred'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], [

In [18]:
# Try removing four entries at a time from oof_list and keep the subset with
# the lowest OOF score, together with its merged prediction frame (best_df).
# itertools.combinations yields the index 4-tuples in the same lexicographic
# order as four nested index loops, so ties are still resolved in favour of
# the first subset found.
best_score = 100  # sentinel above any score expected here
for dropped in itertools.combinations(range(len(oof_list)), 4):
    tmp_oof_list = [path for pos, path in enumerate(oof_list) if pos not in dropped]
    tmp_USE_COLS_LIST = [cols for pos, cols in enumerate(USE_COLS_LIST) if pos not in dropped]
    # Bind the candidate frame to its own name rather than `df`, which holds
    # the raw train.csv frame that earlier cells depend on when re-run.
    score, candidate_df = compute_oof_score(train, tmp_oof_list, tmp_USE_COLS_LIST)
    if score < best_score:
        best_df = candidate_df
        best_score = score
        best_oof_list = tmp_oof_list
        best_USE_COLS_LIST = tmp_USE_COLS_LIST

print("Best OOF Score:", best_score)
print("Best oof_list:", best_oof_list)
print("Best USE_COLS_LIST:", best_USE_COLS_LIST)

Best OOF Score: 0.611145849281353
Best oof_list: ['/kaggle/working/exp004_effnet_b0_ns/oof_with_score.csv', '/kaggle/working/maxvit/oof_df.csv', '/kaggle/working/tf2_13_models/K/oof.csv', '/kaggle/working/tf2_13_models/E/oof.csv', '/kaggle/working/tf2_13_models/KE/oof.csv', '/kaggle/working/tf2_13_models/KR/oof.csv', '/kaggle/working/tf2_13_models/ER/oof.csv', '/kaggle/working/tf2_13_models/KER/oof.csv']
Best USE_COLS_LIST: [['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'seizure_vote_pred', 'lpd_vote_pred', 'gpd_vote_pred', 'lrda_vote_pred', 'grda_vote_pred', 'other_vote_pred'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote', 'pred_grda_vote', 'pred_other_vote'], ['eeg_id', 'pred_seizure_vote', 'pred_lpd_vote', 'pred_gpd_vote', 'pred_lrda_vote',

In [19]:
# Inspect the column layout of the best-scoring ensemble frame.
best_df.columns

Index(['eeg_id', 'spec_id', 'min', 'max', 'patient_id', 'seizure_vote',
       'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',
       'target', 'pred_seizure_vote_0', 'pred_lpd_vote_0', 'pred_gpd_vote_0',
       'pred_lrda_vote_0', 'pred_grda_vote_0', 'pred_other_vote_0',
       'pred_seizure_vote_1', 'pred_lpd_vote_1', 'pred_gpd_vote_1',
       'pred_lrda_vote_1', 'pred_grda_vote_1', 'pred_other_vote_1',
       'pred_seizure_vote_2', 'pred_lpd_vote_2', 'pred_gpd_vote_2',
       'pred_lrda_vote_2', 'pred_grda_vote_2', 'pred_other_vote_2',
       'pred_seizure_vote_3', 'pred_lpd_vote_3', 'pred_gpd_vote_3',
       'pred_lrda_vote_3', 'pred_grda_vote_3', 'pred_other_vote_3',
       'pred_seizure_vote_4', 'pred_lpd_vote_4', 'pred_gpd_vote_4',
       'pred_lrda_vote_4', 'pred_grda_vote_4', 'pred_other_vote_4',
       'pred_seizure_vote_5', 'pred_lpd_vote_5', 'pred_gpd_vote_5',
       'pred_lrda_vote_5', 'pred_grda_vote_5', 'pred_other_vote_5',
       'pred_seizure_vote_6', '

In [20]:
# Persist the selected subset of OOF paths and the corresponding ensemble
# frame (labels, per-model predictions, averaged predictions and score).
ens7_dir = "/kaggle/working/mean_ens8"
os.makedirs(ens7_dir, exist_ok=True)
with open(os.path.join(ens7_dir, "best_oof_list.yaml"), "w") as f:
    yaml.dump(best_oof_list, f)
best_df.to_csv(os.path.join(ens7_dir, "oof.csv"), index=False)