In [347]:
import pandas as pd
import numpy as np

REMOVE_DUPLICATE_EEG_IDS = True # if True, each row in training corresponds to a unique eeg_id
MAX_ROWS = None # use only MAX_ROWS rows of train.csv, set to None to use all rows
USE_CACHE = True # stores all accessed eeg and spectrogram files in memory
DATA_DIR = 'data/' # /kaggle/input/hms-harmful-brain-activity-classification/

EEG_N_WINDOWS_ONE_SIDE = 2 # number of eeg 2s windows either side of the centre window to include for features
SPG_N_WINDOWS_ONE_SIDE = 12 # number of spectrogram 10s windows either side of the centre window to include for features

In [348]:
train_csv = pd.read_csv(f'{DATA_DIR}train.csv')
train_csv

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106795,351917269,6,12.0,2147388374,6,12.0,4195677307,10351,LRDA,0,0,0,3,0,0
106796,351917269,7,14.0,2147388374,7,14.0,290896675,10351,LRDA,0,0,0,3,0,0
106797,351917269,8,16.0,2147388374,8,16.0,461435451,10351,LRDA,0,0,0,3,0,0
106798,351917269,9,18.0,2147388374,9,18.0,3786213131,10351,LRDA,0,0,0,3,0,0


In [349]:
def read_parquet_cache(path):
    cache = {}

    def read_parquet(id_):
        if id_ in cache:
            return cache[id_]
        
        df = pd.read_parquet(f'{path}{id_}.parquet')
        
        if USE_CACHE:
            cache[id_] = df
        
        return df

    return read_parquet

read_eeg = read_parquet_cache(path=f'{DATA_DIR}train_eegs/')
read_eeg_test = read_parquet_cache(path=f'{DATA_DIR}test_eegs/')
read_spg = read_parquet_cache(path=f'{DATA_DIR}train_spectrograms/')
read_spg_test = read_parquet_cache(path=f'{DATA_DIR}test_spectrograms/')

In [350]:
def eeg_window(row, train=True):
    eeg_data = read_eeg(row.eeg_id) if train else read_eeg_test(row.eeg_id)
    if train:
        eeg_offset = int(row.eeg_label_offset_seconds)
        eeg_data = eeg_data.iloc[(200 * eeg_offset):(200 * (eeg_offset + 50))]
    return eeg_data

eeg_window(train_csv.iloc[0])

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
0,-80.519997,-70.540001,-80.110001,-108.750000,-120.330002,-88.620003,-101.750000,-104.489998,-99.129997,-90.389999,-97.040001,-77.989998,-88.830002,-112.120003,-108.110001,-95.949997,-98.360001,-121.730003,-106.449997,7.920000
1,-80.449997,-70.330002,-81.760002,-107.669998,-120.769997,-90.820000,-104.260002,-99.730003,-99.070000,-92.290001,-96.019997,-84.500000,-84.989998,-115.610001,-103.860001,-97.470001,-89.290001,-115.500000,-102.059998,29.219999
2,-80.209999,-75.870003,-82.050003,-106.010002,-117.500000,-87.489998,-99.589996,-96.820000,-119.680000,-99.360001,-91.110001,-99.440002,-104.589996,-127.529999,-113.349998,-95.870003,-96.019997,-123.879997,-105.790001,45.740002
3,-84.709999,-75.339996,-87.480003,-108.970001,-121.410004,-94.750000,-105.370003,-100.279999,-113.839996,-102.059998,-95.040001,-99.230003,-101.220001,-125.769997,-111.889999,-97.459999,-97.180000,-128.940002,-109.889999,83.870003
4,-90.570000,-80.790001,-93.000000,-113.870003,-129.960007,-102.860001,-118.599998,-101.099998,-107.660004,-102.339996,-98.510002,-95.300003,-88.930000,-115.639999,-99.800003,-97.500000,-88.730003,-114.849998,-100.250000,97.769997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-140.039993,-128.100006,-137.339996,-160.830002,-153.630005,-136.279999,-137.009995,-93.349998,-145.130005,-155.830002,-124.650002,-123.250000,-127.709999,-169.759995,-68.489998,-117.669998,-69.239998,-115.309998,-123.860001,65.010002
9996,-152.169998,-161.449997,-173.210007,-165.320007,-143.570007,-124.150002,-127.339996,-87.309998,-160.919998,-158.360001,-121.870003,-129.550003,-121.470001,-120.339996,-68.029999,-135.130005,-105.190002,-114.330002,-121.029999,47.090000
9997,-149.619995,-147.479996,-171.960007,-152.589996,-137.279999,-105.550003,-122.220001,-80.010002,-156.039993,-155.119995,-116.360001,-118.099998,-113.690002,-102.760002,-67.839996,-120.410004,-109.099998,-116.419998,-119.099998,95.589996
9998,-126.860001,-122.889999,-125.879997,-130.339996,-134.779999,-134.350006,-127.080002,-76.739998,-137.649994,-146.800003,-111.720001,-114.199997,-106.739998,-104.699997,-60.240002,-154.119995,-129.639999,-110.029999,-116.239998,72.980003


In [351]:
def spg_window(row, train=True):
    spg_data = read_spg(row.spectrogram_id) if train else read_spg_test(row.spectrogram_id)
    if train:
        spg_offset = int(row.spectrogram_label_offset_seconds)
        spg_data = spg_data.loc[(spg_data.time >= spg_offset) & (spg_data.time < spg_offset + 600)]
        spg_data = spg_data.drop(columns=['time'])
    return spg_data

spg_window(train_csv.iloc[0])

Unnamed: 0,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,LL_2.34,...,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
0,4.26,10.98,9.05,13.65,11.49,8.930000,18.840000,19.260000,19.240000,19.049999,...,0.31,0.17,0.28,0.19,0.24,0.27,0.29,0.16,0.22,0.19
1,2.65,3.97,12.18,13.26,14.21,13.230000,9.650000,8.110000,11.280000,8.460000,...,0.15,0.13,0.14,0.24,0.24,0.36,0.35,0.31,0.36,0.40
2,4.18,4.53,8.77,14.26,13.36,16.559999,19.219999,17.510000,22.650000,21.719999,...,0.29,0.21,0.16,0.25,0.28,0.28,0.34,0.48,0.44,0.48
3,2.41,3.21,4.92,8.07,5.97,12.420000,10.820000,14.960000,21.809999,19.629999,...,0.33,0.51,0.49,0.64,0.58,0.42,0.32,0.31,0.32,0.33
4,2.29,2.44,2.77,4.62,5.39,7.080000,9.840000,12.270000,14.410000,13.310000,...,0.44,0.38,0.48,0.63,0.45,0.45,0.49,0.33,0.31,0.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,5.26,7.41,10.16,7.25,15.35,27.590000,28.650000,27.600000,30.690001,23.330000,...,0.09,0.17,0.11,0.09,0.10,0.08,0.07,0.17,0.17,0.24
296,5.17,7.17,10.05,13.47,13.22,11.560000,29.790001,24.660000,23.379999,36.750000,...,0.06,0.06,0.05,0.11,0.09,0.08,0.16,0.21,0.20,0.20
297,1.87,3.59,8.69,7.59,8.33,11.880000,12.690000,16.360001,24.240000,21.320000,...,0.09,0.07,0.07,0.06,0.06,0.07,0.08,0.10,0.10,0.10
298,3.27,5.63,9.46,11.19,11.98,12.260000,9.240000,7.030000,8.660000,10.030000,...,0.12,0.08,0.06,0.04,0.04,0.06,0.05,0.08,0.13,0.14


In [352]:
def eeg_features(eeg_df, w=EEG_N_WINDOWS_ONE_SIDE):
    features = []
    # divide the 50s (sampled at 200Hz) window into 25 2s windows, index of the centre window = 200 * [24, 26) = [4800, 5200)
    for i in range(-w, w + 1):
        df = eeg_df.iloc[(200 * 24 + 200 * 2 * i):(200 * 26 + 200 * 2 * i)].mean(axis=0)
        df.index = [f'{label}_mean_{i}' for label in df.index]
        features.append(df) 

        df = eeg_df.iloc[(200 * 24 + 200 * 2 * i):(200 * 26 + 200 * 2 * i)].std(axis=0)
        df.index = [f'{label}_std_{i}' for label in df.index]
        features.append(df) 
    return pd.concat(features, axis=0)

# eeg_window(train_csv.iloc[0]).std(axis=0).mean()

# eeg_features(eeg_window(train_csv.iloc[0]))

train_csv.iloc[:10].apply(lambda row: eeg_features(eeg_window(row)), axis=1)

Unnamed: 0,Fp1_mean_-2,F3_mean_-2,C3_mean_-2,P3_mean_-2,F7_mean_-2,T3_mean_-2,T5_mean_-2,O1_mean_-2,Fz_mean_-2,Cz_mean_-2,...,Pz_std_2,Fp2_std_2,F4_std_2,C4_std_2,P4_std_2,F8_std_2,T4_std_2,T6_std_2,O2_std_2,EKG_std_2
0,-114.860626,-121.613144,-117.031502,-137.160873,-150.611877,-116.682854,-121.580544,-110.812302,-137.58815,-109.400467,...,14.951877,22.316343,22.683603,22.232683,32.444248,22.302801,22.093912,27.045029,13.803229,231.495605
1,-124.78997,-128.530777,-124.422325,-144.376724,-146.996246,-117.753754,-121.646423,-121.176346,-150.30603,-121.48822,...,20.735647,20.328373,21.922972,31.281889,30.577042,20.813448,26.091412,29.060429,17.67716,238.315872
2,-138.860855,-137.650894,-128.543121,-142.137253,-172.945358,-124.960548,-121.755898,-128.502472,-177.72406,-119.055023,...,26.971539,32.649151,31.474195,37.40432,36.594986,39.555389,41.489029,35.804131,20.939175,239.636017
3,-110.566093,-113.342903,-124.976776,-119.479599,-133.750626,-103.037201,-101.66201,-228.684174,-371.182983,-228.159409,...,22.461561,17.02224,22.417004,40.580009,33.502922,24.148991,34.186035,29.671082,17.740688,202.350723
4,-118.08535,-122.419724,-123.111824,-136.947495,-146.886948,-115.970428,-119.665604,-95.974823,-173.791519,-161.563812,...,27.893204,21.192364,28.690672,50.693546,46.582077,37.80481,44.325966,46.667892,24.360353,232.867432
5,-122.991234,-129.26976,-127.502068,-140.06073,-147.217239,-119.519676,-125.444725,-83.230721,-157.848785,-173.960236,...,25.378784,19.208479,26.697153,49.081486,44.128517,34.447887,41.780956,37.503708,18.73674,231.296234
6,-131.246597,-122.5532,-121.024315,-133.524323,-151.301346,-109.660477,-117.053528,-82.1026,-136.837219,-135.393448,...,44.632759,35.531155,41.475536,62.20237,63.239834,46.53693,55.260597,67.307465,37.041935,230.972458
7,-130.718277,-129.105652,-121.625114,-133.819641,-170.89064,-121.209579,-123.381859,-123.984001,-111.2463,-102.487595,...,16.46781,15.608613,18.036119,26.260429,17.209486,18.466576,25.113413,16.127319,10.779423,224.56749
8,-106.810722,-111.240723,-115.090225,-131.892761,-131.827576,-105.025215,-113.079025,-156.329575,-119.718971,-137.05365,...,15.854019,19.242168,19.947412,22.938761,21.415215,21.181063,20.312042,14.163277,11.219995,183.879532
9,-7.635725,-4.606551,13.930301,-15.2945,-5.6269,-5.801,10.403025,5.917075,-13.0811,-7.60825,...,20.644638,14.120595,17.766943,16.522057,19.860462,10.717187,4.156302,12.357162,18.290411,20.765163


In [353]:
def spg_features(spg_df, w=SPG_N_WINDOWS_ONE_SIDE):
    features = []
    # divide the 600s window into 10s windows from the centre, and 2 5s windows on either side, the centre one (295s to 305s) has index (295-1)/2 = 147 to 152 (excl.)
    for i in range(-w, w + 1):
        df = spg_df.iloc[(147 + 5 * i):(152 + 5 * i)].mean(axis=0)
        df.index = [f'{label}_mean_{i}' for label in df.index]
        features.append(df) 

        df = spg_df.iloc[(147 + 5 * i):(152 + 5 * i)].std(axis=0)
        df.index = [f'{label}_std_{i}' for label in df.index]
        features.append(df) 
    return pd.concat(features, axis=0)

# spg_window(train_csv.iloc[0]).std(axis=0).mean()

# spg_features(spg_window(train_csv.iloc[0]))

train_csv.iloc[256:259].apply(lambda row: spg_features(spg_window(row)), axis=1)

Unnamed: 0,LL_0.59_mean_-7,LL_0.78_mean_-7,LL_0.98_mean_-7,LL_1.17_mean_-7,LL_1.37_mean_-7,LL_1.56_mean_-7,LL_1.76_mean_-7,LL_1.95_mean_-7,LL_2.15_mean_-7,LL_2.34_mean_-7,...,RP_18.16_std_7,RP_18.36_std_7,RP_18.55_std_7,RP_18.75_std_7,RP_18.95_std_7,RP_19.14_std_7,RP_19.34_std_7,RP_19.53_std_7,RP_19.73_std_7,RP_19.92_std_7
256,4.76,6.114,6.632,5.67,4.866,3.75,3.842,3.942,3.656,3.016,...,,,,,,,,,,
257,3.682,4.374,4.956,4.738,3.742,3.388,3.68,3.168,3.066,2.598,...,,,,,,,,,,
258,7.812,8.058001,8.218,8.936,6.814,6.172,8.084,6.966001,4.956,5.046,...,,,,,,,,,,


In [354]:
def actual_median(s):
    return s.iloc[(s - s.median()).abs().argsort().iloc[0]]

actual_median(pd.Series([1])), actual_median(pd.Series([1, 2])), actual_median(pd.Series([1, 2, 3])), actual_median(pd.Series([1, 2, 3, 4]))

(1, 1, 2, 2)

In [355]:
# remove duplicate eeg_ids (keeping the median one only) if needed

df = train_csv.copy()

if REMOVE_DUPLICATE_EEG_IDS:
    df = df.groupby('eeg_id')[['eeg_label_offset_seconds']].agg(actual_median)
    df = pd.merge(df, train_csv, on=['eeg_id', 'eeg_label_offset_seconds'], how='left')

df = df[:MAX_ROWS]

df

Unnamed: 0,eeg_id,eeg_label_offset_seconds,eeg_sub_id,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,6.0,1,789577333,1,6.0,3640441665,20654,Other,0,0,3,0,2,7
1,582999,18.0,5,1552638400,5,18.0,1179854295,20230,LPD,0,12,0,1,0,1
2,642382,0.0,0,14960202,12,1008.0,3254468733,5955,Other,0,0,0,0,0,1
3,751790,0.0,0,618728447,4,908.0,2898467035,38549,GPD,0,0,1,0,0,0
4,778705,0.0,0,52296320,0,0.0,3255875127,40955,Other,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17084,4293354003,0.0,0,1188113564,0,0.0,447244163,16610,GRDA,0,0,0,0,1,1
17085,4293843368,0.0,0,1549502620,0,0.0,1618953053,15065,GRDA,0,0,0,0,1,1
17086,4294455489,0.0,0,2105480289,0,0.0,469526364,56,Other,0,0,0,0,0,1
17087,4294858825,6.0,2,657299228,2,6.0,3251917981,4312,Other,0,0,0,0,1,14


In [356]:
features_eeg = df.apply(lambda row: eeg_features(eeg_window(row)), axis=1)
features_eeg

Unnamed: 0,Fp1_mean_-2,F3_mean_-2,C3_mean_-2,P3_mean_-2,F7_mean_-2,T3_mean_-2,T5_mean_-2,O1_mean_-2,Fz_mean_-2,Cz_mean_-2,...,Pz_std_2,Fp2_std_2,F4_std_2,C4_std_2,P4_std_2,F8_std_2,T4_std_2,T6_std_2,O2_std_2,EKG_std_2
0,-73.854027,-1.728700,144.434036,-25.413301,-18.031723,-28.765825,-10.894226,-18.076099,-5.437474,-36.723221,...,74.884781,91.934479,53.087006,77.816811,110.290321,99.482513,95.636093,119.104546,116.240273,9418.666992
1,-3.493696,-29.176023,-7.040400,26.835125,24.191700,-4.013700,-13.547976,-18.697199,-55.870079,-22.811600,...,11.780252,17.654436,28.109316,18.843225,12.703863,16.620022,13.148725,11.891325,9.517156,2.238693
2,-13.543775,2.372251,-47.358677,-31.328402,-60.792355,-56.496449,-36.451473,-44.314476,15.214651,-15.368151,...,10.139777,10.077752,8.104139,156.827667,13.439871,14.483626,16.492222,16.787252,15.088309,33.315449
3,-32.065952,-13.970822,-12.332577,10.543626,-10.057575,-9.895999,4.735624,24.470274,-23.366899,-21.487822,...,378.045471,406.943665,392.753601,392.372437,376.078461,398.408417,388.072968,390.694519,387.138733,148.977493
4,7.742300,-7.633700,-31.650200,-30.811451,4.684925,-0.308875,-4.260775,-10.933350,-42.531898,-18.318550,...,17.878086,21.493322,14.082626,14.893942,307.694702,22.590963,15.570377,14.989574,23.382496,31.030487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17084,-13.770024,-8.796950,-33.893478,-31.631075,-2.526400,-4.985125,-25.107176,-27.427576,-20.561001,-5.620326,...,32.227070,31.204332,24.187428,31.038691,32.634686,24.459702,18.553791,21.848574,19.983114,4.129588
17085,-9.054299,-35.540752,-22.332726,-24.079023,-3.059224,-31.881399,-59.180321,-29.287451,8.935724,-19.296925,...,7.247148,35.588142,29.628181,17.599436,7.385067,43.452301,35.447056,18.858658,12.773644,14.933559
17086,173.419708,63.280849,85.413780,117.927849,54.683502,103.066139,50.002777,60.319725,21.543425,55.623348,...,0.449947,0.449947,0.449947,0.449947,0.449947,0.449947,0.449947,0.449947,0.449947,0.000000
17087,-396.274231,-385.460785,-334.679291,-318.182861,-383.265381,-332.675354,-356.971832,-342.311768,-357.940308,-368.694702,...,12.347421,18.602482,18.045822,12.980250,14.925300,16.864647,15.018119,16.858309,13.179996,6.044393


In [357]:
features_spg = df.apply(lambda row: spg_features(spg_window(row)), axis=1)
features_spg

Unnamed: 0,LL_0.59_mean_-7,LL_0.78_mean_-7,LL_0.98_mean_-7,LL_1.17_mean_-7,LL_1.37_mean_-7,LL_1.56_mean_-7,LL_1.76_mean_-7,LL_1.95_mean_-7,LL_2.15_mean_-7,LL_2.34_mean_-7,...,RP_18.16_std_7,RP_18.36_std_7,RP_18.55_std_7,RP_18.75_std_7,RP_18.95_std_7,RP_19.14_std_7,RP_19.34_std_7,RP_19.53_std_7,RP_19.73_std_7,RP_19.92_std_7
0,154.832001,225.154007,249.571991,237.652008,219.024002,165.465988,136.715988,117.981995,97.919998,84.723999,...,3.836212,3.726657,4.060532,3.676740,3.378073,3.085989,2.414897,2.023260,1.461848,1.388110
1,24.684000,25.831997,24.944002,19.796000,10.514000,7.092000,5.414001,4.514000,3.692000,3.210000,...,0.020736,0.017889,0.008367,0.011402,0.008367,0.014832,0.013038,0.014142,0.019494,0.017889
2,12.550000,13.966001,13.474001,8.948000,7.248000,5.604000,3.268000,1.848000,0.888000,0.836000,...,0.053572,0.042190,0.056833,0.040373,0.056391,0.066558,0.049497,0.045056,0.048477,0.037815
3,25.257999,46.706001,49.849998,52.349998,58.990002,51.954002,52.465996,56.773998,45.588001,38.599998,...,0.059414,0.058052,0.040620,0.024083,0.025495,0.017889,0.021213,0.044497,0.053385,0.040249
4,13.245999,17.567999,24.161999,25.290001,26.382000,23.378000,20.143999,15.216001,12.432000,13.340001,...,0.114018,0.079183,0.068920,0.059749,0.046043,0.052726,0.062048,0.055408,0.098894,0.129923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17084,54.689995,56.772003,52.354004,28.667999,17.448000,10.040000,9.728000,7.950000,5.200000,4.432000,...,0.005477,0.004472,0.004472,0.004472,0.000000,0.000000,0.005477,0.007071,0.008367,0.005477
17085,13.797999,18.594000,21.806000,19.484001,17.747999,11.898000,10.408000,9.226000,8.286000,6.482000,...,0.080436,0.122801,0.077910,0.098387,0.196647,0.208758,0.255969,0.156461,0.100150,0.271514
17086,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
17087,,,,,,,,,,,...,0.052440,0.101833,0.149499,0.135647,0.117686,0.115974,0.124780,0.112472,0.099649,0.149265


In [358]:
from sklearn.preprocessing import StandardScaler

data_processed = df.copy()

col_features = list(features_eeg.columns) + list(features_spg.columns)
col_targets = list(df.columns[-6:])

y = data_processed[col_targets]
y = y.div(y.sum(axis=1), axis=0)

data_processed[col_targets] = y

data_processed = pd.concat([data_processed, features_eeg, features_spg], axis=1)
data_processed = data_processed.dropna()
data_processed = data_processed.reset_index()

data_processed['sample_weight'] = 1.0 / data_processed.groupby('eeg_id')['eeg_sub_id'].transform('count')

# std_scaler = StandardScaler()
# data_processed[col_features] = std_scaler.fit_transform(data_processed[col_features])

data_processed

Unnamed: 0,index,eeg_id,eeg_label_offset_seconds,eeg_sub_id,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,...,RP_18.36_std_7,RP_18.55_std_7,RP_18.75_std_7,RP_18.95_std_7,RP_19.14_std_7,RP_19.34_std_7,RP_19.53_std_7,RP_19.73_std_7,RP_19.92_std_7,sample_weight
0,0,568657,6.0,1,789577333,1,6.0,3640441665,20654,Other,...,3.726657,4.060532,3.676740,3.378073,3.085989,2.414897,2.023260,1.461848,1.388110,1.0
1,1,582999,18.0,5,1552638400,5,18.0,1179854295,20230,LPD,...,0.017889,0.008367,0.011402,0.008367,0.014832,0.013038,0.014142,0.019494,0.017889,1.0
2,2,642382,0.0,0,14960202,12,1008.0,3254468733,5955,Other,...,0.042190,0.056833,0.040373,0.056391,0.066558,0.049497,0.045056,0.048477,0.037815,1.0
3,3,751790,0.0,0,618728447,4,908.0,2898467035,38549,GPD,...,0.058052,0.040620,0.024083,0.025495,0.017889,0.021213,0.044497,0.053385,0.040249,1.0
4,4,778705,0.0,0,52296320,0,0.0,3255875127,40955,Other,...,0.079183,0.068920,0.059749,0.046043,0.052726,0.062048,0.055408,0.098894,0.129923,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16675,17082,4293144208,0.0,0,1224582295,1,234.0,3884397953,64445,Other,...,0.756274,0.701762,1.055192,0.947323,1.913000,1.717111,1.255946,1.611946,0.992033,1.0
16676,17083,4293306306,0.0,0,819682076,3,168.0,1974235411,37409,GPD,...,0.018166,0.036469,0.024495,0.016733,0.020494,0.010954,0.016432,0.016432,0.013038,1.0
16677,17084,4293354003,0.0,0,1188113564,0,0.0,447244163,16610,GRDA,...,0.004472,0.004472,0.004472,0.000000,0.000000,0.005477,0.007071,0.008367,0.005477,1.0
16678,17085,4293843368,0.0,0,1549502620,0,0.0,1618953053,15065,GRDA,...,0.122801,0.077910,0.098387,0.196647,0.208758,0.255969,0.156461,0.100150,0.271514,1.0


In [359]:
from kaggle_kl_div import score
import catboost as cat
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GroupKFold

class_ids = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}

group_k_fold = GroupKFold(n_splits=5)

all_oof = []
all_true = []

for i, (train_ids, valid_ids) in enumerate(group_k_fold.split(data_processed, None, data_processed.patient_id)):
    model = CatBoostClassifier(task_type='GPU', loss_function='MultiClass')
    
    train_pool = Pool(
        data=data_processed.loc[train_ids, col_features],
        label=data_processed.loc[train_ids, 'expert_consensus'].map(class_ids),
        weight=data_processed.loc[train_ids, 'sample_weight']
    )

    valid_pool = Pool(
        data=data_processed.loc[valid_ids, col_features],
        label=data_processed.loc[valid_ids, 'expert_consensus'].map(class_ids),
        weight=data_processed.loc[valid_ids, 'sample_weight']
    )

    model.fit(train_pool, verbose=100, eval_set=valid_pool)
    model.save_model(f'model_f{i}.cat')

    oof = model.predict_proba(valid_pool)
    all_oof.append(oof)
    all_true.append(data_processed.loc[valid_ids, col_targets])


all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

df_oof = pd.DataFrame(all_oof.copy())
df_oof['id'] = np.arange(len(df_oof))

df_true = pd.DataFrame(all_true.copy())
df_true['id'] = np.arange(len(df_true))

score(solution=df_true, submission=df_oof, row_id_column_name='id')

Learning rate set to 0.136611
0:	learn: 1.6541843	test: 1.6726404	best: 1.6726404 (0)	total: 136ms	remaining: 2m 15s
100:	learn: 0.7691358	test: 1.1901760	best: 1.1901760 (100)	total: 12.4s	remaining: 1m 49s
200:	learn: 0.5850732	test: 1.1654504	best: 1.1654504 (200)	total: 24.6s	remaining: 1m 37s
300:	learn: 0.4637946	test: 1.1621104	best: 1.1600162 (268)	total: 36.6s	remaining: 1m 24s
400:	learn: 0.3797158	test: 1.1558011	best: 1.1553808 (398)	total: 48.5s	remaining: 1m 12s
500:	learn: 0.3181997	test: 1.1565784	best: 1.1542126 (455)	total: 1m	remaining: 1m
600:	learn: 0.2695362	test: 1.1661854	best: 1.1542126 (455)	total: 1m 12s	remaining: 47.9s
700:	learn: 0.2283554	test: 1.1610065	best: 1.1542126 (455)	total: 1m 23s	remaining: 35.8s
800:	learn: 0.1972842	test: 1.1687932	best: 1.1542126 (455)	total: 1m 35s	remaining: 23.7s
900:	learn: 0.1691160	test: 1.1737544	best: 1.1542126 (455)	total: 1m 47s	remaining: 11.8s
999:	learn: 0.1478131	test: 1.1833621	best: 1.1542126 (455)	total: 1m 5

0.8777164787692541

In [360]:
from kaggle_kl_div import score

df_equal = pd.DataFrame(np.ones(df_oof.shape) / 6)
df_equal['id'] = np.arange(len(df_equal))

df_true = pd.DataFrame(all_true.copy())
df_true['id'] = np.arange(len(df_true))

score(solution=df_true, submission=df_equal, row_id_column_name='id')

1.4741674289276896

In [361]:
test_csv = pd.read_csv(f'{DATA_DIR}test.csv')
test_csv

Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,853520,3911565283,6885


In [362]:
eeg_window(test_csv.iloc[0], train=False)

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
0,9.210000,-47.459999,15.100000,8.220000,-16.900000,-22.99,-25.820000,-10.090000,28.370001,-3.010000,-27.299999,101.040001,35.110001,14.540000,18.330000,28.540001,44.090000,69.650002,30.74,171.679993
1,-3.590000,-30.290001,32.380001,10.800000,-68.980003,-21.60,-15.080000,-9.210000,26.360001,-8.980000,-32.279999,95.800003,26.389999,4.820000,10.540000,20.559999,32.060001,59.439999,23.32,178.279999
2,-26.040001,-60.070000,2.370000,-10.150000,-34.689999,-31.40,-31.920000,-26.980000,-1.940000,-28.770000,-49.770000,73.449997,-3.680000,-17.320000,-16.150000,-8.270000,5.330000,45.180000,9.49,306.739990
3,-3.040000,-36.250000,29.559999,14.530000,-14.010000,-11.90,-14.230000,-6.310000,26.040001,-2.770000,-25.030001,91.010002,22.610001,6.900000,9.930000,15.480000,33.580002,69.620003,31.01,223.259995
4,-4.630000,-20.160000,25.190001,1.190000,-44.580002,-23.51,-30.709999,-17.600000,25.420000,-8.860000,-33.959999,89.449997,19.440001,-2.080000,6.110000,8.380000,24.180000,55.869999,19.91,170.759995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-26.889999,-45.480000,-17.250000,-23.570000,19.059999,-9.40,-27.120001,-21.580000,-75.760002,-65.800003,-88.790001,-30.090000,-49.830002,-75.339996,-61.139999,-71.889999,-53.299999,-8.130000,-12.38,-34.799999
9996,-24.049999,-41.689999,-13.450000,-26.219999,14.210000,0.02,-30.030001,-22.219999,-75.440002,-68.639999,-91.099998,-33.180000,-45.610001,-78.809998,-61.259998,-71.889999,-55.009998,-12.320000,-15.15,-27.799999
9997,-34.500000,-55.340000,-25.959999,-30.670000,8.890000,-9.74,-38.520000,-30.330000,-87.080002,-70.690002,-92.320000,-37.349998,-57.290001,-80.209999,-67.320000,-72.919998,-57.110001,-12.330000,-15.20,21.980000
9998,-16.110001,-35.980000,-8.570000,-12.020000,28.580000,5.45,-20.510000,-10.300000,-65.459999,-50.730000,-71.650002,-15.970000,-36.380001,-59.660000,-46.310001,-51.520000,-39.740002,6.770000,3.74,-5.800000


In [363]:
spg_window(test_csv.iloc[0], train=False)

Unnamed: 0,time,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,...,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
0,1,14.910000,17.110001,11.660000,11.73,6.08,4.54,4.31,3.38,2.05,...,0.07,0.06,0.05,0.06,0.05,0.05,0.06,0.05,0.04,0.05
1,3,11.130000,10.950000,10.770000,5.07,4.03,3.24,3.61,2.98,1.54,...,0.05,0.04,0.04,0.04,0.04,0.04,0.03,0.03,0.03,0.02
2,5,10.880000,10.570000,8.790000,5.33,2.44,1.48,1.83,0.99,0.89,...,0.04,0.04,0.04,0.03,0.03,0.04,0.04,0.05,0.06,0.06
3,7,19.450001,18.200001,17.719999,13.38,4.17,1.88,1.84,1.22,1.27,...,0.03,0.03,0.05,0.08,0.07,0.07,0.08,0.03,0.03,0.03
4,9,21.650000,22.530001,23.160000,17.00,7.19,3.89,3.65,2.72,2.35,...,0.04,0.04,0.05,0.05,0.06,0.05,0.05,0.05,0.04,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,591,15.580000,18.209999,14.020000,15.96,4.36,4.98,2.68,2.22,2.03,...,0.48,0.59,0.59,0.73,0.44,0.41,0.56,0.60,0.61,0.60
296,593,17.209999,20.219999,20.889999,17.16,9.15,4.14,2.49,2.71,1.60,...,0.26,0.37,0.41,0.36,0.48,0.36,0.39,0.46,0.34,0.32
297,595,9.610000,13.320000,9.190000,11.50,8.11,5.53,5.57,3.69,3.19,...,0.58,0.37,0.17,0.14,0.13,0.30,0.36,0.39,0.56,0.29
298,597,8.430000,11.840000,13.640000,10.56,8.63,5.80,2.98,1.48,0.96,...,0.54,0.22,0.17,0.16,0.11,0.38,0.45,0.45,0.45,0.34


In [364]:
features_eeg = test_csv.apply(lambda row: eeg_features(eeg_window(row, train=False)), axis=1)
features_spg = test_csv.apply(lambda row: spg_features(spg_window(row, train=False)), axis=1)

preds = []
for i in range(5):
    model = CatBoostClassifier(task_type='GPU')
    model.load_model(f'model_f{i}.cat')

    test_pool = Pool(
        data = pd.concat([features_eeg, features_spg], axis=1)
    )

    pred = model.predict_proba(test_pool)
    preds.append(pred)


pred = np.mean(preds, axis=0)
pred.round(3)

array([[0.068, 0.032, 0.001, 0.088, 0.129, 0.682]])

In [365]:
submission_csv = pd.DataFrame({'eeg_id': test_csv.eeg_id.values})
submission_csv[col_targets] = pred
submission_csv

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.067663,0.031732,0.001196,0.088319,0.128725,0.682365


In [366]:
submission_csv.to_csv('submission.csv', index=False)

In [367]:
pd.read_csv('submission.csv')

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.067663,0.031732,0.001196,0.088319,0.128725,0.682365
