In [1]:
import pandas as pd
import numpy as np

In [2]:
train_csv = pd.read_csv('data/train.csv')

In [3]:
def read_parquet_cache(path):
    cache = {}

    def read_parquet(id_):
        if id_ not in cache:
            cache[id_] = pd.read_parquet(f'{path}{id_}.parquet')
        return cache[id_]

    return read_parquet

read_eeg = read_parquet_cache(path='data/train_eegs/')
read_spg = read_parquet_cache(path='data/train_spectrograms/')

In [4]:
def eeg_window(row):
    eeg_data = read_eeg(row.eeg_id)
    eeg_offset = int(row.eeg_label_offset_seconds)
    eeg = eeg_data.iloc[(200 * eeg_offset):(200 * (eeg_offset + 50))]
    return eeg

eeg_window(train_csv.iloc[0])


Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
0,-80.519997,-70.540001,-80.110001,-108.750000,-120.330002,-88.620003,-101.750000,-104.489998,-99.129997,-90.389999,-97.040001,-77.989998,-88.830002,-112.120003,-108.110001,-95.949997,-98.360001,-121.730003,-106.449997,7.920000
1,-80.449997,-70.330002,-81.760002,-107.669998,-120.769997,-90.820000,-104.260002,-99.730003,-99.070000,-92.290001,-96.019997,-84.500000,-84.989998,-115.610001,-103.860001,-97.470001,-89.290001,-115.500000,-102.059998,29.219999
2,-80.209999,-75.870003,-82.050003,-106.010002,-117.500000,-87.489998,-99.589996,-96.820000,-119.680000,-99.360001,-91.110001,-99.440002,-104.589996,-127.529999,-113.349998,-95.870003,-96.019997,-123.879997,-105.790001,45.740002
3,-84.709999,-75.339996,-87.480003,-108.970001,-121.410004,-94.750000,-105.370003,-100.279999,-113.839996,-102.059998,-95.040001,-99.230003,-101.220001,-125.769997,-111.889999,-97.459999,-97.180000,-128.940002,-109.889999,83.870003
4,-90.570000,-80.790001,-93.000000,-113.870003,-129.960007,-102.860001,-118.599998,-101.099998,-107.660004,-102.339996,-98.510002,-95.300003,-88.930000,-115.639999,-99.800003,-97.500000,-88.730003,-114.849998,-100.250000,97.769997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-140.039993,-128.100006,-137.339996,-160.830002,-153.630005,-136.279999,-137.009995,-93.349998,-145.130005,-155.830002,-124.650002,-123.250000,-127.709999,-169.759995,-68.489998,-117.669998,-69.239998,-115.309998,-123.860001,65.010002
9996,-152.169998,-161.449997,-173.210007,-165.320007,-143.570007,-124.150002,-127.339996,-87.309998,-160.919998,-158.360001,-121.870003,-129.550003,-121.470001,-120.339996,-68.029999,-135.130005,-105.190002,-114.330002,-121.029999,47.090000
9997,-149.619995,-147.479996,-171.960007,-152.589996,-137.279999,-105.550003,-122.220001,-80.010002,-156.039993,-155.119995,-116.360001,-118.099998,-113.690002,-102.760002,-67.839996,-120.410004,-109.099998,-116.419998,-119.099998,95.589996
9998,-126.860001,-122.889999,-125.879997,-130.339996,-134.779999,-134.350006,-127.080002,-76.739998,-137.649994,-146.800003,-111.720001,-114.199997,-106.739998,-104.699997,-60.240002,-154.119995,-129.639999,-110.029999,-116.239998,72.980003


In [5]:
def eeg_features(eeg_df, w=1):
    features = []
    # divide the 50s (sampled at 200Hz) window into 5 10s windows, index of the centre (3rd) window = 200*2 = 400 to 600 (excl.)
    for i in range(-w, w + 1):
        df = pd.Series({
            f'eeg_mean_{i}': eeg_df.iloc[(400 + 200 * i):(600 + 200 * i)].mean(axis=0).mean(),
            f'eeg_mean_std_{i}': eeg_df.iloc[(400 + 200 * i):(600 + 200 * i)].std(axis=0).mean(),
            f'eeg_std_mean_{i}': eeg_df.iloc[(400 + 200 * i):(600 + 200 * i)].mean(axis=0).std()
            })
        features.append(df) 
    return pd.concat(features, axis=0)

eeg_window(train_csv.iloc[0]).std(axis=0).mean()

eeg_features(eeg_window(train_csv.iloc[0]))

train_csv.iloc[:10].apply(lambda row: eeg_features(eeg_window(row)), axis=1)

Unnamed: 0,eeg_mean_-1,eeg_mean_std_-1,eeg_std_mean_-1,eeg_mean_0,eeg_mean_std_0,eeg_std_mean_0,eeg_mean_1,eeg_mean_std_1,eeg_std_mean_1
0,-111.673264,24.752644,30.932142,-110.901047,26.921082,32.833191,-113.018753,28.422199,32.130127
1,-111.179222,26.531916,35.970078,-111.270424,26.446014,36.496841,-115.460556,26.137089,34.409584
2,-115.460556,26.137089,34.409584,-111.240891,27.798437,40.685734,-116.828354,29.770533,36.523613
3,-121.981613,26.758234,34.817699,-115.51339,32.930592,32.930702,-102.599953,34.888706,33.390369
4,-118.106567,29.09519,35.155155,-120.9655,30.123852,34.405281,-121.977867,27.075817,35.265491
5,-121.977867,27.075817,35.265491,-122.25132,31.300879,37.335011,-127.809402,32.655102,41.595894
6,-126.687355,28.048321,61.280548,-133.956451,32.766975,69.830673,-143.119812,32.014721,81.598465
7,-119.745033,39.799778,84.57457,-156.01004,31.789234,94.364983,-147.543518,30.12258,91.485283
8,-135.717651,32.044655,57.741669,-129.270309,32.429352,43.455433,-121.483292,36.11961,39.232327
9,-2.485212,17.081455,16.419262,-10.463892,16.607861,15.901236,-0.305507,13.909231,15.75701


In [6]:
def spg_window(row):
    spg_data = read_spg(row.spectrogram_id)
    spg_offset = int(row.spectrogram_label_offset_seconds)
    spg = spg_data.loc[(spg_data.time >= spg_offset) & (spg_data.time < spg_offset + 600)]
    return spg

spg_window(train_csv.iloc[0]).iloc[142:147]

Unnamed: 0,time,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,...,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
142,285,4.86,5.32,6.45,6.56,5.19,5.16,9.11,14.18,10.6,...,0.22,0.16,0.14,0.14,0.12,0.1,0.15,0.19,0.27,0.46
143,287,7.61,10.67,14.49,13.66,11.09,9.34,13.84,13.98,14.08,...,0.13,0.19,0.2,0.18,0.21,0.18,0.15,0.15,0.29,0.44
144,289,5.66,9.55,16.73,16.299999,13.72,19.07,11.3,13.48,15.11,...,0.23,0.24,0.19,0.17,0.17,0.16,0.23,0.27,0.27,0.23
145,291,4.16,3.9,5.5,7.18,10.09,16.67,16.450001,19.700001,20.91,...,0.14,0.14,0.14,0.14,0.16,0.25,0.28,0.28,0.28,0.19
146,293,3.39,4.59,5.64,7.58,9.07,13.0,22.280001,30.65,29.84,...,0.21,0.17,0.15,0.08,0.11,0.11,0.11,0.12,0.08,0.08


In [7]:
def spg_features(spg_df, w=1):
    features = []
    # divide the 600s window into 10s windows from the centre, and 2 5s windows on either side, the centre one (295s to 305s) has index (295-1)/2 = 147 to 152 (excl.)
    for i in range(-w, w + 1):
        df = pd.Series({
            f'spg_mean_{i}': spg_df.iloc[(147 + 5 * i):(152 + 5 * i)].mean(axis=0).mean(),
            f'spg_mean_std_{i}': spg_df.iloc[(147 + 5 * i):(152 + 5 * i)].std(axis=0).mean(),
            f'spg_std_mean_{i}': spg_df.iloc[(147 + 5 * i):(152 + 5 * i)].mean(axis=0).std()
            })
        features.append(df) 
    return pd.concat(features, axis=0)

spg_window(train_csv.iloc[0]).std(axis=0).mean()

spg_features(spg_window(train_csv.iloc[0]))

train_csv.iloc[:10].apply(lambda row: spg_features(spg_window(row)), axis=1)

Unnamed: 0,spg_mean_-1,spg_mean_std_-1,spg_std_mean_-1,spg_mean_0,spg_mean_std_0,spg_std_mean_0,spg_mean_1,spg_mean_std_1,spg_std_mean_1
0,3.128883,0.888897,14.769795,3.483766,1.047539,15.347544,4.242703,1.42516,15.808733
1,3.181736,0.837062,15.160834,3.989815,1.250328,15.60737,4.402469,1.297377,16.061071
2,3.32401,0.907132,15.294543,4.11618,1.403306,15.694504,4.482364,1.403214,16.192349
3,4.11618,1.403306,15.694504,4.482364,1.403214,16.192349,5.700943,1.935652,17.074218
4,4.32415,1.264081,15.958554,5.053601,1.547541,16.559143,6.318838,1.876163,18.278206
5,4.402469,1.297377,16.061071,5.382195,1.738183,16.814866,5.952409,2.005453,18.200139
6,4.547252,1.456065,16.282562,6.103187,2.174612,17.411815,4.666863,2.127757,17.528306
7,5.382195,1.738183,16.814866,5.952409,2.005453,18.200139,3.011766,0.726677,17.323988
8,6.103187,2.174612,17.411815,4.666863,2.127757,17.528306,2.86393,0.61583,17.558466
9,1.890913,0.697422,14.535472,2.233207,0.738296,15.163721,1.714623,0.367071,15.485652


In [8]:
df = train_csv.copy()
TARGETS = df.columns[-6:]
TARGETS

Index(['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote',
       'other_vote'],
      dtype='object')

In [9]:
features_eeg = train_csv.apply(lambda row: eeg_features(eeg_window(row)), axis=1)

In [12]:
features_spg = train_csv.apply(lambda row: spg_features(spg_window(row)), axis=1)

In [13]:
features_spg

Unnamed: 0,spg_mean_-1,spg_mean_std_-1,spg_std_mean_-1,spg_mean_0,spg_mean_std_0,spg_std_mean_0,spg_mean_1,spg_mean_std_1,spg_std_mean_1
0,3.128883,0.888897,14.769795,3.483766,1.047539,15.347544,4.242703,1.425160,15.808733
1,3.181736,0.837062,15.160834,3.989815,1.250328,15.607370,4.402469,1.297377,16.061071
2,3.324010,0.907132,15.294543,4.116180,1.403306,15.694504,4.482364,1.403214,16.192349
3,4.116180,1.403306,15.694504,4.482364,1.403214,16.192349,5.700943,1.935652,17.074218
4,4.324150,1.264081,15.958554,5.053601,1.547541,16.559143,6.318838,1.876163,18.278206
...,...,...,...,...,...,...,...,...,...
106795,5.197501,1.638036,19.424642,3.471840,1.674629,17.026009,4.271746,1.255300,19.574242
106796,4.363421,1.863198,18.050685,3.776319,1.735778,17.662034,4.250095,1.304274,19.167526
106797,3.959481,1.890669,17.547557,4.012214,1.614320,18.290083,4.504234,1.301983,19.534570
106798,3.899496,1.917008,17.433734,4.012579,1.207262,18.716346,4.531237,1.359104,19.463298


In [14]:
features_eeg

Unnamed: 0,eeg_mean_-1,eeg_mean_std_-1,eeg_std_mean_-1,eeg_mean_0,eeg_mean_std_0,eeg_std_mean_0,eeg_mean_1,eeg_mean_std_1,eeg_std_mean_1
0,-111.673264,24.752644,30.932142,-110.901047,26.921082,32.833191,-113.018753,28.422199,32.130127
1,-111.179222,26.531916,35.970078,-111.270424,26.446014,36.496841,-115.460556,26.137089,34.409584
2,-115.460556,26.137089,34.409584,-111.240891,27.798437,40.685734,-116.828354,29.770533,36.523613
3,-121.981613,26.758234,34.817699,-115.513390,32.930592,32.930702,-102.599953,34.888706,33.390369
4,-118.106567,29.095190,35.155155,-120.965500,30.123852,34.405281,-121.977867,27.075817,35.265491
...,...,...,...,...,...,...,...,...,...
106795,-17.185930,28.779255,13.632392,-11.539765,28.420013,15.939910,-24.230206,22.281134,19.149290
106796,-24.230206,22.281134,19.149290,-25.722342,30.708096,37.985241,-18.629768,26.952051,44.562115
106797,-18.629768,26.952051,44.562115,-16.443876,23.052977,52.295197,-16.423147,17.287750,27.157236
106798,-16.423147,17.287750,27.157236,-9.044989,22.287127,21.402754,-7.984838,30.258774,15.032234
