Changes from 2.1

- even simpler architecture: just regular classification, without use of sequences

Results
- training stagnates at a loss ~ 0.40
- results are good on the training data, but do not generalize to the test data or to another subject
- TODO add dropout
- TODO observed that volatility of HandStart=1 is much lower than HandStart=0 ... use that?

## check gpu usage

In [None]:
# Print the compute devices TensorFlow reports, to check whether a GPU is listed.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

## some parameters

In [None]:
# Notebook-wide settings, read as module-level names by the cells below.
# n_train: number of points for training, as opposed to testing
# lahead: stride data with "lahead" window size
#         NOTE(review): no visible cell in this version reads lahead - looks like a leftover, confirm
# batch_size: keras.model.fit parameter .. smaller batches lead to less loss of data when truncating non-multiples of batch_size
# downsample_pts: 1 for no downsampling, 10 for downsample by 10
#---------------------------------------------------------
# set 1
# n_train, lahead, batch_size, downsample_pts = 120000, 10, 2**14, 10
n_train, lahead, batch_size, downsample_pts = 120000, 10, 32, 10

# set 2
# n_train, lahead, batch_size, downsample_pts = 1200000, 100, (2**10)*(2**8), 1 # batch_size = 1024
# n_train, lahead, batch_size, downsample_pts = 1200000, 100, 2**8, 1 # batch_size = 256

# note that lahead=150 exactly matches the length of a HandStart=1 event in the non-downsampled data
# note the smaller batch_size: non-overlapping windows yield fewer HandStart=1 samples
# n_train, lahead, batch_size, downsample_pts = 1200000, 150, 2**5, 1 # batch_size = 32

# set 3:
# training each subject / series separately
# Requires smaller batch_size since each series is only around 1000 pts when downsampled by 10
# n_train, lahead, batch_size, downsample_pts = 120000, 10, 2**4, 10

# print
n_train, lahead, batch_size, downsample_pts

## import libraries

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import time

# https://keras.io/layers/recurrent/#lstm
from keras.models import Sequential, Model
from keras.layers import (Dense, LSTM, Lambda, Dropout, Embedding, Flatten,
                         Subtract, Dot, Activation,
                         Input, RepeatVector, TimeDistributed, Concatenate,
                         Conv1D, MaxPooling1D, AveragePooling1D
                         )

import numpy as np

from sklearn.preprocessing import MinMaxScaler

## load data

In [None]:
def my_load(subj_ids:list, series_ids:list):
    """
    Read the feature and target CSV files for every (subject, series) pair.

    For each pair, 'data/raw/train/subj<S>_series<N>_data.csv' (features) and
    'data/raw/train/subj<S>_series<N>_events.csv' (targets) are read. Each
    frame is tagged with 'subj_id' and 'series_id', indexed by
    ('subj_id', 'series_id', 'id'), cast to int16 and thinned to every
    downsample_pts-th row (module-level setting).

    subj_ids: subject numbers to load
    series_ids: series numbers to load for each subject

    Returns (features_all, targets_all): the per-file frames stacked row-wise.
    """
    path_templates = (
        ('features', 'data/raw/train/subj%i_series%i_data.csv'),
        ('targets', 'data/raw/train/subj%i_series%i_events.csv'),
    )
    collected = {'features': [], 'targets': []}

    for subj in subj_ids:
        for series in series_ids:
            # resolve both file names for this pair before reading anything
            sources = [(kind, template % (subj, series)) for kind, template in path_templates]
            for kind, path in sources:
                print('status', subj, series, kind)
                frame = pd.read_csv(path)
                frame['subj_id'] = subj
                frame['series_id'] = series
                frame = frame.set_index(['subj_id', 'series_id', 'id']).astype('int16')
                # downsample: keep every downsample_pts-th row
                collected[kind].append(frame[::downsample_pts])

    features_all = pd.concat(collected['features'], axis=0)
    targets_all = pd.concat(collected['targets'], axis=0)
    return features_all, targets_all

In [None]:
# Load series 1..8 of subject 1, then show the resulting shapes.
train_features, train_targets = my_load(subj_ids=[1], series_ids=list(range(1, 9)))
train_features.shape, train_targets.shape

In [None]:
train_features.head(n=2)

In [None]:
train_targets.head(n=2)

## split out training

In [None]:
# Training split: the first n_train rows of the loaded frames.
# Copies are taken because columns are added to y_train further down.
x_train = train_features.head(n=n_train).copy()
y_train = train_targets.head(n=n_train).copy()
print('x_train, y_train', x_train.shape, y_train.shape)

In [None]:
# Class imbalance of the target: number of HandStart=0 rows per HandStart=1 row
# (integer division), used later as the repeat count for the HandStart=1 rows.
class_counts = y_train.groupby('HandStart').size()
print(class_counts)
ratio_0_1 = class_counts.loc[0] // class_counts.loc[1]
ratio_0_1

## Identify each HandStart=1

In [None]:
# Number the events of every target column: rows inside the n-th run of 1s get
# the value n in a new '<column>_id' column, all other rows get 0.
# Assumes the target columns hold 0/1 values - TODO confirm.
y_cols = y_train.columns  # taken before the '_id' columns are added below
for k in y_cols: # e.g. 'HandStart'
    y_temp1 = y_train[k].diff().fillna(value=0)  # +1 where a run of 1s starts, -1 on the row after it ends
    y_temp2 = y_temp1.copy()
    y_temp2[y_temp2 < 0] = 0  # keep only the run starts
    y_temp2 = y_temp2.cumsum()  # running count of runs started so far
    y_train['%s_id'%k] = y_train[k] * y_temp2  # multiplying by the 0/1 target zeroes rows outside a run

# Plot the event ids over the first 10000 rows.
y_train[[x for x in y_train.columns if x.endswith('_id')]].head(n=10000).plot()
plt.show()

In [None]:
# Every HandStart event is expected to span 150 raw samples,
# i.e. 150 // downsample_pts rows after downsampling.
handstart_lengths = y_train[y_train['HandStart_id']>0].groupby('HandStart_id').size()
assert set(handstart_lengths) == {150 // downsample_pts}

In [None]:
set(y_train[y_train['LiftOff_id']>0].groupby('LiftOff_id').size())

In [None]:
y_train.shape

## preprocess features

e.g. scale to [0,1], stride, truncate, etc

In [None]:
def my_truncate(df):
    """
    Trim df to a whole number of batches.

    The leading rows are discarded so that the remaining row count is a
    multiple of the module-level batch_size; the trailing rows are kept.
    A frame with fewer rows than batch_size comes back empty.
    """
    n_rows = df.shape[0]
    n_keep = n_rows - n_rows % batch_size
    return df.tail(n_keep)

def wrap_pd_df(xxx, func):
    """
    Apply func to the frame xxx and re-wrap the result as a DataFrame that
    carries the columns and index of xxx.

    Useful for functions that hand back a bare array and thereby lose the labels.
    """
    transformed = func(xxx)
    return pd.DataFrame(transformed, index=xxx.index, columns=xxx.columns)

def my_repeat(ztrain_roll_1):
    """
    Return a frame in which every row of ztrain_roll_1 appears ratio_0_1 times
    in a row (ratio_0_1 is the module-level repeat count).

    Column labels are kept and the index labels are repeated along with the
    rows. The shapes involved are printed for inspection.
    """
    repeated_values = np.repeat(ztrain_roll_1.values, repeats=ratio_0_1, axis=0)
    columns, index = ztrain_roll_1.columns, ztrain_roll_1.index
    print('z shape', ztrain_roll_1.shape, repeated_values.shape, columns.shape, index.shape)
    return pd.DataFrame(
        repeated_values,
        columns=columns,
        index=index.repeat(ratio_0_1),
    )

In [None]:
def preprocess(x_train, y_train, do_balance):
    """
    Scale the features, optionally balance the HandStart classes, and trim the
    result to a multiple of batch_size.

    x_train: feature frame; 'subj_id' and 'series_id' must be available for
             grouping (they are index levels in the frames built by my_load)
    y_train: target frame, row-aligned with x_train, with a 'HandStart' column
    do_balance: if truthy, the HandStart=1 rows are repeated ratio_0_1 times
                (module-level value) and placed ahead of the HandStart=0 rows

    Returns (features, targets): two frames with the same number of rows.

    Also reads the module-level batch_size (via my_truncate), prints progress
    information, and asserts that rows are left after balancing and truncation.
    """

    #---------------------------
    # scale each (subject, series) group to [0, 1] on its own
    scaler = MinMaxScaler()

    print('min/max start')
    # group_keys=False keeps the original row index on the result. Without it,
    # pandas >= 2.0 prepends the group keys as extra index levels, and the
    # boolean masks built from ytrain_pre below would no longer align.
    xtrain_pre = ( x_train.groupby(['subj_id', 'series_id'], group_keys=False)
                          .apply(lambda xxx: wrap_pd_df(xxx, scaler.fit_transform))
                 )
    ytrain_pre = y_train # same object, not a copy: targets are not scaled

    print('train_pre', xtrain_pre.shape, ytrain_pre.shape)

    #---------------------------
    # balance classes
    if do_balance:

        # split the rows by class so each class can be handled separately
        is_start = ytrain_pre['HandStart']==1
        is_rest = ytrain_pre['HandStart']==0
        xtrain_roll_1, ytrain_roll_1 = (xtrain_pre[is_start], ytrain_pre[is_start])
        xtrain_roll_0, ytrain_roll_0 = (xtrain_pre[is_rest], ytrain_pre[is_rest])

        # repeat the HandStart=1 rows ratio_0_1 times to balance against HandStart=0
        xtrain_roll_1 = my_repeat(xtrain_roll_1)
        ytrain_roll_1 = my_repeat(ytrain_roll_1)

        # concatenate: repeated _1 rows first, then the _0 rows
        xtrain_roll = pd.concat([xtrain_roll_1, xtrain_roll_0])
        ytrain_roll = pd.concat([ytrain_roll_1, ytrain_roll_0])

        assert xtrain_roll.shape[0] > 0
        print('shape after balance', xtrain_roll.shape, ytrain_roll.shape)
    else:
        xtrain_roll = xtrain_pre
        ytrain_roll = ytrain_pre

    #---------------------------
    # drop non-batchsize-multiple, once for all
    to_drop = xtrain_roll.shape[0] % batch_size
    print('drop non-multiple of batch_size', to_drop)
    xtrain_roll = my_truncate(xtrain_roll)
    ytrain_roll = my_truncate(ytrain_roll)
    print('train_roll 2', xtrain_roll.shape, ytrain_roll.shape)

    assert xtrain_roll.shape[0]>0, "lost all data ... batch_size=%s is too high"%batch_size

    return xtrain_roll, ytrain_roll

In [None]:
# Preprocess copies of the training split with class balancing switched on,
# then show the resulting shapes.
xtrain_roll, ytrain_roll = preprocess(x_train.copy(), y_train.copy(), True)
xtrain_roll.shape, ytrain_roll.shape

## verify

In [None]:
x_train.head(n=2)

In [None]:
x_train[['Fp1', 'Fp2']].plot(figsize=(20,3), alpha=0.5)
plt.show()

In [None]:
y_train[['HandStart']].head(n=10000).plot(figsize=(20,3), alpha=0.5)
plt.show()

In [None]:
xtrain_roll[['Fp1', 'Fp2']].plot(figsize=(20,3), alpha=0.5)
# plt.title('subj_id=1, series_id=1')
plt.show()

In [None]:
# Plot below should have a balanced number of points for 1 and 0
# if the 1's are repeated enough

ytrain_roll['HandStart'].plot(figsize=(20,3), alpha=0.5)
# plt.title('subj_id=1, series_id=1')
plt.show()

## shuffle the data

In [None]:
# Shuffle features and targets with one shared random row order so that the
# two frames stay row-aligned. No seed is set in this cell.
new_ind = np.arange(xtrain_roll.shape[0])
np.random.shuffle(new_ind)  # in-place shuffle of the positional indices
xtrain_roll = xtrain_roll.iloc[new_ind]
ytrain_roll = ytrain_roll.iloc[new_ind]
xtrain_roll.shape, ytrain_roll.shape

In [None]:
xtrain_roll[['Fp1', 'Fp2']].plot(figsize=(20,3), alpha=0.5)
# plt.title('subj_id=1, series_id=1')
plt.show()

In [None]:
# Plot below should have a balanced number of points for 1 and 0
# if the 1's are repeated enough

ytrain_roll['HandStart'].plot(figsize=(20,3), alpha=0.5)
# plt.title('subj_id=1, series_id=1')
plt.show()

## fit model: simple classifier

In [None]:
def create_coupled():
    """
    Build the per-sample classifier: Dense(relu) -> Dropout -> Dense(sigmoid).

    Reads the module-level xtrain_roll (number of feature columns) and
    batch_size.

    Returns an uncompiled keras Model whose input is named 'raw_features' and
    whose output is named 'reconstructed_targets'.
    """
    hidden_units = 15
    len_feat = xtrain_roll.shape[-1]
    len_targ = 1
    input_shape = (len_feat, )

    # input: one row of features per sample
    feat_raw = Input(shape=input_shape, name='raw_features')

    # downsample (currently disabled)
    feat_conv = feat_raw
    # feat_conv = AveragePooling1D(pool_size = 10)(feat_conv)

    # features encoder
    feat_enc = Dense(
              hidden_units,
              batch_size=batch_size,
              activation='relu',#'tanh',
              name='intermediate')(feat_conv)

    # Dropout belongs on the hidden activations. Placed after the sigmoid output
    # (as before) it zeroes and rescales the predicted probabilities during
    # training, and it makes the Dropout layer the model's output, so the
    # 'reconstructed_targets' name no longer identifies the output.
    feat_enc = Dropout(0.2)(feat_enc)

    # output: one sigmoid unit per target
    targ_rec = Dense(len_targ, activation='sigmoid', name='reconstructed_targets')(feat_enc)

    # create model
    model_all = Model(inputs = [feat_raw], outputs = [targ_rec])
    return model_all

In [None]:
# NOTE(review): binary_crossentropy is imported but no visible cell references
# the name; the loss is selected by its string name in compile() below.
from keras.losses import binary_crossentropy


# Build the classifier, compile it with binary cross-entropy and the adam
# optimizer, and print the layer summary.
mod2 = create_coupled()
mod2.compile(loss='binary_crossentropy', optimizer='adam')
mod2.summary()

In [None]:
def my_predict(model, np_in, index):
    """
    Predict with model, plot actual vs. predicted for target column 0, and
    return the prediction.

    model: object with a keras-style predict(inputs, batch_size=...) method
    np_in: dict passed to model.predict as-is; its 'raw_targets' entry (a 2-D
           array) is additionally used for the 'actual' curve of the plot
    index: labels for the rows, used as the plot axis and as the 'id' index
           of the returned frame

    Returns a DataFrame indexed by 'id' with a single 'prediction' column.
    Reads the module-level batch_size.
    """
    col = 0  # only the first target column is plotted and returned

    # make prediction
    predicted = model.predict(np_in, batch_size=batch_size)

    # plot target reconstruction
    actual_series = pd.Series(np_in['raw_targets'][:,col], index=index).astype('int16')
    predicted_series = pd.Series(predicted[:,col], index=index)
    comparison = pd.DataFrame({
        'actual': actual_series,
        'pred': predicted_series,
    })
    comparison.plot(figsize=(20,3), alpha=0.5)
    plt.title('target %i'%(col))
    plt.legend()
    plt.show()

    # prepare output
    out = pd.DataFrame({
        'prediction': predicted[:,col].squeeze(),
        'id': index,
    })
    return out.set_index(['id'])

In [None]:
# Train the classifier on the balanced, already shuffled rows and keep the
# returned history for the loss plot.
# NOTE(review): the HandStart=1 rows were duplicated before the shuffle, so
# copies of one row can presumably end up in both the training and the
# validation part - val_loss may look better than it should; confirm.
print(time.ctime(),'fit start')
history = mod2.fit(
         {   'raw_features': xtrain_roll,
         },
         # first target column only (expected to be 'HandStart' - TODO confirm column order)
         {   'reconstructed_targets': ytrain_roll.iloc[:,:1],
         },
         batch_size=batch_size,
         epochs=100,
         # initial_epoch = 17,
         verbose=2,
         #validation_data=None,
         validation_split = 0.3,
         shuffle=False
    )
print(time.ctime(),'fit end')

In [None]:
# Plot the training and validation loss recorded in the fit history.
# To hide the first few (relatively large) values, slice with [5:] as in the
# commented line below.
# plt.plot(history.history['loss'][5:], label='loss')
plt.plot(history.history['loss'], label='loss') # [5:]
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.title('training loss')
plt.show()

In [None]:
# predict on whole series (plots implicitly actual vs predicted)
#n_show = 100
#n_start = 0 # ,  50, 100, 150]:
#ytrain_pred = my_predict(
#    mod2,
#    {   'raw_features': xtrain_roll.iloc[n_start:(n_start+n_show)],
#        'raw_targets':  ytrain_roll.iloc[n_start:(n_start+n_show)] + 1,
#    },
#    ytrain_roll.index[:n_show],
#)
# Predict on the first 100 rows of the shuffled training data.
# The targets are passed shifted by +1, presumably so that the 'actual' curve
# does not sit on top of the prediction curve in the plot - confirm.
ytrain_pred = my_predict(
    mod2,
    {   'raw_features': xtrain_roll.values[:100],
        'raw_targets':  ytrain_roll.values[:100] + 1,
    },
    ytrain_roll.index[:100],
)
# ytrain_pred.shape

## plot trained result

In [None]:
# Re-run preprocessing on the first n_show training rows without class
# balancing; the shuffle is a separate cell and is not applied here.
# NOTE(review): the previous wording said "with overlap"; nothing in preprocess
# builds overlapping windows - looks like a leftover, confirm.
n_show = 100000
xtrain_ori, ytrain_ori = preprocess(x_train.head(n=n_show), y_train.head(n=n_show), False)
print(xtrain_ori.shape, ytrain_ori.shape)

In [None]:
xtrain_ori[['Fp1', 'Fp2']].plot(figsize=(20,3), alpha=0.5)
# plt.title('subj_id=1, series_id=1')
plt.show()

In [None]:
ytrain_ori['HandStart'].plot(figsize=(20,3), alpha=0.5)
# plt.title('subj_id=1, series_id=1')
plt.show()

In [None]:
# Predict and plot three consecutive chunks of 1000 rows of the unbalanced,
# unshuffled training data. Only the result of the last chunk stays in
# ytrain_pred. Targets are passed shifted by +1 (see the earlier predict cell).
for i in range(3):
    ytrain_pred = my_predict(
        mod2,
        {   'raw_features': xtrain_ori.values[i*1000:(i+1)*1000],
            'raw_targets':  ytrain_ori.values[i*1000:(i+1)*1000] + 1,
        },
        ytrain_ori.index[i*1000:(i+1)*1000],
    )
    # ytrain_pred.shape

## predict on test data

In [None]:
# Hold-out split: every loaded row after the first n_train rows.
# Slicing with iloc gives an empty frame when n_train covers all loaded rows.
# tail(n=n_test) with a negative n_test would instead silently return rows that
# are already part of the training split.
n_test = max(train_features.shape[0] - n_train, 0)
x_test = train_features.iloc[n_train:].copy()
y_test = train_targets.iloc[n_train:].copy()
print('x_test, y_test', x_test.shape, y_test.shape)
assert x_test.shape[0] > 0, "no rows left for testing ... n_train=%s covers all loaded rows"%n_train

# Same preprocessing as for training, but without class balancing.
xtest_roll, ytest_roll = preprocess(x_test, y_test, False)
xtest_roll.shape, ytest_roll.shape

In [None]:
# Predict and plot three consecutive chunks of 1000 rows of the held-out rows.
# Only the result of the last chunk stays in ytest_pred.
for i in range(3):
    ytest_pred = my_predict(
        mod2,
        {   'raw_features': xtest_roll.values[i*1000:(i+1)*1000],
            'raw_targets':  ytest_roll.values[i*1000:(i+1)*1000] + 1,
        },
        ytest_roll.index[i*1000:(i+1)*1000],
    )
    #ytest_pred.shape

## predict on new subject

In [None]:
# Load series 1..8 of subject 2 (the model was trained on subject 1 only),
# then show the resulting shapes.
subj2_features, subj2_targets = my_load(subj_ids=[2], series_ids=list(range(1, 9)))
subj2_features.shape, subj2_targets.shape

In [None]:
# Work on copies of the subject-2 frames.
x_subj2 = subj2_features.copy()
y_subj2 = subj2_targets.copy()
print('x_subj2, y_subj2', x_subj2.shape, y_subj2.shape)

# Same preprocessing as for training, but without class balancing.
xsubj2_roll, ysubj2_roll = preprocess(x_subj2, y_subj2, False)
assert xsubj2_roll.shape[0] > 0
xsubj2_roll.shape, ysubj2_roll.shape

In [None]:
# Predict and plot three consecutive chunks of n_step rows of the subject-2
# data. Only the result of the last chunk stays in ytest_pred.
n_step = 1000*2
for i in range(3):
    ytest_pred = my_predict(
        mod2,
        {   'raw_features': xsubj2_roll.values[i*n_step:(i+1)*n_step],
            'raw_targets':  ysubj2_roll.values[i*n_step:(i+1)*n_step] + 1,
        },
        ysubj2_roll.index[i*n_step:(i+1)*n_step],
    )
    #ytest_pred.shape