In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from collections import defaultdict
import random
from tqdm import tqdm
import pickle
import copy
from scipy import signal
import glob

CATEGORIES = ['StartHesitation','Turn','Walking'] # 3 classes
FEATNAMES = ['AccV','AccML','AccAP']
mlb = MultiLabelBinarizer()
lenc = LabelEncoder()
mlb.fit([CATEGORIES])
lenc.fit(CATEGORIES)

SOURCE_DIR = '/kaggle/input/'
SAVE_DIR = '/kaggle/working/'
DATASOURCE = 'realworld' # options: lab | realworld | lab_and_realworld
TASK = 'binary' # options: binary | multiclass
SAMPLES = 150 # lab = 192, realworld = 150
jump = 30 # lab = 38 (80% overlap), realworld = 30

#gpus = tf.config.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(gpus[0], True)

In [2]:
#%%
# populate dataframe only with data associated with labels (0, 1, 2) for the 3 classes
def get_multiclass_df(FILENAMES,metadata_df,DATASOURCE):
    df = pd.DataFrame()
    for filename in tqdm(FILENAMES):
        data = pd.read_csv(filename)
        if DATASOURCE == 'realworld': # only consider rows with unambiguous annotations
            data = data[(data['Valid']==True) & (data['Task']==True)]
            data.reset_index(inplace=True,drop=True)
        nevents = data[CATEGORIES].sum().sum()
        if nevents == 0: # skip if subject has no event whatsoever
            continue
        labels = mlb.inverse_transform(np.array(data[CATEGORIES])) # returns class tuple
        eventId = filename.split('/')[-1].split('.csv')[0]
        subjectId = metadata_df[metadata_df['Id']==eventId]['Subject'].item()
        subjectId = eventId
        
        for category in CATEGORIES:
            condition = data[category]==1
            category_df = data[FEATNAMES][condition]
            category_df['label'] = lenc.transform(pd.DataFrame(labels)[condition].iloc[:,0])
            category_df['subject'] = subjectId
            category_df['series'] = eventId
            category_df.reset_index(inplace=True,drop=True)
            df = pd.concat((df,category_df),0)

    df['label'] = df['label'].astype(int)
    return df

#%%
# obtain samples for the background class
#events_df = pd.read_csv('/home/danikiyasseh/datasets/tlvmc-parkinsons-freezing-gait-prediction/events.csv')
def get_background_df(FILENAMES,metadata_df,DATASOURCE):
    background_df = pd.DataFrame()
    for filename in tqdm(FILENAMES):
        data = pd.read_csv(filename)
        if DATASOURCE == 'realworld': # only consider rows with unambiguous annotations
            data = data[(data['Valid']==True) & (data['Task']==True)]
            data.reset_index(inplace=True,drop=True)
        labels = [-1]*data.shape[0]
        eventId = filename.split('/')[-1].split('.csv')[0]
        subjectId = metadata_df[metadata_df['Id']==eventId]['Subject'].item()
        subjectId = eventId
    
        condition = ~data[CATEGORIES].any(axis=1)
        category_df = data[FEATNAMES][condition]
        category_df['label'] = pd.DataFrame(labels)[condition]
        category_df['subject'] = subjectId
        category_df['series'] = eventId
        category_df.reset_index(inplace=True,drop=True)
        background_df = pd.concat((background_df,category_df),0)
    
    background_df['label'] = background_df['label'].astype(int)
    return background_df

#%%
# prepare samples in dictionary format
def get_data_dict(df,unitConversion=1):
    data_dict = dict()
    for subject in tqdm(df['subject'].unique()):
        data_dict[subject] = defaultdict(list)
        subject_df = df[df['subject']==subject]
        for series in subject_df['series'].unique():
            series_df = subject_df[subject_df['series']==series]
            for category in series_df['label'].unique():
                category_df = series_df[series_df['label']==category]
                if category_df.shape[0] >= SAMPLES: # at least this many samples for this subject from this category
            
                    start = 0
                    end = start + SAMPLES
                    while end <= category_df.shape[0]:
                        chunk_category_df = category_df[start:end]
                        chunk_category_arr = np.array(chunk_category_df[FEATNAMES]) * unitConversion # SAMPLES x NFEATS
                        data_dict[subject][category].append(chunk_category_arr)
                        start = start + jump
                        end = start + SAMPLES
                
        for category in subject_df['label'].unique():
            if len(data_dict[subject][category]) == 0:
                data_dict[subject].pop(category)
            else:
                data_dict[subject][category] = np.stack(data_dict[subject][category]) # NCHUNKS x SAMPLES x NFEATS
    
    subjects_to_keep = [] # remove empty entries
    for subject,data in data_dict.items():
        if data != dict():
            subjects_to_keep.extend([subject])
    new_data_dict = {subject:data_dict[subject] for subject in subjects_to_keep}
    return new_data_dict

# get number of samples from each category
def get_sample_counts(data_dict):
    summary_df = pd.DataFrame(columns=['label','subject'])
    for subject in data_dict.keys():
        categories = data_dict[subject].keys()
        for category in categories:
            data = data_dict[subject][category]
            nsamples = data.shape[0]
            curr_df = pd.DataFrame([category]*nsamples,columns=['label'])
            curr_df['subject'] = subject
            summary_df = pd.concat((summary_df,curr_df),0)
    return summary_df

# calculate final number of samples to obtain a uniform distribution across the classes
def get_subsampled_sample_counts(summary_df,labels=[0,1,2]):
    new_summary_df = pd.DataFrame()
    min_nsamples = summary_df['label'].value_counts().min()
    for category in labels:
        category_df = summary_df[summary_df['label']==category]
        subsampled_category_df = category_df.sample(min_nsamples,random_state=0)
        new_summary_df = pd.concat((new_summary_df,subsampled_category_df),0)
    counts_df = new_summary_df.groupby(by=['subject'])['label'].value_counts()
    counts_df.name = 'count'
    counts_df = counts_df.reset_index()
    return new_summary_df, counts_df

def subsample_data_dict(data_dict,counts_df):
    #get_class_counts(data_dict)
    # subsample data according to above calculated sample numbers
    new_data_dict = dict()
    for subject in data_dict.keys():
        new_data_dict[subject] = dict()
        for category in data_dict[subject].keys():
            combined_bool = (counts_df['subject']==subject) & (counts_df['label']==category)
            if combined_bool.sum() == 0:
                continue
            count = counts_df[combined_bool]['count'].item()
            total_count = data_dict[subject][category].shape[0]
            random_indices = random.sample(list(range(total_count)),count)
            subsampled_data = data_dict[subject][category][random_indices] 
            new_data_dict[subject][category] = subsampled_data
    #get_class_counts(new_data_dict)
    # remove any empty entries
    subjects_to_keep = []
    for subject,data in new_data_dict.items():
        if data != dict():
            subjects_to_keep.extend([subject])
    new_data_dict = {subject:new_data_dict[subject] for subject in subjects_to_keep}
    return new_data_dict

In [3]:
# #%%

# if DATASOURCE == 'lab':
#     DATA_DIR = os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog')
#     tdcsfog_metadata_df = pd.read_csv(os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv'))
#     FILENAMES = [os.path.join(DATA_DIR,file) for file in os.listdir(DATA_DIR) if '.csv' in file]

#     df = get_multiclass_df(FILENAMES,tdcsfog_metadata_df,DATASOURCE)
    
#     multiclass_data_dict = get_data_dict(df)
#     multiclass_summary_df = get_sample_counts(multiclass_data_dict)
    
# elif DATASOURCE == 'realworld':
#     DATA_DIR = os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/train/defog')
#     defog_metadata_df = pd.read_csv(os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv'))
#     FILENAMES = [os.path.join(DATA_DIR,file) for file in os.listdir(DATA_DIR) if '.csv' in file]

#     df = get_multiclass_df(FILENAMES,defog_metadata_df,DATASOURCE)
    
#     multiclass_data_dict = get_data_dict(df,unitConversion=9.81)
#     multiclass_summary_df = get_sample_counts(multiclass_data_dict)

# elif DATASOURCE == 'lab_and_realworld':
#     # LAB data
#     DATA_DIR = os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog')
#     tdcsfog_metadata_df = pd.read_csv(os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv'))
#     FILENAMES = [os.path.join(DATA_DIR,file) for file in os.listdir(DATA_DIR) if '.csv' in file]

#     df = get_multiclass_df(FILENAMES,tdcsfog_metadata_df,'lab')
    
#     tdcsfog_multiclass_data_dict = get_data_dict(df)
#     tdcsfog_multiclass_summary_df = get_sample_counts(tdcsfog_multiclass_data_dict)
    
#     # REALWORLD data
#     DATA_DIR = os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/train/defog')
#     defog_metadata_df = pd.read_csv(os.path.join(SOURCE_DIR,'tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv'))
#     FILENAMES = [os.path.join(DATA_DIR,file) for file in os.listdir(DATA_DIR) if '.csv' in file]

#     df = get_multiclass_df(FILENAMES,defog_metadata_df,'realworld')
    
#     defog_multiclass_data_dict = get_data_dict(df,unitConversion=9.81)
#     defog_multiclass_summary_df = get_sample_counts(defog_multiclass_data_dict)
    
#     multiclass_data_dict = {**tdcsfog_multiclass_data_dict,**defog_multiclass_data_dict}
#     multiclass_summary_df = pd.concat((tdcsfog_multiclass_summary_df,defog_multiclass_summary_df),0)

In [4]:
# #%%
# """ Subsample the classes """
# subsampled_multiclass_summary_df, subsampled_multiclass_counts_df = get_subsampled_sample_counts(multiclass_summary_df)
# subsampled_multiclass_data_dict = subsample_data_dict(multiclass_data_dict, subsampled_multiclass_counts_df)

In [5]:
# #%%
# if TASK == 'binary': # convert problem to binary classification
#     """ Get background data (from lab only for now) """
#     if DATASOURCE == 'lab':
#         metadata_df = tdcsfog_metadata_df
#     elif DATASOURCE == 'realworld':
#         metadata_df = defog_metadata_df
#     background_df = get_background_df(FILENAMES,metadata_df,DATASOURCE)
#     background_data_dict = get_data_dict(background_df)
    
#     # prepare data dict for binary classification (event vs. no event)
#     background_summary_df = get_sample_counts(background_data_dict)
#     multiclass_summary_df['label'] = multiclass_summary_df['label'].replace({0:1,1:1,2:1})
#     background_summary_df['label'] = 0
#     binary_summary_df = pd.concat((background_summary_df,multiclass_summary_df),0)
#     subsampled_binary_summary_df, subsampled_binary_counts_df = get_subsampled_sample_counts(binary_summary_df,labels=[0,1])
    
#     # add the background data to a combined data dict
#     binary_data_dict = copy.deepcopy(background_data_dict) # more complete list of event series
#     for subject in binary_data_dict.keys():
#         if subject in multiclass_data_dict:
#             binary_data_dict[subject][1] = np.vstack([multiclass_data_dict[subject][cat] for cat in multiclass_data_dict[subject].keys()]) # background originally labelled as -1 (to avoid overlapping with other classes)
#         binary_data_dict[subject][0] = binary_data_dict[subject][-1]
    
#     new_binary_data_dict = dict()
#     for subject in binary_data_dict.keys():
#         new_binary_data_dict[subject] = dict()
#         categories = binary_data_dict[subject].keys()
#         for category in categories:
#             if category in [0,1]:
#                 new_binary_data_dict[subject][category] = binary_data_dict[subject][category]
    
#     # need to get combine_dict (combine multiclass and background dict)
#     subsampled_binary_data_dict = subsample_data_dict(new_binary_data_dict, subsampled_binary_counts_df)

In [6]:
# #%%
# if TASK == 'multiclass':
#     with open('balanced_multiclass_data_dict','wb') as f:
#         pickle.dump(subsampled_multiclass_data_dict,f)
# elif TASK == 'binary':
#     with open('balanced_binary_data_dict','wb') as f:
#         pickle.dump(subsampled_binary_data_dict,f)

In [7]:
#%%
""" inspect number of samples from each class """
def get_class_counts(multiclass_data_dict):
    counts = {i:0 for i in [-1,0,1,2]}
    for key in multiclass_data_dict.keys():
        for cat in multiclass_data_dict[key].keys():
            counts[cat] += multiclass_data_dict[key][cat].shape[0]
    print(counts)

#%%
def data_generator(subjects,data_dict,SAMPLES,DATASOURCE):
    #random.shuffle(subjects)
    for subject in subjects:
        #subject = subject.decode("utf-8") # tf encodes input string to utf-8 (therefore you must decode it)
        #assert isinstance(data_dict,dict)
        if subject in data_dict:
            categories_dict = data_dict[subject] 
            for category in categories_dict.keys():
                data = categories_dict[category]
                if isinstance(data,np.ndarray):
                    nchunks = data.shape[0]
                    for i in range(nchunks):
                        input_data = categories_dict[category][i] # 256 x 3
                        #channel_mean = np.mean(input_data,axis=0)
                        #channel_std = np.std(input_data,axis=0)
                        #input_data = (input_data - channel_mean)/channel_std
                        b,a = signal.butter(2, 15, 'low', fs=128 if DATASOURCE == 'lab' else 100)
                        input_data = signal.lfilter(b,a,input_data,axis=0)
                        output_data = [category]*SAMPLES 
                        yield tf.constant(input_data), tf.constant(output_data) 

In [8]:
def load_data_dict(DATASOURCE,TASK):
    if TASK == 'multiclass':
        if DATASOURCE == 'lab':
            folder = 'tdcsfog-multiclass'
        with open('/kaggle/input/%s/balanced_multiclass_data_dict' % folder,'rb') as f:
            data_dict = pickle.load(f)
    elif TASK == 'binary':
        if DATASOURCE == 'lab':
            folder = 'tcdsfog-binary'
        elif DATASOURCE == 'realworld':
            folder = 'defog-binary'
        with open('/kaggle/input/%s/balanced_binary_data_dict' % folder,'rb') as f:
            data_dict = pickle.load(f)
    return data_dict

In [9]:
config = {
    'realworld': {
        'binary': {
            'epochs': 1, # 5
            'batch_size': 128,
            'learning_rate': 1e-3,
            'folds': 1,
            'splits':[0.6,0.2,0.2],
            'model': tf.keras.models.Sequential([ 
                tf.keras.layers.InputLayer(input_shape=(150, 3)),
                tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)), # returns output at each time-step (i.e., many to many setup)
                tf.keras.layers.Dense(units=1)
            ])
        }
    },
    'lab': {
        'binary': {
            'epochs': 1, # 50 = best
            'batch_size': 128,
            'learning_rate': 1e-3,
            'folds': 1,
            'splits':[0.6,0.2,0.2],
            'model': tf.keras.models.Sequential([ 
                            tf.keras.layers.InputLayer(input_shape=(192, 3)),
                            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)), # returns output at each time-step (i.e., many to many setup)
                            tf.keras.layers.Dense(units=1)
                        ])
        },
        'multiclass': {
            'epochs': 1, # 25
            'batch_size': 16,
            'learning_rate': 1e-4,
            'folds': 1,
            'splits':[0.8,0.1,0.1],
            'model': tf.keras.models.Sequential([ 
                            tf.keras.layers.InputLayer(input_shape=(192, 3)),
                            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)), # returns output at each time-step (i.e., many to many setup)
                            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)), # returns output at each time-step (i.e., many to many setup)
                            tf.keras.layers.Dense(units=3)
                        ])
        }
    }
}

In [10]:
class multiclassAUPRC(tf.keras.metrics.AUC):

    def __init__(self,**kwargs): # you need to have the kwargs here to be able to load it in later
        super(multiclassAUPRC,self).__init__(from_logits=True,curve='PR')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.one_hot(y_true,depth=3)
        super().update_state(y_true, y_pred)

In [11]:
#preds = model.predict(a).squeeze()
#labels = list(val_data.take(1))[0][1].numpy()

In [12]:
# val_data = val_data.shuffle(100,seed=0)
# for a,b in val_data.as_numpy_iterator():
#     if 0 in b and b.sum() > 32*150:
#         break

In [13]:
# preds2 = preds.reshape(-1,1)[:,0]
# labels2 = labels.reshape(-1,1)[:,0]
# df = pd.DataFrame([preds2,labels2]).T #,columns=['Pred','Label'])
# df.columns = ['Pred','Label']
# sns.histplot(x='Pred',hue='Label',data = df)

In [14]:
#%%
for DATASOURCE in ['lab']:#,'realworld']:
    if DATASOURCE == 'lab':
        TASKS = ['multiclass'] #['binary','multiclass']
    elif DATASOURCE == 'realworld':
        TASKS = ['binary']
    for TASK in TASKS:
        if TASK == 'multiclass':
            labels = [0,1,2]
        elif TASK == 'binary':
            labels = [0,1]
        data_dict = load_data_dict(DATASOURCE,TASK)
        for fold in range(config[DATASOURCE][TASK]['folds']):  
            savefolder = 'defog' if DATASOURCE == 'realworld' else 'tdcsfog'
            savetask = TASK
            savepath = os.path.join('/kaggle/working',savefolder,savetask)
            SAMPLES = 150 if 'realworld' in DATASOURCE else 192

            random.seed(fold)
            subjects = list(data_dict.keys())
            random.shuffle(subjects)
            nsubjects = len(subjects)
            train_frac, val_frac, test_frac = config[DATASOURCE][TASK]['splits']
            train_nsubjects, val_nsubjects = int(train_frac*nsubjects), int(val_frac*nsubjects)
            train_subjects, val_subjects, test_subjects = subjects[:train_nsubjects], subjects[train_nsubjects:train_nsubjects+val_nsubjects], subjects[train_nsubjects+val_nsubjects:] 

            train_data_dict = {subject:data_dict[subject] for subject in train_subjects}
            summary_df = get_sample_counts(train_data_dict)
            subsampled_summary_df, subsampled_counts_df = get_subsampled_sample_counts(summary_df,labels=labels)
            train_data_dict = subsample_data_dict(train_data_dict, subsampled_counts_df)

            val_data_dict = {subject:data_dict[subject] for subject in val_subjects}
            summary_df = get_sample_counts(val_data_dict)
            subsampled_summary_df, subsampled_counts_df = get_subsampled_sample_counts(summary_df,labels=labels)
            val_data_dict = subsample_data_dict(val_data_dict, subsampled_counts_df)

            test_data_dict = {subject:data_dict[subject] for subject in test_subjects}
            summary_df = get_sample_counts(test_data_dict)
            subsampled_summary_df, subsampled_counts_df = get_subsampled_sample_counts(summary_df,labels=labels)
            test_data_dict = subsample_data_dict(test_data_dict, subsampled_counts_df)

            train_data = tf.data.Dataset.from_generator(lambda: data_generator(train_subjects,train_data_dict,SAMPLES,DATASOURCE), # args=[x,y,z]
                                                        output_signature=(
                                                           tf.TensorSpec(shape=(SAMPLES,3), dtype=tf.float64),
                                                           tf.TensorSpec(shape=(SAMPLES), dtype=tf.int32))
                                                       ) # shape is at the individual tensor level (not batch)

            val_data = tf.data.Dataset.from_generator(lambda: data_generator(val_subjects,val_data_dict,SAMPLES,DATASOURCE),
                                                        output_signature=(
                                                           tf.TensorSpec(shape=(SAMPLES,3), dtype=tf.float64),
                                                           tf.TensorSpec(shape=(SAMPLES), dtype=tf.int32))
                                                       ) # shape is at the individual tensor level (not batch)

            test_data = tf.data.Dataset.from_generator(lambda: data_generator(test_subjects,test_data_dict,SAMPLES,DATASOURCE),
                                                        output_signature=(
                                                           tf.TensorSpec(shape=(SAMPLES,3), dtype=tf.float64),
                                                           tf.TensorSpec(shape=(SAMPLES), dtype=tf.int32))
                                                       )

            print('Training Class Distribution...')
            get_class_counts(train_data_dict)
            print('Validation Class Distribution...')
            get_class_counts(val_data_dict)

            train_data = train_data.batch(batch_size=config[DATASOURCE][TASK]['batch_size']) # 16
            train_data = train_data.shuffle(100,seed=fold)

            val_data = val_data.batch(batch_size=128) # 8
            test_data = test_data.batch(batch_size=128)

            lstm_model = config[DATASOURCE][TASK]['model'] 

            if TASK == 'multiclass':
                lstm_model.compile(
                              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                              metrics=['accuracy',multiclassAUPRC()])
            elif TASK == 'binary':
                lstm_model.compile(
                              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                              metrics=['accuracy',tf.keras.metrics.AUC()])

            lstm_model.fit(
                x = train_data,
                validation_data = val_data,
                epochs = config[DATASOURCE][TASK]['epochs'],
                callbacks=[
                    tf.keras.callbacks.EarlyStopping(monitor='val_loss',min_delta=0.001,patience=5),
                    tf.keras.callbacks.TensorBoard('./logs', update_freq=1),
                    tf.keras.callbacks.ModelCheckpoint(
                                        filepath=os.path.join(savepath,'checkpoint_fold%i' % fold),
                                        save_weights_only=True,
                                        monitor='val_accuracy',
                                        mode='max',
                                        save_best_only=True)
                                                ]
                )



Training Class Distribution...
{-1: 0, 0: 3419, 1: 3419, 2: 3419}
Validation Class Distribution...
{-1: 0, 0: 353, 1: 353, 2: 353}


In [15]:
#lstm_model.load_weights('./tmp/checkpoint_fold0')
#lstm_model.evaluate(test_data)
#%%
#lstm_model.save(os.path.join(SAVE_DIR,'lstm_binary_parkinsons'))
#%%
#lstm_model = tf.keras.models.load_model(os.path.join(SAVE_DIR,'lstm_parkinsons'),custom_objects={"multiclassAUPRC":multiclassAUPRC})

In [16]:
# #%%
import glob
labpaths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/*.csv")
realpaths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/*.csv")
test_paths = labpaths + realpaths
from collections import OrderedDict 

all_preds_df = pd.DataFrame()
dict_preds_df = OrderedDict()
for f in test_paths:
    
    df = pd.read_csv(f)
    df.set_index('Time', drop=True, inplace=True)
    df['Id'] = f.split('/')[-1].split('.')[0]
    df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
    
    for task in ['multiclass']: #binary',
        curr_preds_df = pd.DataFrame()
        if 'tdcsfog' in f:
            #model = config['lab'][task]['model']
            #model.load_weights(os.path.join('/kaggle/working','tdcsfog',task,'checkpoint_fold0'))
            model = copy.deepcopy(lstm_model)
            SAMPLES, jump = 192, 38
            unitConversion = 1
        elif 'defog' in f:
            unitConversion = 9.81
            if task == 'binary':
                #model = config['realworld'][task]['model']
                #model.load_weights(os.path.join('/kaggle/working','defog',task,'checkpoint_fold0'))
                model = copy.deepcopy(lstm_model)
                SAMPLES, jump = 150, 30
            elif task == 'multiclass': #use the tdcsfog (lab) model for this case (might not be ideal)
                #model = config['lab'][task]['model']
                #model.load_weights(os.path.join('/kaggle/working','tdcsfog',task,'checkpoint_fold0'))
                model = copy.deepcopy(lstm_model)
                SAMPLES, jump = 192, 38

        if task == 'binary':
            CATEGORIES = ['Event']
        elif task == 'multiclass':
            CATEGORIES = ['StartHesitation','Turn','Walking'] # 3 classes
    
        start = 0
        end = start + SAMPLES
        chunks_list = []
        while end <= df.shape[0]:
            chunk_df = df.iloc[start:end,:]
            chunk_arr = np.array(chunk_df[FEATNAMES]) # SAMPLES x NFEATS
            b,a = signal.butter(2, 15, 'low', fs=128 if 'tdcsfog' in f else 100)
            chunk_arr = signal.lfilter(b,a,chunk_arr,axis=0)
            chunk_arr = chunk_arr * unitConversion
            chunk_arr = np.expand_dims(chunk_arr,0) # 1 x SAMPLES x NFEATS
            chunks_list.append(chunk_arr)
            start = start + SAMPLES 
            end = start + SAMPLES
        
        all_chunks = np.vstack(chunks_list)
        batch_size = 128
        nbatches = all_chunks.shape[0] // batch_size
        remainder = all_chunks.shape[0] % batch_size
        if remainder > 0:
            nbatches += 1
        count = 0 
        for i in range(nbatches):
            start = i*batch_size
            end = start + batch_size
            chunk_arr = all_chunks[start:end,:,:] # B x SAMPLES x NFEATS
            preds = model.predict(chunk_arr) # B x SAMPLES x NCLASSES
            preds_df = pd.DataFrame(preds.reshape(-1,len(CATEGORIES)),columns=CATEGORIES)
            curr_preds_df = pd.concat((curr_preds_df,preds_df),0)
            count += preds.shape[0]*SAMPLES

        # make sure to cover the final (smaller batch)
        final_nsamples = df.shape[0] - count
        chunk_df = df.iloc[-SAMPLES:,:]
        chunk_arr = np.array(chunk_df[FEATNAMES]) # SAMPLES x NFEATS
        b,a = signal.butter(2, 15, 'low', fs=128 if 'tdcsfog' in f else 100)
        chunk_arr = signal.lfilter(b,a,chunk_arr,axis=0)
        chunk_arr = chunk_arr * unitConversion
        chunk_arr = np.expand_dims(chunk_arr,0) # 1 x SAMPLES x NFEATS
        preds = model.predict(chunk_arr)
        preds_df = pd.DataFrame(preds.reshape(-1,len(CATEGORIES)),columns=CATEGORIES)
        preds_df = preds_df[-final_nsamples:]
        curr_preds_df = pd.concat((curr_preds_df,preds_df),0)
        curr_preds_df.index = df.index
        curr_preds_df['Id'] = df['Id']
        
        setting = 'defog' if 'defog' in f else 'tdcsfog'
        filename = f.split('/')[-1].split('.')[0]
        key = filename + '-' + setting + '-' + task
        dict_preds_df[key] = curr_preds_df

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......dense
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
........



Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......dense
.........vars
............0
............1
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
........

In [17]:
# # post processing the binary and multiclass predictions
all_preds_df = pd.DataFrame()
for key in dict_preds_df.keys():
    filename,setting,task = key.split('-')
    if task == 'multiclass':
        #binary_preds_df = dict_preds_df[key.replace('multiclass','binary')]
        multi_preds_df = dict_preds_df[key]
        #multi_preds_df['event_prediction'] = binary_preds_df['Event']>0 # bool indicating presence of FOG event
        #multi_preds_df.loc[multi_preds_df['event_prediction']==False,['StartHesitation','Turn','Walking']] = 0
        multi_preds_df = multi_preds_df[['Id','StartHesitation','Turn','Walking']]
        all_preds_df = pd.concat((all_preds_df,multi_preds_df),0)

  # This is added back by InteractiveShellApp.init_path()


In [18]:
# labpaths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog/*.csv")
# realpaths = glob.glob("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/*.csv")
# test_paths = labpaths + realpaths
# SAVE_DIR = '/kaggle/working/'
# submission = []
# for f in test_paths:
#     df = pd.read_csv(f)
#     df.set_index('Time', drop=True, inplace=True)
#     df['Id'] = f.split('/')[-1].split('.')[0]
#     df['Id'] = df['Id'].astype(str) + '_' + df.index.astype(str)
#     submission.append(df)
# submission = pd.concat(submission)
# preds_df = pd.DataFrame(submission['Id'])
# preds_df[['StartHesitation','Turn','Walking']] = 0
#preds_df.to_csv('submission.csv', index=False)

In [19]:
p = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'
sub = pd.read_csv(p+'sample_submission.csv')
sub['t'] = 0
submission = pd.merge(sub[['Id','t']], all_preds_df, how='left', on='Id').fillna(0.0)
submission[['Id','StartHesitation', 'Turn' , 'Walking']].to_csv('submission.csv', index=False)