## Data loading

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn import *
import glob

In [2]:
p = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'

In [3]:
train = glob.glob(p+'train/**/**')
test = glob.glob(p+'test/**/**')

In [4]:
len(train),len(test)

(970, 2)

In [5]:
subjects = pd.read_csv(p+'subjects.csv')
tasks = pd.read_csv(p+'tasks.csv')
sub = pd.read_csv(p+'sample_submission.csv')

In [6]:
tdcsfog_metadata=pd.read_csv(p+'tdcsfog_metadata.csv')
defog_metadata=pd.read_csv(p+'defog_metadata.csv')

In [7]:
tdcsfog_metadata.head(2)

Unnamed: 0,Id,Subject,Visit,Test,Medication
0,003f117e14,13abfd,3,2,on
1,009ee11563,d81e3a,4,2,on


In [8]:
defog_metadata.head(2)

Unnamed: 0,Id,Subject,Visit,Medication
0,02ab235146,ab54e1,2,on
1,02ea782681,bf608b,2,on


In [9]:
tdcsfog_metadata['Module']='tdcsfog'
defog_metadata['Module']='defog'

In [10]:
metadata=pd.concat([tdcsfog_metadata,defog_metadata])

In [11]:
metadata.head()

Unnamed: 0,Id,Subject,Visit,Test,Medication,Module
0,003f117e14,13abfd,3,2.0,on,tdcsfog
1,009ee11563,d81e3a,4,2.0,on,tdcsfog
2,011322847a,203e85,2,2.0,on,tdcsfog
3,01d0fe7266,203e85,2,1.0,off,tdcsfog
4,024418ba39,cecfb8,19,3.0,on,tdcsfog


### Tasks

In [12]:
tasks.head(2)

Unnamed: 0,Id,Begin,End,Task
0,02ab235146,10.0,190.48,Rest1
1,02ab235146,211.24,271.56,Rest2


In [13]:
tasks['Duration'] = tasks['End'] - tasks['Begin']

In [14]:
tasks.head(2)

Unnamed: 0,Id,Begin,End,Task,Duration
0,02ab235146,10.0,190.48,Rest1,180.48
1,02ab235146,211.24,271.56,Rest2,60.32


In [15]:
len(tasks.Id.unique()),len(tasks.Task.unique())

(137, 31)

In [16]:
tasks = pd.pivot_table(tasks, values=['Duration'], index=['Id'], columns=['Task'], aggfunc='sum', fill_value=0) 

In [17]:
tasks.head(2)

Unnamed: 0_level_0,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration,Duration
Task,4MW,4MW-C,Hotspot1,Hotspot1-C,Hotspot2,Hotspot2-C,MB1,MB10,MB11,MB12,...,MB8,MB9,Rest1,Rest2,TUG-C,TUG-DT,TUG-ST,Turning-C,Turning-DT,Turning-ST
Id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
02ab235146,16.52,16.68,16.76,16.24,53.92,64.6,13.96,17.96,17.4,8.6,...,12.84,30.8,180.48,60.32,38.44,47.92,36.24,21.92,46.4,23.32
02ea782681,11.618,11.796,11.525,11.692,8.329,9.032,3.469,6.624,6.23,5.379,...,11.849,30.65,0.0,0.0,18.343,19.932,20.13,18.042,21.588,18.698


In [18]:
tasks.columns = [c[-1] for c in tasks.columns]  ## To delete 'Duration' column in 1st row from heading

In [19]:
tasks = tasks.reset_index()

In [20]:
tasks['t_kmeans'] = cluster.KMeans(n_clusters=10, random_state=293).fit_predict(tasks[tasks.columns[1:]])

In [21]:
tasks.head(2)

Unnamed: 0,Id,4MW,4MW-C,Hotspot1,Hotspot1-C,Hotspot2,Hotspot2-C,MB1,MB10,MB11,...,MB9,Rest1,Rest2,TUG-C,TUG-DT,TUG-ST,Turning-C,Turning-DT,Turning-ST,t_kmeans
0,02ab235146,16.52,16.68,16.76,16.24,53.92,64.6,13.96,17.96,17.4,...,30.8,180.48,60.32,38.44,47.92,36.24,21.92,46.4,23.32,1
1,02ea782681,11.618,11.796,11.525,11.692,8.329,9.032,3.469,6.624,6.23,...,30.65,0.0,0.0,18.343,19.932,20.13,18.042,21.588,18.698,4


### Subjects

In [22]:
subjects

Unnamed: 0,Subject,Visit,Age,Sex,YearsSinceDx,UPDRSIII_On,UPDRSIII_Off,NFOGQ
0,04fcdb,1.0,63,M,3.0,30.0,,0
1,05595e,1.0,56,M,8.0,28.0,,0
2,0967b2,1.0,59,M,10.0,38.0,48.0,19
3,0967b2,2.0,59,M,10.0,37.0,44.0,13
4,097078,,70,F,10.0,27.0,50.0,20
...,...,...,...,...,...,...,...,...
168,f90887,1.0,72,M,16.0,35.0,46.0,26
169,fc1e1b,1.0,82,F,11.0,38.0,42.0,21
170,fe5d84,2.0,72,M,14.0,32.0,45.0,17
171,fe5d84,1.0,72,F,14.0,13.0,33.0,15


In [23]:
subjects = subjects.fillna(0).groupby('Subject').median()

In [24]:
subjects.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, 04fcdb to ffa798
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Visit         136 non-null    float64
 1   Age           136 non-null    float64
 2   YearsSinceDx  136 non-null    float64
 3   UPDRSIII_On   136 non-null    float64
 4   UPDRSIII_Off  136 non-null    float64
 5   NFOGQ         136 non-null    float64
dtypes: float64(6)
memory usage: 7.4+ KB


In [25]:
subjects = subjects.reset_index()

In [26]:
subjects.rename(columns={'Subject':'Id'}, inplace=True)

In [27]:
subjects['s_kmeans'] = cluster.KMeans(n_clusters=10, random_state=293).fit_predict(subjects[subjects.columns[1:]])

In [28]:
def reader(f):
    try:
        df = pd.read_csv(f, usecols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation', 'Turn' , 'Walking'])
        df['Id'] = f.split('/')[-1].split('.')[0]
        #df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
        #df = pd.merge(df, subjects[['Id','s_kmeans']], how='left', on='Id').fillna(-1)
        df['AccV2'] = df['AccV'] - np.median(df['AccV'])
        df['AccML2'] = df['AccML'] - np.median(df['AccML'])
        df['AccAP2'] = df['AccAP'] - np.median(df['AccAP'])
        
        df['AccV2_s'] = df['AccV'] - df['AccV'].shift(1)
        df['AccML2_s'] = df['AccML'] - df['AccML'].shift(1)
        df['AccAP2_s'] = df['AccAP'] - df['AccAP'].shift(1)
    
        df['AccV2_sm'] = df['AccV'] - df['AccV'].shift(1).rolling(5).mean()
        df['AccML2_sm'] = df['AccML'] - df['AccML'].shift(1).rolling(5).mean()
        df['AccAP2_sm'] = df['AccAP'] - df['AccAP'].shift(1).rolling(5).mean()
        
        df = df.fillna(99)
        return df
    except: pass

In [29]:
train = pd.concat([reader(f) for f in train]).fillna(0)

In [30]:
train

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,Id,AccV2,AccML2,AccAP2,AccV2_s,AccML2_s,AccAP2_s,AccV2_sm,AccML2_sm,AccAP2_sm
0,0,-1.002697,0.022371,0.068304,0,0,0,be9d33541d,-0.018056,0.007508,-0.073567,99.000000,99.000000,99.000000,99.000000,99.000000,99.000000
1,1,-1.002641,0.019173,0.066162,0,0,0,be9d33541d,-0.018000,0.004310,-0.075708,0.000056,-0.003198,-0.002142,99.000000,99.000000,99.000000
2,2,-0.999820,0.019142,0.067536,0,0,0,be9d33541d,-0.015179,0.004279,-0.074335,0.002821,-0.000031,0.001374,99.000000,99.000000,99.000000
3,3,-0.998023,0.018378,0.068409,0,0,0,be9d33541d,-0.013382,0.003515,-0.073461,0.001797,-0.000764,0.000874,99.000000,99.000000,99.000000
4,4,-0.998359,0.016726,0.066448,0,0,0,be9d33541d,-0.013717,0.001863,-0.075423,-0.000336,-0.001652,-0.001961,99.000000,99.000000,99.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5153,5153,-9.915920,-0.105897,-1.123455,0,0,0,0506d9a39f,-0.562312,0.166628,-3.640620,0.207943,-0.005537,-0.006615,-0.386098,0.053607,-0.012213
5154,5154,-9.693752,-0.066892,-1.114903,0,0,0,0506d9a39f,-0.340144,0.205633,-3.632069,0.222167,0.039005,0.008551,-0.082525,0.073279,-0.006012
5155,5155,-9.548118,-0.098315,-1.112123,0,0,0,0506d9a39f,-0.194509,0.174209,-3.629288,0.145635,-0.031424,0.002781,0.127065,0.018766,0.003074
5156,5156,-9.469803,-0.111004,-1.130814,0,0,0,0506d9a39f,-0.116195,0.161520,-3.647979,0.078315,-0.012689,-0.018691,0.274570,-0.010809,-0.017962


In [31]:
cols = [c for c in train.columns if c not in ['Id', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task','Event']]

In [32]:
pcols = ['StartHesitation', 'Turn' , 'Walking']

In [33]:
scols = ['Id', 'StartHesitation', 'Turn' , 'Walking']

In [34]:
x1, x2, y1, y2 = model_selection.train_test_split(train[cols], train[pcols], test_size=.30, random_state=3, stratify=train[pcols])

In [35]:
del train

In [36]:
reg = ensemble.ExtraTreesRegressor(n_estimators=200, max_depth=7, n_jobs=-1, random_state=3)

In [37]:
reg.fit(x1[:5_000_000],y1[:5_000_000])

ExtraTreesRegressor(max_depth=7, n_estimators=200, n_jobs=-1, random_state=3)

In [38]:
print(metrics.average_precision_score(y2, reg.predict(x2).clip(0.0,1.0)))

0.3603862856783766


In [39]:
sub['t'] = 0
submission = []

In [40]:
for f in test:
    df = pd.read_csv(f)
    df['Id'] = f.split('/')[-1].split('.')[0]
    df = df.fillna(0).reset_index(drop=True)
    #df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
    #df = pd.merge(df, subjects[['Id','s_kmeans']], how='left', on='Id').fillna(-1)
    df['AccV2'] = df['AccV'] - np.median(df['AccV'])
    df['AccML2'] = df['AccML'] - np.median(df['AccML'])
    df['AccAP2'] = df['AccAP'] - np.median(df['AccAP'])
    df['AccV2_s'] = df['AccV'] - df['AccV'].shift(1)
    df['AccML2_s'] = df['AccML'] - df['AccML'].shift(1)
    df['AccAP2_s'] = df['AccAP'] - df['AccAP'].shift(1)
    df['AccV2_sm'] = df['AccV'] - df['AccV'].shift(1).rolling(5).mean()
    df['AccML2_sm'] = df['AccML'] - df['AccML'].shift(1).rolling(5).mean()
    df['AccAP2_sm'] = df['AccAP'] - df['AccAP'].shift(1).rolling(5).mean()
    df = df.fillna(99)
    res = pd.DataFrame(reg.predict(df[cols]).clip(0.0,1.0), columns=pcols)
    df = pd.concat([df,res], axis=1)
    df['Id'] = df['Id'].astype(str) + '_' + df['Time'].astype(str)
    submission.append(df[scols])

In [41]:
submission = pd.concat(submission)

In [42]:
sub

Unnamed: 0,Id,StartHesitation,Turn,Walking,t
0,003f117e14_0,0,0,0,0
1,003f117e14_1,0,0,0,0
2,003f117e14_2,0,0,0,0
3,003f117e14_3,0,0,0,0
4,003f117e14_4,0,0,0,0
...,...,...,...,...,...
286365,02ab235146_281683,0,0,0,0
286366,02ab235146_281684,0,0,0,0
286367,02ab235146_281685,0,0,0,0
286368,02ab235146_281686,0,0,0,0


In [43]:
submission = pd.merge(sub[['Id','t']], submission, how='left', on='Id').fillna(0.0)

In [44]:
submission 

Unnamed: 0,Id,t,StartHesitation,Turn,Walking
0,003f117e14_0,0,0.011363,0.043270,0.002876
1,003f117e14_1,0,0.011338,0.043239,0.002876
2,003f117e14_2,0,0.011338,0.043239,0.002876
3,003f117e14_3,0,0.011338,0.043239,0.002876
4,003f117e14_4,0,0.011338,0.043239,0.002876
...,...,...,...,...,...
286365,02ab235146_281683,0,0.000274,0.027678,0.021871
286366,02ab235146_281684,0,0.000274,0.027678,0.021871
286367,02ab235146_281685,0,0.000274,0.027678,0.021871
286368,02ab235146_281686,0,0.000274,0.027678,0.021871


In [45]:
subjects=subjects.rename(columns={'Visit':'s_Visit','Age':'s_Age','YearsSinceDx':'s_YearsSinceDx','UPDRSIII_On':'s_UPDRSIII_On','UPDRSIII_Off':'s_UPDRSIII_Off','NFOGQ':'s_NFOGQ'})

In [46]:
submission[scols].to_csv('submission.csv', index=False)