parse raw files and produce a one keystroke_data.csv file with columns:

['userkey', 'parkinsons', 'timestep', 'hold_time', 'latency_time',
    'flight_time', 'tremors', 'levadopa', 'da', 'maob', 'other']

to run, data and users zip files must be in the same directory (they can be downloaded from https://physionet.org/content/tappy/1.0.0/)

In [1]:
from zipfile import ZipFile
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from tqdm import tqdm
import ast

tqdm.pandas()

In [2]:
def load_keystroke_data():
    columns = ["userkey", "date", "timestamp", "hand", "hold_time", "direction", "latency_time", "flight_time"]
    data_df = pd.DataFrame(columns=columns)

    # Load keystroke data
    with ZipFile('Archived-Data.zip') as data:
        print('Loading Keystroke Data...')
        data_files_list = list(filter(lambda file_info: file_info.filename.endswith('txt'),data.infolist()))
        data_entries = []
        for data_file in data_files_list:
            with data.open(data_file.filename) as datafile:
                df = pd.read_csv(datafile, sep='\t', header=None, dtype=str, names=columns, index_col=False, na_values=['NA'], low_memory=False, on_bad_lines='skip')
                data_entries.append(df)
        data_df = pd.concat(data_entries, ignore_index=True)

    # Drop missing rows
    data_df = data_df.dropna()

    def is_valid_row(row):
        if not (
            len(str(row["userkey"])) == 10
            and len(str(row["date"])) == 6
            and (len(str(row["hand"])) == 1)
            and (len(str(row["direction"])) == 2)
            and (not str(row['hold_time']).isdigit())
            and (len(str(row['hold_time'])) == 6)
            and (not str(row['latency_time']).isdigit())
            and (len(str(row['latency_time'])) == 6)
            and (not str(row['flight_time']).isdigit())
            and (len(str(row['flight_time'])) == 6)
        ):
            return False

        return True

    print('Cleaning Keystroke Data...')
    # Drop corrupted rows
    valid_rows = data_df.progress_apply(is_valid_row, axis=1)

    # Keep only the valid rows
    data_df = data_df[valid_rows]

    def parse_data_df(df):
        print('Parsing Keystroke Data...')
        num_cols = ['hold_time', 'latency_time', 'flight_time']
        df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

        df.date = pd.to_datetime(df.date, format='%y%m%d', errors='coerce')
        df.timestamp = pd.to_datetime(df.timestamp, format='%H:%M:%S.%f', errors='coerce').dt.time

        cat_cols = ['userkey', 'hand', 'direction']
        df[cat_cols] = df[cat_cols].astype('category')

        return df

    data_df = parse_data_df(data_df)

    return data_df

In [3]:
data_df = load_keystroke_data()

Loading Keystroke Data...
Cleaning Keystroke Data...


100%|██████████| 9316855/9316855 [01:40<00:00, 93026.84it/s]


Parsing Keystroke Data...


In [4]:
data_df.head()

Unnamed: 0,userkey,date,timestamp,hand,hold_time,direction,latency_time,flight_time
0,0EA27ICBLF,2016-07-22,18:41:04.336000,L,101.6,LL,234.4,156.3
1,0EA27ICBLF,2016-07-22,18:42:14.070000,L,85.9,LL,437.5,359.4
2,0EA27ICBLF,2016-07-22,18:42:14.273000,L,78.1,LL,210.9,125.0
3,0EA27ICBLF,2016-07-22,18:42:14.617000,L,62.5,LL,359.4,281.3
4,0EA27ICBLF,2016-07-22,18:42:15.586000,S,125.0,LS,187.5,93.8


In [5]:
def load_users_data():
    users_fp = 'Archived-users.zip'
    with ZipFile(users_fp) as users:
        print('Loading Users Data...')
        users_files_list = list(filter(lambda file_info: file_info.filename.endswith('txt'),users.infolist()))
        user_entries = []
        for user_file in users_files_list:
            with users.open(user_file) as uf:
                user_data = uf.read().decode("utf-8").strip().split("\n")
                user_entry = {'UserKey': uf.name.split('/')[1].split('_')[1].split('.')[0]}
                user_entry.update({line[0]: line[1].strip() for line in [line.split(":") for line in user_data]})
                user_entries.append(user_entry)
        users_df = pd.DataFrame(user_entries)
        users_df.columns = users_df.columns.str.lower()

    def parse_users_df(df):
        print('Parsing Users Data...')
        num_cols = ['birthyear', 'diagnosisyear', 'updrs']
        df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

        bool_cols = ['parkinsons', 'tremors', 'levadopa', 'da', 'maob', 'other']
        df[bool_cols] = df[bool_cols].applymap(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

        df.impact = df.impact.replace('------', None) 
        cat_cols = ['userkey', 'gender', 'impact', 'sided']
        df[cat_cols] = df[cat_cols].astype('category')

        return df
    
    users_df = parse_users_df(users_df)

    return users_df

In [6]:
users_df = load_users_data()

Loading Users Data...
Parsing Users Data...


In [7]:
users_df.head()

Unnamed: 0,userkey,birthyear,gender,parkinsons,tremors,diagnosisyear,sided,updrs,impact,levadopa,da,maob,other
0,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,,Severe,True,True,False,False
1,0QAZFRHQHW,1959.0,Female,False,False,,,,,False,False,False,False
2,0WTDIGPSBZ,1946.0,Female,False,False,,,,,False,False,False,False
3,1HOEBIGASW,1944.0,Male,False,False,,,,,False,False,False,False
4,1WMVCCU4RH,1953.0,Male,True,True,2017.0,Left,,Medium,False,False,False,False


merging keystrokes with users data and encoding the userkey and gender columns

In [8]:
df = data_df.merge(users_df, on='userkey')
df.columns = [col.lower().replace(' ', '_') for col in df.columns]
df['timestep'] = df.groupby('userkey').timestamp.cumcount()

le = LabelEncoder()
df['userkey'] = le.fit_transform(df['userkey'])
ge = LabelEncoder()
df['gender'] = ge.fit_transform(df['gender'])

creating a feature "on_meds" if any of parkinsons medication is taken by a participant

In [9]:
df['on_meds'] = df[['levadopa', 'da', 'maob', 'other']].any(axis=1)

removing outliers and minmax scaling

In [13]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

main_columns = ['hold_time', 'latency_time', 'flight_time']
def remove_outliers(group):
    for col in main_columns:
        q1 = group[col].quantile(0.25)
        q3 = group[col].quantile(0.75)
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        lower_bound = q1 - 1.5 * iqr
        group[col] = np.where(group[col] > upper_bound, np.nan, group[col])
        group[col] = np.where(group[col] < lower_bound, np.nan, group[col])

    return group

df = df.groupby('userkey', group_keys=True).progress_apply(remove_outliers).reset_index(drop=True)

scaler = MinMaxScaler()
df.loc[:, main_columns] = scaler.fit_transform(df[main_columns])

100%|██████████| 217/217 [00:03<00:00, 68.29it/s] 


saving dataframe as csv and keeping relevant columns only

In [54]:
kept_columns = ['userkey', 'parkinsons', 'timestep', 'hand','direction', 'impact', 'hold_time', 'latency_time', 'flight_time', 'tremors', 'on_meds']
df[kept_columns].to_csv('keystroke_data.csv', index=False)

In [53]:
df[kept_columns].head()

Unnamed: 0,userkey,parkinsons,timestep,hand,direction,impact,hold_time,latency_time,flight_time,tremors,on_meds
0,0,True,0,L,LL,Severe,0.20642,0.292152,0.19466,True,True
1,0,True,1,L,LL,Severe,0.174523,0.546376,0.449235,True,True
2,0,True,2,L,LL,Severe,0.158675,0.262736,0.155427,True,True
3,0,True,3,L,LL,Severe,0.126981,0.448617,0.351341,True,True
4,0,True,4,S,LS,Severe,0.253962,0.233446,0.11632,True,True


In [47]:
users_df.groupby('parkinsons').userkey.nunique()

parkinsons
False     58
True     169
Name: userkey, dtype: int64