In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
rows = []
colNames = ['BirthYear','Gender','Parkinsons','Tremors','DiagnosisYear','Sided','UPDRS','Impact','Levadopa','DA','MAOB','Other']
users = []

userFolder = 'data/Users'

for userFilename in os.listdir(userFolder):
    users.append(userFilename[5:-4])

    f = open(os.path.join(userFolder, userFilename))
    lines = f.readlines()

    row = []
    col = []

    for line in lines:
        line = line[:-1]
        lineSplit = line.split(': ')
        
        row.append(lineSplit[1])
        col.append(lineSplit[0])
            
    if col == colNames:
        rows.append(row)
    else:
        print(userFilename+'read wrong.')

    f.close()

userDF = pd.DataFrame(rows, columns=colNames, index=users)
userDF.head()

Unnamed: 0,BirthYear,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,UPDRS,Impact,Levadopa,DA,MAOB,Other
0EA27ICBLF,1952,Female,True,True,2000,Left,Don't know,Severe,True,True,False,False
0QAZFRHQHW,1959,Female,False,False,------,,Don't know,------,False,False,False,False
0WTDIGPSBZ,1946,Female,False,False,------,,Don't know,------,False,False,False,False
1HOEBIGASW,1944,Male,False,False,------,,Don't know,------,False,False,False,False
1WMVCCU4RH,1953,Male,True,True,2017,Left,Don't know,Medium,False,False,False,False


In [3]:
rows = []
colNames = ['UserID','Date','Timestamp','Hand','HoldTime','Direction','LatencyTime','FlightTime']

dataFolder = 'data/TappyData'
invalid = [0,0,0,0,0,0,0,0,0]
for dataFilename in os.listdir(dataFolder):
    infoArr = dataFilename[:-4].split('_')
    userID = infoArr[0]
    yearMonth = infoArr[1]

    f = open(os.path.join(dataFolder, dataFilename))
    lines = f.readlines()

    row = []
    
    for idx, line in enumerate(lines):
        line = line[:-1]
        lineSplit = line.split('\t')

        if len(line) != 57:
            invalid[0] = invalid[0] + 1
        elif len(lineSplit[0]) != 10 or lineSplit[0] != userID:
            invalid[1] = invalid[1] + 1
        elif len(lineSplit[1]) != 6 or lineSplit[1][0:4] != yearMonth:
            invalid[2] = invalid[2] + 1
        elif len(lineSplit[2]) != 12: # Timestamp
            invalid[3] = invalid[3] + 1
        elif len(lineSplit[3]) != 1: # Hand
            invalid[4] = invalid[4] + 1
        elif len(lineSplit[4]) != 6: # HoldTime
            invalid[5] = invalid[5] + 1
        elif len(lineSplit[5]) != 2: # Direction
            invalid[6] = invalid[6] + 1
        elif len(lineSplit[6]) != 6: # LatencyTime
            invalid[7] = invalid[7] + 1
        elif len(lineSplit[7]) != 6: # FlightTime
            invalid[8] = invalid[8] + 1
        else:
            rows.append(lineSplit[:-1])
    
    f.close()
print(invalid)
dataDF = pd.DataFrame(rows, columns=colNames)
dataDF.head()

[1354, 0, 0, 7, 0, 0, 0, 0, 0]


Unnamed: 0,UserID,Date,Timestamp,Hand,HoldTime,Direction,LatencyTime,FlightTime
0,0EA27ICBLF,160722,18:41:04.336,L,101.6,LL,234.4,156.3
1,0EA27ICBLF,160722,18:42:14.070,L,85.9,LL,437.5,359.4
2,0EA27ICBLF,160722,18:42:14.273,L,78.1,LL,210.9,125.0
3,0EA27ICBLF,160722,18:42:14.617,L,62.5,LL,359.4,281.3
4,0EA27ICBLF,160722,18:42:15.586,S,125.0,LS,187.5,93.8


In [4]:
df = pd.merge(dataDF,userDF[['Parkinsons']], left_on='UserID', right_index=True, how='left')
df['Parkinsons'] = df['Parkinsons'].map({'False':0, 'True':1})
df = df.dropna()

In [5]:
positiveSamples = df[df['Parkinsons']==1]
negativeSamples = df[df['Parkinsons']==0]
positiveGrouped = positiveSamples.groupby('UserID')
negativeGrouped = negativeSamples.groupby('UserID')
positiveUserIDs = np.array(positiveSamples['UserID'].unique())
negativeUserIDs = np.array(negativeSamples['UserID'].unique())
np.random.shuffle(positiveUserIDs)
np.random.shuffle(negativeUserIDs)

train_ratio = 0.7
valid_ratio = 0.2

total_positive_rows = len(positiveSamples)
train_positive_rows = int(total_positive_rows * train_ratio)
valid_positive_rows = int(total_positive_rows * valid_ratio)
total_negative_rows = len(negativeSamples)
train_negative_rows = int(total_negative_rows * train_ratio)
valid_negative_rows = int(total_negative_rows * valid_ratio)

# Initialize empty DataFrames for training, validation, and testing sets
train_df = pd.DataFrame(columns=df.columns)
valid_df = pd.DataFrame(columns=df.columns)
test_df = pd.DataFrame(columns=df.columns)

train_positive_count, valid_positive_count = 0, 0
train_negative_count, valid_negative_count = 0, 0
train_data = []
valid_data = []
test_data = []

# Iterate over the shuffled user_ids and assign rows to the corresponding set
for userID in positiveUserIDs:
    user_data = positiveGrouped.get_group(userID)
    user_rows = len(user_data)
    
    if train_positive_count + user_rows <= train_positive_rows:
        train_data.append(user_data)
        train_positive_count += user_rows
    elif valid_positive_count + user_rows <= valid_positive_rows:
        valid_data.append(user_data)
        valid_positive_count += user_rows
    else:
        test_data.append(user_data)

for userID in negativeUserIDs:
    user_data = negativeGrouped.get_group(userID)
    user_rows = len(user_data)
    
    if train_negative_count + user_rows <= train_negative_rows:
        train_data.append(user_data)
        train_negative_count += user_rows
    elif valid_negative_count + user_rows <= valid_negative_rows:
        valid_data.append(user_data)
        valid_negative_count += user_rows
    else:
        test_data.append(user_data)

train_df = pd.concat(train_data)
valid_df = pd.concat(valid_data)
test_df = pd.concat(test_data)
valid_test_df = pd.concat([valid_df, test_df]).reset_index(drop=True)

In [6]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

feature_column = 'HoldTime'
label_column = 'Parkinsons'
X_train = train_df[[feature_column]]
y_train = train_df[label_column]

X_test = valid_test_df[[feature_column]]
y_test = valid_test_df[label_column]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm = LinearSVC()
svm.fit(X_train_scaled, y_train)

y_test_pred = svm.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test accuracy: {test_accuracy:.4f}')



Test accuracy: 0.7073
