In [1]:
## Script to analyze Turbo Typing data

Notes on where I'm leaving off:   
- 9/22/24 Just wrote code that removes the extra spaces before characters from the imported data without deleting actual spaces. Next I want to use the collapsed sentences with no extra s
- spaces to use the diff function and compare them.  
- 9/23/2024 Basically same goal for next time. I had to make a new column that replaces the multi-character key names with special characters for the keyData from the participants, which was completed today.
- 9/24/2024 Isolated correct keypress times. Next time, calculate ttk for correct presses, mean ttk, and the rest of the assessment variables. Also ask Patrick the best way to organize and write code.

In [2]:
import os
import numpy as np
import pandas as pd
import math
import glob
import editdistance
from scipy import stats
import diff_match_patch as dmp_module
dmp = dmp_module.diff_match_patch()
import copy

In [3]:
# The imported data from Turbo Typing stores listed data as a long string. 
# This splits it up into actual lists.
def str_to_list(dataframe, columns):
    for col in columns:
        data_str = dataframe[col]
        if type(data_str.iloc[0]) == list:
            pass
        elif type(data_str.iloc[0]) == str:
            if data_str.iloc[0][0].isdigit() == True:
                data_list = data_str.apply(lambda trial: 
                                           [int(char) for char in trial.split(',')])
                dataframe[col] = data_list
            elif data_str.iloc[0][0].isdigit() == False:
                data_list = [x.split(',') for x in data_str]
                dataframe[col] = data_list
        else:
            print('Column must have data in string form.')

# Removes extra spaces before characters in keypress data.
def no_extra_spaces(dataframe, columns):
    for col in columns:
        no_space = dataframe[col].apply(lambda trial: 
                                        [char.replace(" ","") if char.isspace() 
                                         != True else ' ' for char in trial])
        dataframe[col] = no_space

# Changes multi-chracter keynames (ie. LeftShift and Backspace) to special
# characters.
def multi_to_special(dataframe, column):
    data = dataframe[column]
    edit_data = []
    for trial in data:
        edit_trial = copy.deepcopy(trial) # Pandas can't deep copy lists in dataframes, so this is a work around.
        for index, key in enumerate(edit_trial):
            if len(key) == 1:
                pass
            if len(key) > 1:
                if key in ['LeftShift', 'RightShift']:
                    edit_trial[index] = '#'
                elif key == 'Backspace':
                    edit_trial[index] = '-'
                elif key == 'Return':
                    edit_trial[index] = '^'
                else:
                    edit_trial[index] = '~' # '~' replaces any other keys longer than one character.
        edit_data.append(edit_trial)
    return edit_data

# Calculates time to keypress (TTK) for all presses in one trial.
# TTK is the reaction time for the first press and the interkey 
# interval for any subsequent presses.
def ttk(row, column):
    presses = pd.DataFrame(row.loc[column])
    ttk_df = presses.sub(presses.shift(fill_value=0))
    ttk_list = ttk_df.values.flatten().tolist()
    return ttk_list

# Creates a list of keys that would have been pressed if the sentence was typed
# with no errors (ie. no backspacing). Uses single, special characters to 
# represent multi-character key names (ie. '^' for 'Return').
def sentence_to_keydata(string):
    curr_keyData = []
    for index, data in enumerate(string):
        if data.isupper() == True:
            curr_keyData.append('#') # '#' Replaces 'Right/LeftShift'
            curr_keyData.append(data)
        elif data.isupper() == False and data.isalpha() or data.isspace() == True:
            curr_keyData.append(data)
        elif data == '?':
            curr_keyData.append('#')
            curr_keyData.append(data)
            curr_keyData.append('^')
        elif data == '.':
            curr_keyData.append(data)
            curr_keyData.append('^') # '^' Replaces 'Return'
    return curr_keyData

# # Splits up keyDataConverted data from a string to a list of pressed keys.
# def split_str(row):
#     keys_str = row.loc['keyDataConverted']
#     keys_list = keys_str.split(',')
#     return keys_list

In [4]:
# Setting save directory
save_dir = os.getcwd()
save_dir

'/Users/rubi/Desktop/Github/typingexp/typing_task_analysis'

In [5]:
# Getting subject data folders from server.
server = r'/Volumes/greenhouse/typingtask_data/subject_data'
os.chdir(server)
folders = os.listdir()

# Looping through subject folders, getting appropriate paths to data, 
# and making sID list.
sub_folders = list(filter(lambda x: x.startswith('s', 0, 1), folders))
all_turbo = pd.DataFrame()
ID_list = []
for sub in sub_folders:
    sub_folder = r'/Volumes/greenhouse/typingtask_data/'\
                 'subject_data/%s/turbotyping_data' % sub
    os.chdir(sub_folder)
    sID = sub.split('_', 1)[0]
    turbo = pd.read_csv(glob.glob('*_datafile.tsv')[0], sep='\t')
    all_turbo = pd.concat([all_turbo, turbo])
    ID_list.append(sID)

all_turbo = all_turbo.reset_index()

In [6]:
# Transforms listed data stored in long strings to actual lists.
str_to_list(all_turbo, ['timeData', 
                        'keyData', 
                        'keyDataConverted', 
                        'timeDataUp',
                        'keyDataUp'])

# Removes extra spaces before characters in keypress data columns.
no_extra_spaces(all_turbo, ['keyData', 'keyDataConverted', 'keyDataUp'])

# Replaces multi-character key names with single, special characters from key
# press data typed by participants. 
no_multichar_keys = multi_to_special(all_turbo, 'keyDataConverted')
all_turbo.insert(11, 'keyDataSpecialChar', no_multichar_keys)

# Creates column that represents the correct key press sequence if there were
# no errors (uses special characters for mtuli-character key names).
current_keyData = [sentence_to_keydata(x) for x in all_turbo['currentSentence']]
all_turbo.insert(8, 'keyDataCurrent', current_keyData)

In [7]:
# Compares the correct sequence of keypresses with what was typed during the 
# trial and returns the difference between the two.
string_diff = all_turbo.apply(lambda row: 
                            dmp.diff_main(''.join(row.loc['keyDataSpecialChar']), 
                                          ''.join(row.loc['keyDataCurrent'])),
                            axis=1)

# Separates the returned differences into individual characters.
diffs_bychar = []
for trial in string_diff:
    diff_bychar_trial = []
    for diff in trial:
        if len(diff[1]) == 1:
            diff_bychar_trial.append([diff[0], diff[1]])
        elif len(diff[1]) > 1:
            for char in diff[1]:
                diff_bychar_trial.append([diff[0], char])
    diffs_bychar.append(diff_bychar_trial)

# Differences for errors where participants swap two letters are solved as: 
# letter 2 subtraction, letter 1 retention, letter 2 addition. Because  
# both letters 1 and 2 need to be identified as incorrect keypresses, this code 
# changes the difference to show: letter 2 subtraction, letter 1 subtraction.
diffs_bychar_fixswap = []
for trial in diffs_bychar:
    chardiffs_trial = []
    char_toskip = 0
    for index, char in enumerate(trial):
        if char_toskip != 0:
            char_toskip = char_toskip - 1
            continue
        if char[0] == 0:
            if index == (len(trial) - 1) or index == 0:
                chardiffs_trial.append(char)
            else:
                if trial[index - 1][0] == -1 and trial[index + 1][0] == 1:
                    if trial[index - 1][1] == trial[index + 1][1]:
                        chardiffs_trial.append([-1, char[1]])
                        char_toskip = 1
                else:
                    chardiffs_trial.append(char)
        else:
            chardiffs_trial.append(char)
    diffs_bychar_fixswap.append(chardiffs_trial)

In [8]:
corr_keys = []
corr_indices = []
for trial in diffs_bychar_fixswap:
    corr_keys_trial = []
    corr_indices_trial = []
    for index, char in enumerate(trial):
        if char[0] == 0:
            corr_keys_trial.append(char[1])
            corr_indices_trial.append(index)
        else:
            pass
    corr_keys.append(corr_keys_trial)
    corr_indices.append(corr_indices_trial)
    
all_turbo.insert(12, 'keyDataCorrect', corr_keys)
all_turbo.insert(13, 'keyIndicesCorrect', corr_indices)

In [9]:
# Isolates keypress times for correct keypresses.
def corr_time(trial):
    times = trial.loc['timeData']
    indices = trial.loc['keyIndicesCorrect']
    matched_indices =[]
    for index, time in enumerate(times):
        if index in indices:
            matched_indices.append(time)
    return matched_indices

# Isolates keypress times for correct keypresses and adds to a new column.
corr_times = all_turbo.apply(corr_time, axis=1)
all_turbo.insert(10, 'timeDataCorrect', corr_times)

In [10]:
# Calculates edit distance of each trial and stores to an additional column.
edit_dist = all_turbo.apply(lambda row: 
                            editdistance.eval(row.loc['currentSentence'], 
                                              row.loc['typedSentence']), 
                            axis=1)
all_turbo['editDistance'] = edit_dist

# Calculates TTK for every character typed in each trial and stores to 
# an additional column.
ttks = all_turbo.apply(lambda row: ttk(row, 'timeData'), axis=1)
all_turbo['ttk'] = ttks

# Calculates TTK for every character correctly typed in each trial and stores to 
# an additional column.
ttks_corr = all_turbo.apply(lambda row: ttk(row, 'timeDataCorrect'), axis=1)
all_turbo['ttkCorrect'] = ttks_corr

# Calculates mean TTK for each trial and stores to an additional column.
mean_ttk = [np.mean(x) for x in all_turbo['ttk']]
all_turbo['ttkMean'] = mean_ttk

# Calculates mean correct TTK for each trial and stores to an additional column.
mean_ttk_corr = [np.mean(x) for x in all_turbo['ttkCorrect']]
all_turbo['ttkMeanCorrect'] = mean_ttk_corr

# Calculates sum of all TTKs for each trial and stores to an additional column.
sum_ttk = [np.sum(x) for x in all_turbo['ttk']]
all_turbo['ttkSum'] = sum_ttk
all_turbo['ttkMean']

# Calculates sum of all correct TTKs for each trial and stores to an additional 
# column.
sum_ttk_corr = [np.sum(x) for x in all_turbo['ttkCorrect']]
all_turbo['ttkSumCorrect'] = sum_ttk_corr

In [22]:
# Calculates IES
def ies(row):
    mean_corr_rt = row['ttkMeanCorrect']
    pc = len(row['ttkCorrect'])/len(row['ttk'])
    return mean_corr_rt/pc

ies = all_turbo.apply(lambda row: ies(row), 
                      axis=1)
all_turbo['ies'] = ies

In [23]:
all_turbo

Unnamed: 0,index,participantID,trialNumber,currentSentence,typedSentence,trialStart,wpm,accuracy,keyDataCurrent,timeData,...,ttk,ttkCorrect,ttkMean,ttkMeanCorrect,ttkSum,ttkSumCorrect,ies,rcs,lisas,bis
0,0,262,-2,Looking for some gift hints from you.,Looking for some gift hints from you.,1/23/2024 9:17:58 AM,86,100,"[#, L, o, o, k, i, n, g, , f, o, r, , s, o, ...","[21454, 21906, 22156, 22284, 22371, 22611, 226...",...,"[21454, 452, 250, 128, 87, 240, 17, 120, 79, 1...","[21454, 452, 250, 128, 87, 240, 17, 120, 79, 1...",651.195122,684.589744,26699,26699,719.696910,0.001461,684.589744,
1,1,262,-1,I can't tell you when they'll be back.,I can't tell you when they'll be back.,1/23/2024 9:18:28 AM,63,100,"[#, I, , c, a, n, t, , t, e, l, l, , y, o, ...","[22442, 22633, 22968, 23041, 23498, 23809, 239...",...,"[22442, 191, 335, 73, 457, 311, 136, 104, 57, ...","[22442, 191, 1312, 104, 57, 135, 441, 95, 201,...",665.681818,770.789474,29290,29290,892.493075,0.001297,770.789474,
2,2,262,0,Our forces are ready to come immediately.,Our forces are ready to come immedaitely.,1/23/2024 9:19:04 AM,80,95,"[#, O, u, r, , f, o, r, c, e, s, , a, r, e, ...","[927, 1118, 1311, 1494, 1631, 1718, 1854, 1903...",...,"[927, 191, 193, 183, 137, 87, 136, 49, 103, 19...","[1118, 193, 183, 137, 87, 136, 49, 103, 192, 1...",147.954545,158.780488,6510,6510,170.398572,0.006298,158.780488,
3,3,262,1,A bad thing has been turned into a good thing.,A bad thing has been turned into a good thing.,1/23/2024 9:19:12 AM,108,100,"[#, A, , b, a, d, , t, h, i, n, g, , h, a, ...","[586, 857, 970, 1146, 1281, 1346, 1458, 1554, ...",...,"[586, 271, 113, 176, 135, 65, 112, 96, 80, 72,...","[586, 271, 113, 176, 135, 65, 112, 96, 80, 72,...",110.541667,110.541667,5306,5306,110.541667,0.009046,,
4,4,262,2,Do you want to eat lunch somewhere before?,Do you want to eat lunch somewhere before?,1/23/2024 9:19:19 AM,98,100,"[#, D, o, , y, o, u, , w, a, n, t, , t, o, ...","[530, 730, 858, 962, 1065, 1130, 1194, 1266, 1...",...,"[530, 200, 128, 104, 103, 65, 64, 72, 95, 65, ...","[530, 200, 128, 104, 103, 65, 64, 72, 95, 65, ...",114.533333,114.533333,5154,5154,114.533333,0.008731,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,12,s313_11152024,10,I am snowed under and out of the office.,I am snowed under and out of the office.,11/15/2024 3:45:26 PM,52,100,"[#, I, , a, m, , s, n, o, w, e, d, , u, n, ...","[666, 747, 795, 1026, 1163, 1587, 2539, 2588, ...",...,"[666, 81, 48, 231, 137, 424, 952, 49, 151, 152...","[666, 81, 48, 231, 137, 424, 952, 49, 151, 152...",198.347826,217.238095,9124,9124,237.927438,0.004603,217.238095,
421,13,s313_11152024,11,She started singing along softly with the radio.,She started singing along softly with the radio.,11/15/2024 3:45:36 PM,70,100,"[#, S, h, e, , s, t, a, r, t, e, d, , s, i, ...","[665, 841, 961, 1057, 1178, 1458, 1601, 1810, ...",...,"[665, 176, 120, 96, 121, 280, 143, 209, 55, 19...","[665, 176, 120, 96, 121, 280, 143, 209, 55, 19...",167.240000,167.240000,8362,8362,167.240000,0.005979,,
422,14,s313_11152024,12,It must be accompanied by increased productivity.,It must be accompanied by increased productivity.,11/15/2024 3:45:46 PM,64,100,"[#, I, t, , m, u, s, t, , b, e, , a, c, c, ...","[610, 714, 1170, 1266, 1426, 1594, 1706, 1906,...",...,"[610, 104, 456, 96, 160, 168, 112, 200, 128, 3...","[610, 104, 456, 96, 160, 168, 112, 200, 128, 3...",190.176471,190.176471,9699,9699,190.176471,0.005258,,
423,15,s313_11152024,13,When it is used we try and do as many paths as...,When it is used we try and do as many paths as...,11/15/2024 3:45:58 PM,60,100,"[#, W, h, e, n, , i, t, , i, s, , u, s, e, ...","[578, 786, 946, 1066, 1194, 1282, 1418, 1538, ...",...,"[578, 208, 160, 120, 128, 88, 136, 120, 128, 5...","[578, 208, 160, 120, 128, 88, 136, 120, 128, 5...",192.183333,198.810345,11531,11531,205.665874,0.005030,198.810345,


In [12]:
# Calculates RCS
rcs = all_turbo.apply(lambda row: 
                      len(row.loc['ttkCorrect'])/row.loc['ttkSum'],
                      axis=1)
all_turbo['rcs'] = rcs

In [13]:
# Calculates LISAS
def lisas(row):
    rt_mean = row.loc['ttkMeanCorrect']
    rt_std = np.std(row.loc['ttkCorrect'])
    pe = 1-(len(row['ttkCorrect'])/len(row['ttk']))
    pe_std = np.sqrt(pe*(1-pe))
    return rt_mean + (np.std(rt_mean)/pe_std)*pe

lisas_output = all_turbo.apply(lambda row: lisas(row), axis=1)
all_turbo['lisas'] = lisas_output

  return rt_mean + (np.std(rt_mean)/pe_std)*pe


In [14]:
len(ID_list)

25

In [15]:
# Calculates BIS
# by_sub = all_turbo.groupby('participantID')
z_score = stats.zscore
# by_sub = pd.DataFrame(all_turbo.groupby('participantID').filter(lambda x: print(x['trialNumber'])))
ttkmean_bysub = pd.DataFrame(all_turbo[all_turbo['trialNumber'] >= 0].groupby('participantID').ttkMean)
# totalttkmean_bysub = ttkmean_bysub
# by_sub = by_sub.reset_index().rename(columns = {'ttk':'all_ttk'})
# ttkmean_bysub
# test = ttkmean_bysub.
# test = ttkmean_bysub[1][0].mean()
# test
test = ttkmean_bysub[1].apply(lambda x: stats.zscore(x))
# stats.zscore(ttkmean_bysub[1][0])
# ttkmean_bysub
# by_sub[1][0]
# test
# by_sub['all_ttk'].apply(stats.zscore)
test = test.transpose()
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
19,0.828666,,,,,,,,,,...,,,,,,,,,,
20,0.829338,,,,,,,,,,...,,,,,,,,,,
21,-0.530023,,,,,,,,,,...,,,,,,,,,,
22,-0.076468,,,,,,,,,,...,,,,,,,,,,
23,-0.066036,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,,,,,,,,,,,...,,,,,,,,,,1.198170
421,,,,,,,,,,,...,,,,,,,,,,-1.167157
422,,,,,,,,,,,...,,,,,,,,,,0.576849
423,,,,,,,,,,,...,,,,,,,,,,0.729444


In [16]:
all_turbo['bis'] = np.nan
all_turbo['bis'][0]

nan

In [17]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
19,0.828666,,,,,,,,,,...,,,,,,,,,,
20,0.829338,,,,,,,,,,...,,,,,,,,,,
21,-0.530023,,,,,,,,,,...,,,,,,,,,,
22,-0.076468,,,,,,,,,,...,,,,,,,,,,
23,-0.066036,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,,,,,,,,,,,...,,,,,,,,,,1.198170
421,,,,,,,,,,,...,,,,,,,,,,-1.167157
422,,,,,,,,,,,...,,,,,,,,,,0.576849
423,,,,,,,,,,,...,,,,,,,,,,0.729444


In [18]:
for row in test.index: 
    bis_val = test[row]
    print(bis_val)
    # all_turbo.loc['bis'][row] = bis_val 

19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
       ..
420   NaN
421   NaN
422   NaN
423   NaN
424   NaN
Name: 19, Length: 375, dtype: float64
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
       ..
420   NaN
421   NaN
422   NaN
423   NaN
424   NaN
Name: 20, Length: 375, dtype: float64
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
       ..
420   NaN
421   NaN
422   NaN
423   NaN
424   NaN
Name: 21, Length: 375, dtype: float64
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
       ..
420   NaN
421   NaN
422   NaN
423   NaN
424   NaN
Name: 22, Length: 375, dtype: float64
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
       ..
420   NaN
421   NaN
422   NaN
423   NaN
424   NaN
Name: 23, Length: 375, dtype: float64
19          NaN
20          NaN
21          NaN
22          NaN
23          NaN
         ...   
420    1.198170
421   -1.167157
422    0.576849
423    0.729444
424   -0.735151
Name: 24, Length: 375, dtype: float64


KeyError: 25

In [None]:
subject_scores = pd.DataFrame()
subject_scores['sID'] = ID_list
subject_scores['sID'].sort_values().reset_index(drop = True)

In [None]:
# Calculates rate correct score (RCS) for each trail and stores in column.
rcs = all_turbo.apply(lambda row: 
                      row.loc['numCorr']/row.loc['ttkSum'],
                      axis=1)
all_turbo['rcs'] = rcs

# Calculates linear integrated speed-ccuracy score (LISAS) and stores in column.
# lisas = 

In [None]:



# for trial in data_str:
#     data_list = [int(x) for x in trial.split(',')]
# data_list
# for trial in data_list:
#     data_int = 
# data_list[0]
# data_int = [print(x) for x in data_list]
# data_int

# edit_dist = all_turbo.apply(lambda row: 
#                             editdistance.eval(row.loc['currentSentence'], 
#                             row.loc['typedSentence']), 
#                             axis=1)