In [1]:
import numpy as np
import pandas as pd
import ast
import math
import matplotlib.pyplot as plt
from scipy.stats import variation
import glob
import os
import typingmod as typ

In [2]:
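## Mount the ION server share (macOS `osascript`; kept commented out so it only runs on demand).
## Supply the real credentials at run time -- do not commit an actual password to the notebook.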
# os.system("osascript -e 'mount volume \"smb://ion-nas.uoregon.edu\" \
#           as user name \"greenhouse\" with password \"password\"'")

In [3]:
## defining function to organize bigrams into rows
def bigram_byrow(keys_df=None, trials_df=None):
    """Build one record per typed bigram (pair of consecutive keypresses).

    Parameters
    ----------
    keys_df : pandas.DataFrame, optional
        One row per trial and one column per keypress position; columns are
        looked up by the integer labels 0, 1, 2, ...  Cells hold the raw key
        text (quotes/spaces allowed) or a missing value (None/NaN) when no key
        was pressed at that position.  Defaults to the notebook-level
        ``keys_intocolumns``.
    trials_df : pandas.DataFrame, optional
        Frame sharing ``keys_df``'s index that provides the columns
        'key_resp.rt.<k>' (k is 1-based), 'string' and 'resp_string'.
        Defaults to the notebook-level ``main_df``.

    Returns
    -------
    list of list
        One ``[trial index, bigram position, typed bigram, IKI, string,
        resp_string]`` entry per adjacent pair of pressed keys, where IKI is
        the RT of the second key minus the RT of the first key.

    Raises
    ------
    KeyError
        If ``trials_df`` lacks an RT column for a keypress position that is
        present in ``keys_df``.
    """
    # Fall back to the notebook globals so existing `bigram_byrow()` calls keep working.
    if keys_df is None:
        keys_df = keys_intocolumns
    if trials_df is None:
        trials_df = main_df

    # A bigram starts at every column except the last one.
    n_starts = len(keys_df.columns) - 1

    bigrams = []
    for index in keys_df.index:
        for column in range(n_starts):
            first_key = keys_df.at[index, column]
            second_key = keys_df.at[index, column + 1]
            # The previous guard (`x != None and float('nan')`) never tested for
            # NaN -- `float('nan')` is merely a truthy operand -- so a NaN cell
            # slipped through and crashed on `str + float`.  Test both explicitly.
            if pd.isna(first_key) or pd.isna(second_key):
                continue
            bigram = (first_key + second_key).replace("'", "").replace(" ", "")
            # RT columns are 1-based: position `column` maps to 'key_resp.rt.<column + 1>'.
            iki = (trials_df.at[index, 'key_resp.rt.%d' % (column + 2)]
                   - trials_df.at[index, 'key_resp.rt.%d' % (column + 1)])
            bigrams.append([index, column, bigram, iki,
                            trials_df.at[index, 'string'],
                            trials_df.at[index, 'resp_string']])
    return bigrams

## defining function that separates words in to bigrams
def bi_byword(word):
    """Return the overlapping two-character pieces of `word`, left to right.

    A word of length n yields n - 1 bigrams; words shorter than two
    characters yield an empty list.
    """
    # Pair every character with its right-hand neighbour.
    return [left + right for left, right in zip(word, word[1:])]

## defining function that separates all words into bigrams
def bi_allwords(words=None):
    """Split every word into its list of bigrams.

    Parameters
    ----------
    words : iterable of str, optional
        Words to split.  Defaults to the notebook-level ``df['string']`` so
        existing ``bi_allwords()`` calls behave as before; passing the words
        explicitly removes the dependency on that hidden global.

    Returns
    -------
    list of list of str
        One list of bigrams (as produced by ``bi_byword``) per input word,
        in input order.
    """
    if words is None:
        words = df['string']
    return [bi_byword(word) for word in words]

In [4]:
## build the trial-based and bigram-based dataframes for each subject ##

## locating the experiment data on the mounted share
server = r'/Volumes/greenhouse/typingtask_data/subject_data'
server_noturbo = r'/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/'
os.chdir(server)
folders = os.listdir()  # no argument: lists the current directory, i.e. `server`

# subject folders are the entries whose first character is 's'
sub_folders = [entry for entry in folders if entry.startswith('s', 0, 1)]
# For each subject folder: read the PsychoPy CSV, build a by-trial frame (main_df) and a
# by-bigram frame (bigram_df), and write both as CSVs into an 'edited' subfolder.
for sub in sub_folders:
    # The relative glob below depends on this chdir into the subject's psychopy_data folder.
    sub_folder = r'/Volumes/greenhouse/typingtask_data/subject_data/%s/psychopy_data/' % sub
    os.chdir(sub_folder)
    # Subject ID is the part of the folder name before the first underscore.
    sID = sub.split('_', 1)[0]
    # NOTE(review): reads the first '*.csv' match; glob returns matches in arbitrary order and
    # [0] raises IndexError on an empty folder -- confirm each folder holds exactly one CSV.
    og_df = pd.read_csv(glob.glob('*.csv')[0])   

## alternative loop header for subjects without turbotyping data (swap in for the header above)
# sub_folders = list(filter(lambda x: x.startswith('s', 0, 1), folders))
# for sub in sub_folders:
#     sub_folder = server_noturbo + r'%s/psychopy_data/' % sub
#     os.chdir(sub_folder)
#     sID = sub.split('_', 1)[0]
#     og_df = pd.read_csv(glob.glob('*.csv')[0])  
   
    ## deleting first 3 practice trials -- EDIT FOR ANY TRIALS YOU WANT TO IMMEDIATELY EXCLUDE
    df = (og_df.drop(labels=[0, 1, 2], axis=0)).reset_index(drop = True) 
    
    ## expanding nested key_resp.rt values into separate columns, making new dataframe, and turning values back into floats from strings
    # NOTE(review): rows with a key_resp_1 value are stacked ahead of rows with a key_resp_2 value
    # and then renumbered 0..n-1. The later index-aligned join with df['string'] therefore assumes
    # every key_resp_1 trial precedes every key_resp_2 trial and that each trial has exactly one
    # of the two -- confirm against the task design.
    stripped_rts_1 = ((df['key_resp_1.rt'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    stripped_rts_2 = ((df['key_resp_2.rt'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    rts_intocolumns = (pd.concat([stripped_rts_1, stripped_rts_2])).reset_index(drop = True)
    
    ## renames rt columns to automatically match dataset
    # Integer column n becomes 'key_resp.rt.<n+1>' (1-based); the rename just before the loop
    # duplicates the loop's n == 0 pass.
    DF = rts_intocolumns
    renamed_rt = DF.rename(columns = { 0:'key_resp.rt.%s' %(0+1) })
    for n in range(0, len(DF.columns)):
        renamed_rt = renamed_rt.rename(columns = { n:'key_resp.rt.%s' %(n+1) })
    expanded_rts = renamed_rt.astype(float).fillna(0) ##replacing NaNs with zeroes

    ## expanding nested key_resp.keys values into separate columns and making new dataframe
    # Same stack-then-renumber layout as the RT frame above, so rows of the two frames line up.
    stripped_keys_1 = ((df['key_resp_1.keys'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    stripped_keys_2 = ((df['key_resp_2.keys'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    keys_intocolumns = (pd.concat([stripped_keys_1, stripped_keys_2])).reset_index(drop = True)
    keys_intocolumns = keys_intocolumns.where(pd.notnull(keys_intocolumns), None) 
        # ^ also replaces any added NaNs with Nones

    ## renames key columns to automatically match dataset
    # Integer column n becomes 'key_resp.keys.<n+1>'; keys_intocolumns itself keeps its integer
    # labels, which bigram_byrow() relies on.
    DF = keys_intocolumns
    expanded_keys = DF.rename(columns = { 0:'key_resp.keys.%s' %(0+1) })
    for n in range(0, len(DF.columns)):
        expanded_keys = expanded_keys.rename(columns = { n:'key_resp.keys.%s' %(n+1) })

    ## getting rid of apostrophes and spaces in key values
    # Iterating a DataFrame yields its column labels, so `col` is each key column name in turn.
    cols_to_change = (expanded_keys.iloc[:, 0:])
    for col in cols_to_change:
        expanded_keys[col] = expanded_keys[col].str.replace("'", "")
        expanded_keys[col] = expanded_keys[col].str.replace(" ", "")

    ## combining key_resp.keys into one simple string to easily represent typed responses
    # The regex removes quotes, commas and spaces; strip("[]") then removes the enclosing brackets.
    responses_1 = pd.DataFrame((df['key_resp_1.keys'].str.replace("[', ]", "", regex=True).str.strip("[]")).dropna()).rename(columns = {'key_resp_1.keys':'resp_string'})
    responses_2 = pd.DataFrame((df['key_resp_2.keys'].str.replace("[', ]", "", regex=True).str.strip("[]")).dropna()).rename(columns = {'key_resp_2.keys':'resp_string'})
    responses = (pd.concat([responses_1, responses_2])).reset_index(drop = True)

    ## identifying bigrams in words to add to larger dataframe
    # bi_allwords() reads df['string']. Assigning exactly four column names requires the frame
    # to have four columns -- presumably every target string is five letters long; verify.
    task_bigrams = pd.DataFrame(bi_allwords())
    task_bigrams.columns = ['bi_1', 'bi_2', 'bi_3', 'bi_4']
    
    ## combining expanded rt, expanded keys, and response string values with column for strings typed each trial to create more useful dataframe
    ## (does not have all the random timing data of other events occuring during the task)
    main_df = pd.concat([responses, task_bigrams, expanded_keys, expanded_rts], axis = 1)
    # The trailing True is insert()'s allow_duplicates argument.
    main_df.insert(0, 'string', df['string'], True)

    ## creating column for WF type for each trial
    # The checks are independent `if`s, so a string listed in more than one typ.* collection
    # ends up with the label of the last matching check.
    main_df['wf_type'] = ""
    for index, data in main_df.iterrows():
        if main_df.loc[index, 'string'] in typ.highwf:
            main_df.loc[index, 'wf_type'] = 'highwf'
        if main_df.loc[index, 'string'] in typ.medwf:
            main_df.loc[index, 'wf_type'] = 'medwf'
        if main_df.loc[index, 'string'] in typ.lowwf:
            main_df.loc[index, 'wf_type'] = 'lowwf'
        if main_df.loc[index, 'string'] in typ.pseudo:
            main_df.loc[index, 'wf_type'] = 'pseudo'

    ## creating column for BF type for each trial
    # Same last-match-wins pattern as wf_type; strings in none of the lists keep "".
    main_df['meanbf_type'] = ""
    for index, data in main_df.iterrows():
        if main_df.loc[index, 'string'] in typ.avg_highbf:
            main_df.loc[index, 'meanbf_type'] = 'highbf'
        if main_df.loc[index, 'string'] in typ.avg_medbf:
            main_df.loc[index, 'meanbf_type'] = 'medbf'
        if main_df.loc[index, 'string'] in typ.avg_lowbf:
            main_df.loc[index, 'meanbf_type'] = 'lowbf'

    ## creating column for trial (useful for group analysis)
    # trial_num is a copy of the row index labels.
    trial_nums = []
    for index, data in main_df.iterrows():
        trial_nums.append(index)
    main_df.insert(0, 'trial_num', trial_nums)

    ## creating column for subject ID (also useful for group analysis)
    main_ID = [sID]*len(main_df)
    main_df.insert(0, 'sID', main_ID)

    ## creating columns for word repetition number
    # insert() fixes the column position; cumcount() then fills it with the 0-based number of
    # earlier rows having the same sID and string.
    main_df.insert(2, 'rep_num', '')
    main_df['rep_num'] = main_df.groupby(['sID', 'string']).cumcount()

    ## creating column for if trial is correct or not
    # "corr" where the typed response equals the target string, "incorr" otherwise.
    main_df['trial_corr'] = ''
    corr_trials = (main_df[main_df.string 
                   == main_df.resp_string])
    corr_indices = list(corr_trials.index.values)
    main_df.loc[corr_indices, 'trial_corr'] = "corr"
    
    incorr_trials = (main_df[main_df.string 
                     != main_df.resp_string])
    incorr_indices = list(incorr_trials.index.values)
    main_df.loc[incorr_indices, 'trial_corr'] = "incorr"
    
    ## making csv from dataframe
    # Written to <sub_folder>/edited/<sID>_bytrial.csv; the folder is created on first use.
    edited_path = os.path.join(sub_folder, 'edited')
    if os.path.exists(edited_path) == False:
        os.mkdir(edited_path)
    bytrial_path = os.path.join(edited_path, '%s_bytrial.csv' % sID)
    print(bytrial_path)
    main_df.to_csv(bytrial_path)

    
    ## BIGRAM DATAFRAME ##
    # bigram_byrow() is called without arguments and works from the keys_intocolumns and
    # main_df built above; its six list positions are named here.
    bigram_df = (pd.DataFrame(bigram_byrow())).rename(columns={0: "trial_num", 1: "bigram_loc",  2:"resp_bigram", 3: "IKI", 4: "string", 5: "resp_string"})

    ## creating column for bigram # (useful for group analysis)
    # bigram_num is a copy of the row index labels.
    bigram_nums = []
    for index, data in bigram_df.iterrows():
        bigram_nums.append(index)
    bigram_df.insert(0, 'bigram_num', bigram_nums)

    ## creating column for subject ID (also useful for group analysis)
    bigram_ID = [sID]*len(bigram_df)
    bigram_df.insert(0, 'sID', bigram_ID)

    ## creating column for correct bigram (as opposed to the typed bigram)
    # Positions 0-3 get the target string's bigram at that position; any later position
    # (extra keypresses) gets ''.
    # NOTE(review): bi_byword(...)[loc] raises IndexError for a target string with fewer
    # than loc + 2 characters -- assumes five-letter strings; confirm.
    bigram_df.insert(4, 'bigram', '')
    for index, row in bigram_df.iterrows():
        loc = bigram_df.loc[index, 'bigram_loc']
        loc_list = [0, 1, 2, 3]
        if loc in loc_list:
            corr = bi_byword(bigram_df.loc[index, 'string'])[loc]
        else:
            corr = ''
        bigram_df.loc[index, 'bigram'] = corr

    ## creating column for rep #
    # 0-based count of earlier rows with the same sID, string and target bigram.
    bigram_df.insert(3, 'rep_num', '')
    bigram_df['rep_num'] = bigram_df.groupby(['sID', 'string', 'bigram']).cumcount()
    
    ## creating column for bigram frequency
    # NOTE(review): this read does not depend on `sub`, yet it is repeated for every subject,
    # and the path is an absolute location on one machine -- consider hoisting it above the
    # loop and making the path configurable.
    bg_freqs = pd.read_csv(r'/Users/rubi/Desktop/Github/typingexp/typing_task_analysis/bg_freqs.csv') ## EDIT TO MAKE USEFUL ON OTHER COMPUTERS
    bg_freqs.drop(columns = ['Unnamed: 0'], inplace = True)
    freq_dict = bg_freqs.set_index('Bigrams')['Frequency'].to_dict()
    # Bigrams missing from the lookup (including the '' placeholder, unless listed) map to NaN.
    bigram_df['bg_freq'] = bigram_df['bigram'].map(freq_dict)

    ## creating column for bigram type
    # Labels are assigned by position in each typ.* sequence, so this assumes the sequences are
    # ordered high, med, low(, pseudo) -- TODO confirm in typingmod.
    # NOTE(review): main_df labels these columns 'highwf'/'highbf' etc., while bigram_df uses
    # the bare 'high'/'med'/'low'/'pseudo' -- confirm the difference is intended.
    name_list = ['high', 'med', 'low', 'pseudo']

    # Rows whose bigram appears in none of the typ.bf_types entries are left without a label.
    for index, bf_type in enumerate(typ.bf_types):
        by_bf = bigram_df[bigram_df.bigram.isin(bf_type)]
        rows = by_bf.index
        bigram_df.loc[rows, 'bf_type'] = name_list[index]

    ## creating a column for mean bigram type
    for index, avgbf_type in enumerate(typ.avgbf_types):
        by_bf = bigram_df[bigram_df.string.isin(avgbf_type)]
        rows = by_bf.index
        bigram_df.loc[rows, 'meanbf_type'] = name_list[index]

    ## creating a column for word frequency type
    for index, wf_type in enumerate(typ.wf_types):
        by_wf = bigram_df[bigram_df.string.isin(wf_type)]
        rows = by_wf.index
        bigram_df.loc[rows, 'wf_type'] = name_list[index]

    ## creating column for if trial is correct or not
    # Trial-level correctness repeated on every bigram row of that trial.
    bigram_df['trial_corr'] = ''
    corr_trials_bybg = (bigram_df[bigram_df.string 
                   == bigram_df.resp_string])
    corr_indices_bybg = list(corr_trials_bybg.index.values)
    bigram_df.loc[corr_indices_bybg, 'trial_corr'] = "corr"
    
    incorr_trials_bybg = (bigram_df[bigram_df.string 
                     != bigram_df.resp_string])
    incorr_indices_bybg = list(incorr_trials_bybg.index.values)
    bigram_df.loc[incorr_indices_bybg, 'trial_corr'] = "incorr"

    ## creating column for if bigram is correct or not
    # "corr" where the typed bigram equals the target bigram at the same position.
    bigram_df['bg_corr'] = ''
    corr_bgs = (bigram_df[bigram_df.bigram 
                   == bigram_df.resp_bigram])
    corr_bg_indices = list(corr_bgs.index.values)
    bigram_df.loc[corr_bg_indices, 'bg_corr'] = "corr"
    
    incorr_bgs = (bigram_df[bigram_df.bigram 
                     != bigram_df.resp_bigram])
    incorr_bg_indices = list(incorr_bgs.index.values)
    bigram_df.loc[incorr_bg_indices, 'bg_corr'] = "incorr"

    ## making csv from dataframe
    # Written next to the by-trial file as <sID>_bybigram.csv.
    bybigram_path = os.path.join(edited_path, '%s_bybigram.csv' % sID)
    print(bybigram_path)
    bigram_df.to_csv(bybigram_path)

/Volumes/greenhouse/typingtask_data/subject_data/s262_01232024/psychopy_data/edited/s262_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/s262_01232024/psychopy_data/edited/s262_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/s261_12122023/psychopy_data/edited/s261_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/s261_12122023/psychopy_data/edited/s261_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/s240_11162023/psychopy_data/edited/s240_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/s240_11162023/psychopy_data/edited/s240_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/s217_11092023/psychopy_data/edited/s217_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/s217_11092023/psychopy_data/edited/s217_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/s176_10262023/psychopy_data/edited/s176_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/s176_10262023/psychopy_data/edited/s17

In [5]:
# Preview of the by-trial frame. main_df is reassigned on every pass of the subject loop,
# so this shows only the last subject processed.
# NOTE(review): the saved output below has a `corr_type` column and an empty `trial_corr`,
# which the processing cell does not produce -- the output looks stale; re-run to refresh.
main_df

# Uncomment to print every row and column instead of the truncated preview:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can also be specified
#     print(main_df)

Unnamed: 0,sID,trial_num,rep_num,string,resp_string,bi_1,bi_2,bi_3,bi_4,key_resp.keys.1,...,key_resp.rt.1,key_resp.rt.2,key_resp.rt.3,key_resp.rt.4,key_resp.rt.5,key_resp.rt.6,wf_type,meanbf_type,trial_corr,corr_type
0,s305,0,0,think,think,th,hi,in,nk,t,...,0.777044,1.009041,1.113209,1.273128,1.417325,0.0,highwf,highbf,,corr
1,s305,1,0,haole,haole,ha,ao,ol,le,h,...,0.973728,1.165631,1.333673,1.533691,1.654069,0.0,lowwf,medbf,,corr
2,s305,2,0,edthe,edthe,ed,dt,th,he,e,...,0.913808,1.105976,1.281843,1.417973,1.521869,0.0,pseudo,highbf,,corr
3,s305,3,0,belly,belly,be,el,ll,ly,b,...,0.661679,0.837608,1.005634,1.141670,1.605719,0.0,medwf,medbf,,corr
4,s305,4,0,cheer,cheer,ch,he,ee,er,c,...,0.626029,0.753881,0.857857,1.001862,1.121864,0.0,medwf,highbf,,corr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,s305,235,9,cheer,cheer,ch,he,ee,er,c,...,0.612286,0.740279,0.804288,0.956597,1.068277,0.0,medwf,highbf,,corr
236,s305,236,9,haole,haole,ha,ao,ol,le,h,...,0.512195,0.608155,0.689170,0.864152,0.976113,0.0,lowwf,medbf,,corr
237,s305,237,9,theme,theme,th,he,em,me,t,...,0.500097,0.588131,0.684550,0.796235,0.852109,0.0,medwf,highbf,,corr
238,s305,238,9,about,about,ab,bo,ou,ut,a,...,0.512195,0.632218,0.784196,0.888196,1.032283,0.0,highwf,medbf,,corr


In [6]:
# Preview of the by-bigram frame. bigram_df is reassigned on every pass of the subject loop,
# so this shows only the last subject processed.
bigram_df

# Uncomment to print every row and column instead of the truncated preview:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can also be specified
#     print(bigram_df)

Unnamed: 0,sID,bigram_num,trial_num,rep_num,bigram_loc,bigram,resp_bigram,IKI,string,resp_string,bg_freq,bf_type,meanbf_type,wf_type,trial_corr,bg_corr
0,s305,0,0,0,0,th,th,0.231997,think,think,22288309.0,high,high,high,corr,corr
1,s305,1,0,0,1,hi,hi,0.104168,think,think,6198006.0,high,high,high,corr,corr
2,s305,2,0,0,2,in,in,0.159919,think,think,13597302.0,high,high,high,corr,corr
3,s305,3,0,0,3,nk,nk,0.144196,think,think,445067.0,low,high,high,corr,corr
4,s305,4,1,0,0,ha,ha,0.191903,haole,haole,6967591.0,high,med,low,corr,corr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,s305,953,238,9,3,ut,ut,0.144087,about,about,3257233.0,med,med,high,corr,corr
954,s305,954,239,9,0,lu,lu,0.151135,lucky,lucky,568081.0,med,low,high,corr,corr
955,s305,955,239,9,1,uc,uc,0.152134,lucky,lucky,891233.0,med,low,high,corr,corr
956,s305,956,239,9,2,ck,ck,0.103892,lucky,lucky,925655.0,med,low,high,corr,corr
