In [1]:
import numpy as np
import pandas as pd
import ast
import math
import matplotlib.pyplot as plt
from scipy.stats import variation
import glob
import os
import typingmod as typ

In [2]:
## mounting to ION server
# os.system("osascript -e 'mount volume \"smb://ion-nas.uoregon.edu\" \
#           as user name \"greenhouse\" with password \"password\"'")

In [3]:
## defining function to organize bigrams into rows
def bigram_byrow(keys=None, trials=None):
    """Build one record per pair of adjacent typed keys, for every trial.

    Parameters
    ----------
    keys : pandas.DataFrame, optional
        One row per trial, with integer column labels 0, 1, 2, ... holding the
        typed keys in order. Defaults to the notebook-level `keys_intocolumns`.
    trials : pandas.DataFrame, optional
        Trial-level frame that provides the 'key_resp.rt.<n>' columns
        (1-based n) together with 'string' and 'resp_string'. Defaults to the
        notebook-level `main_df`.

    Returns
    -------
    list of list
        [trial index, position of the first key, typed bigram, IKI,
        target string, response string] for each adjacent pair of keys in
        which both keys are present.
    """
    ## falling back to the notebook globals keeps the original no-argument call working
    if keys is None:
        keys = keys_intocolumns
    if trials is None:
        trials = main_df

    bigrams = []
    n_columns = len(keys.columns)
    for index, row in keys.iterrows():
        for column in range(n_columns - 1):
            first = row.loc[column]
            second = row.loc[column + 1]
            ## skip pairs with a missing key; pd.isnull covers both None and NaN
            ## (the previous `!= None and float('nan')` test only ever caught None)
            if pd.isnull(first) or pd.isnull(second):
                continue
            bigram = (first + second).replace("'", "").replace(" ", "")
            ## IKI = rt of the second key minus rt of the first key (rt columns are 1-based)
            iki = (trials.at[index, 'key_resp.rt.%d' % (column + 2)]
                   - trials.at[index, 'key_resp.rt.%d' % (column + 1)])
            bigrams.append([index, column, bigram, iki,
                            trials.at[index, 'string'], trials.at[index, 'resp_string']])
    return bigrams

## splits a single word into its consecutive bigrams
def bi_byword(word):
    """Return the adjacent-pair bigrams of `word`, left to right.

    A word of length n gives n - 1 bigrams; words shorter than two
    characters give an empty list.
    """
    return [left + right for left, right in zip(word, word[1:])]

## splits every target string of the current subject into bigrams
def bi_allwords():
    """Return one list of bigrams per entry of the notebook-level `df['string']`.

    Relies on the global `df` set in the per-subject loop and on `bi_byword`.
    """
    return [bi_byword(target) for target in df['string']]

In [4]:
## create trial-based and bigram-based dataframes for each subject ##
##
## For every subject folder this cell reads the first psychopy .csv it finds and writes
##   <subject>/psychopy_data/edited/<sID>_bytrial.csv   (one row per trial)
##   <subject>/psychopy_data/edited/<sID>_bybigram.csv  (one row per typed bigram)
## After the loop, main_df / bigram_df hold the frames of the last subject processed.

## importing experiment data
server = r'/Volumes/greenhouse/typingtask_data/subject_data'
server_noturbo = r'/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/'

## folder whose subject folders get processed
## (set to server_noturbo to process the subjects without turbotyping data)
data_root = server

## bigram frequency table -- EDIT TO MAKE USEFUL ON OTHER COMPUTERS
bg_freqs_path = r'/Users/rubi/Desktop/Github/typingexp/typing_task_analysis/bg_freqs.csv'

## the frequency table does not depend on the subject, so it is read once here
## instead of once per loop iteration
bg_freqs = pd.read_csv(bg_freqs_path)
bg_freqs.drop(columns = ['Unnamed: 0'], inplace = True)
freq_dict = bg_freqs.set_index('Bigrams')['Frequency'].to_dict()

## labels for the bigram-level type columns; position i labels the i-th group of
## typ.bf_types / typ.avgbf_types / typ.wf_types
## NOTE(review): assumes those groups are ordered high, med, low, pseudo -- confirm in typingmod
name_list = ['high', 'med', 'low', 'pseudo']

os.chdir(data_root)
folders = os.listdir()

# looping through subjects (entries whose name starts with 's')
sub_folders = [folder for folder in folders if folder.startswith('s')]
for sub in sub_folders:
    ## trailing '' keeps the trailing slash the path had before
    sub_folder = os.path.join(data_root, sub, 'psychopy_data', '')
    os.chdir(sub_folder)
    sID = sub.split('_', 1)[0]

    ## glob is relative to the subject folder we just changed into
    csv_files = glob.glob('*.csv')
    if not csv_files:
        raise FileNotFoundError('no psychopy .csv file found in %s' % sub_folder)
    og_df = pd.read_csv(csv_files[0])

    ## deleting first 3 practice trials -- EDIT FOR ANY TRIALS YOU WANT TO IMMEDIATELY EXCLUDE
    df = (og_df.drop(labels=[0, 1, 2], axis=0)).reset_index(drop = True)

    ## expanding nested key_resp.rt values into separate columns, making new dataframe, and turning values back into floats from strings
    ## (rows coming from key_resp_1 are stacked above rows coming from key_resp_2)
    stripped_rts_1 = ((df['key_resp_1.rt'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    stripped_rts_2 = ((df['key_resp_2.rt'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    rts_intocolumns = (pd.concat([stripped_rts_1, stripped_rts_2])).reset_index(drop = True)

    ## renames rt columns to automatically match dataset (column n -> key_resp.rt.<n+1>)
    rt_names = { n:'key_resp.rt.%s' %(n+1) for n in range(0, len(rts_intocolumns.columns)) }
    expanded_rts = rts_intocolumns.rename(columns = rt_names).astype(float).fillna(0) ##replacing NaNs with zeroes

    ## expanding nested key_resp.keys values into separate columns and making new dataframe
    stripped_keys_1 = ((df['key_resp_1.keys'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    stripped_keys_2 = ((df['key_resp_2.keys'].str.strip('[,]')).dropna()).str.split(',', expand = True)
    keys_intocolumns = (pd.concat([stripped_keys_1, stripped_keys_2])).reset_index(drop = True)
    keys_intocolumns = keys_intocolumns.where(pd.notnull(keys_intocolumns), None)
        # ^ also replaces any added NaNs with Nones
    ## keys_intocolumns keeps its integer column labels: bigram_byrow() reads it as a global

    ## renames key columns to automatically match dataset (column n -> key_resp.keys.<n+1>)
    key_names = { n:'key_resp.keys.%s' %(n+1) for n in range(0, len(keys_intocolumns.columns)) }
    expanded_keys = keys_intocolumns.rename(columns = key_names)

    ## getting rid of apostrophes and spaces in key values
    for col in expanded_keys.columns:
        expanded_keys[col] = expanded_keys[col].str.replace("'", "", regex = False).str.replace(" ", "", regex = False)

    ## combining key_resp.keys into one simple string to easily represent typed responses
    responses_1 = pd.DataFrame((df['key_resp_1.keys'].str.replace("[', ]", "", regex=True).str.strip("[]")).dropna()).rename(columns = {'key_resp_1.keys':'resp_string'})
    responses_2 = pd.DataFrame((df['key_resp_2.keys'].str.replace("[', ]", "", regex=True).str.strip("[]")).dropna()).rename(columns = {'key_resp_2.keys':'resp_string'})
    responses = (pd.concat([responses_1, responses_2])).reset_index(drop = True)

    ## identifying bigrams in words to add to larger dataframe
    ## (bi_allwords() reads the global df; columns are named bi_1 ... bi_n, so 5-letter words give bi_1 ... bi_4)
    task_bigrams = pd.DataFrame(bi_allwords())
    task_bigrams.columns = ['bi_%d' % (n + 1) for n in range(task_bigrams.shape[1])]

    ## combining expanded rt, expanded keys, and response string values with column for strings typed each trial to create more useful dataframe
    ## (does not have all the random timing data of other events occurring during the task)
    main_df = pd.concat([responses, task_bigrams, expanded_keys, expanded_rts], axis = 1)
    main_df.insert(0, 'string', df['string'], True)

    ## creating column for WF type for each trial
    ## (groups are applied in this order, so a string listed in several groups keeps the last label)
    main_df['wf_type'] = ""
    for label, members in (('highwf', typ.highwf), ('medwf', typ.medwf), ('lowwf', typ.lowwf), ('pseudo', typ.pseudo)):
        in_group = main_df['string'].map(lambda word: word in members).astype(bool)
        main_df.loc[in_group, 'wf_type'] = label

    ## creating column for BF type for each trial (same last-label-wins rule)
    main_df['meanbf_type'] = ""
    for label, members in (('highbf', typ.avg_highbf), ('medbf', typ.avg_medbf), ('lowbf', typ.avg_lowbf)):
        in_group = main_df['string'].map(lambda word: word in members).astype(bool)
        main_df.loc[in_group, 'meanbf_type'] = label

    ## creating column for trial (useful for group analysis)
    main_df.insert(0, 'trial_num', list(main_df.index))

    ## creating column for subject ID (also useful for group analysis)
    main_df.insert(0, 'sID', sID)

    ## creating columns for word repetition number (0 for the first time a string appears)
    main_df.insert(2, 'rep_num', main_df.groupby(['sID', 'string']).cumcount())

    ## making csv from dataframe
    edited_path = os.path.join(sub_folder, 'edited')
    os.makedirs(edited_path, exist_ok = True)
    bytrial_path = os.path.join(edited_path, '%s_bytrial.csv' % sID)
    print(bytrial_path)
    main_df.to_csv(bytrial_path)


    ## BIGRAM DATAFRAME ##
    ## (bigram_byrow() reads the globals keys_intocolumns and main_df built above;
    ## naming the columns here keeps them present even when no bigram was typed)
    bigram_df = pd.DataFrame(bigram_byrow(), columns = ["trial_num", "bigram_loc", "resp_bigram", "IKI", "string", "resp_string"])

    ## creating column for bigram # (useful for group analysis)
    bigram_df.insert(0, 'bigram_num', list(bigram_df.index))

    ## creating column for subject ID (also useful for group analysis)
    bigram_df.insert(0, 'sID', sID)

    ## creating column for correct bigram (as opposed to the typed bigram)
    ## positions past the end of the target word (extra keystrokes) get ''
    correct_bigrams = []
    for string, loc in zip(bigram_df['string'], bigram_df['bigram_loc']):
        word_bigrams = bi_byword(string)
        correct_bigrams.append(word_bigrams[loc] if loc < len(word_bigrams) else '')
    bigram_df.insert(4, 'bigram', correct_bigrams)

    ## creating column for rep #
    bigram_df.insert(3, 'rep_num', bigram_df.groupby(['sID', 'string', 'bigram']).cumcount())

    ## creating column for bigram frequency (bigrams missing from the table become NaN)
    bigram_df['bg_freq'] = bigram_df['bigram'].map(freq_dict)

    ## creating columns for bigram type (by correct bigram), mean bigram type (by string)
    ## and word frequency type (by string)
    ## NOTE: these use the short labels in name_list ('high', ...), not the 'highwf' / 'highbf'
    ## style labels written to the by-trial dataframe
    for column, source, groups in (('bf_type', 'bigram', typ.bf_types),
                                   ('meanbf_type', 'string', typ.avgbf_types),
                                   ('wf_type', 'string', typ.wf_types)):
        for index, members in enumerate(groups):
            bigram_df.loc[bigram_df[source].isin(members), column] = name_list[index]

    ## making csv from dataframe
    bybigram_path = os.path.join(edited_path, '%s_bybigram.csv' % sID)
    print(bybigram_path)
    bigram_df.to_csv(bybigram_path)

/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s20_09012022/psychopy_data/edited/s20_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s20_09012022/psychopy_data/edited/s20_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s175_08032022/psychopy_data/edited/s175_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s175_08032022/psychopy_data/edited/s175_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s178_08302022/psychopy_data/edited/s178_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s178_08302022/psychopy_data/edited/s178_bybigram.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s180_10102022/psychopy_data/edited/s180_bytrial.csv
/Volumes/greenhouse/typingtask_data/subject_data/not_used/no_turbotyping/s180_10102022/psychopy_data/edited/s180_bybigram.csv


In [5]:
## preview the by-trial dataframe; after the loop above it holds the last subject processed
main_df

## uncomment to print every row and column instead of the truncated preview
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can also be specified
#     print(main_df)

Unnamed: 0,sID,trial_num,rep_num,string,resp_string,bi_1,bi_2,bi_3,bi_4,key_resp.keys.1,...,key_resp.keys.7,key_resp.rt.1,key_resp.rt.2,key_resp.rt.3,key_resp.rt.4,key_resp.rt.5,key_resp.rt.6,key_resp.rt.7,wf_type,meanbf_type
0,s217,0,0,would,would,wo,ou,ul,ld,w,...,,0.723308,0.827496,0.915393,1.051286,1.091396,0.0,0.0,highwf,medbf
1,s217,1,0,vodka,vodka,vo,od,dk,ka,v,...,,1.165449,1.285454,1.405452,1.525459,1.589452,0.0,0.0,medwf,lowbf
2,s217,2,0,kremp,kremp,kr,re,em,mp,k,...,,1.033510,1.113516,1.201451,1.249445,1.385430,0.0,0.0,pseudo,medbf
3,s217,3,0,theme,theme,th,he,em,me,t,...,,0.741364,0.837475,0.877338,1.029368,1.093365,0.0,0.0,medwf,highbf
4,s217,4,0,druze,druze,dr,ru,uz,ze,d,...,,0.793336,0.985552,1.153500,1.241438,1.385341,0.0,0.0,lowwf,lowbf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,s217,235,9,cheer,cheer,ch,he,ee,er,c,...,,0.605839,0.749859,0.845765,1.021899,1.101868,0.0,0.0,medwf,highbf
236,s217,236,9,heond,heond,he,eo,on,nd,h,...,,0.770076,0.882104,0.970067,1.074068,1.201988,0.0,0.0,pseudo,highbf
237,s217,237,9,lucky,lucky,lu,uc,ck,ky,l,...,,0.670279,0.894252,1.054194,1.198233,1.430445,0.0,0.0,highwf,lowbf
238,s217,238,9,about,about,ab,bo,ou,ut,a,...,,0.681711,0.801726,1.081651,1.193738,1.337723,0.0,0.0,highwf,medbf


In [6]:
## preview the by-bigram dataframe; after the loop above it holds the last subject processed
bigram_df

## uncomment to print every row and column instead of the truncated preview
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can also be specified
#     print(bigram_df)

Unnamed: 0,sID,bigram_num,trial_num,rep_num,bigram_loc,bigram,resp_bigram,IKI,string,resp_string,bg_freq,bf_type,meanbf_type,wf_type
0,s217,0,0,0,0,wo,wo,0.104188,would,would,1723496.0,med,med,high
1,s217,1,0,0,1,ou,ou,0.087897,would,would,7425307.0,high,med,high
2,s217,2,0,0,2,ul,ul,0.135893,would,would,2181271.0,med,med,high
3,s217,3,0,0,3,ld,ld,0.040110,would,would,2012500.0,med,med,high
4,s217,4,1,0,0,vo,vo,0.120005,vodka,vodka,368238.0,low,low,med
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,s217,960,238,9,3,ut,ut,0.143985,about,about,3257233.0,med,med,high
961,s217,961,239,9,0,ch,ch,0.095976,champ,champ,3267507.0,med,med,med
962,s217,962,239,9,1,ha,ha,0.104018,champ,champ,6967591.0,high,med,med
963,s217,963,239,9,2,am,am,0.127965,champ,champ,1610395.0,med,med,med
