# ***Two-Step Task - Parse data***

**Main:**
* This code parses the data of the two-step task, as adapted from the Experiment Factory (Sochat et al., 2016) and implemented in the Habit App study by Gera et al. (it was administered as a post-experiment task after completing the game with the app).
* The code is based on the task data as downloaded from JATOS as a txt file.
* This code is structured to run in Google Colab.
* It is designed to read the txt data file, which should be placed in Google Drive.
> This code was written by Rani Gera, last edited in November 2022


#General settings

In [1]:
#@title Load Drive
# Mount Google Drive at /content/drive; the data paths defined in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
#@title Import packages
import os
import re
import numpy as np
import pandas as pd
!pip install simplejson
import simplejson
import json


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
#@title Parameter definition { form-width: "5%" }
# Subject IDs to include: lower bound inclusive, upper bound exclusive
# (the value is later fed to range(subRange[0], subRange[1])).
subRange = [100,400]

# paths
# -----------
# read:
# Folder holding the raw task export; the output CSVs are written to the same folder.
main_path = '/content/drive/MyDrive/Experiments/HAS_STUDY/HAS_Analysis/data/MBMF'
dataFileName = 'MBMF_raw_data.txt'
full_path = os.path.join(main_path, dataFileName)
# write:
# File names (not full paths) of the two exported tables.
logisticRegDataFileName = 'MBMF_dataForLogisticReg.csv'
RL_model_DataFileName = 'MBMF_data_for_RL_model.csv'

#Assemble Data

In [13]:
#@title Get data { form-width: "5%" }

# Pattern used to skip the whitespace that separates consecutive JSON
# documents in the raw file (it matches zero or more blanks/tabs/newlines,
# so a match always succeeds).
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)

def grabJSON(s):
    """Decode the first JSON value found at the start of ``s``.

    Returns a tuple ``(parsed_object, rest_of_string)`` where the rest
    begins after any whitespace that followed the decoded value.
    """
    parsed, stop = simplejson.JSONDecoder().raw_decode(s)
    # Advance past the separator whitespace so that the remainder starts
    # directly at the next JSON document (or is empty).
    resume_at = WHITESPACE.match(s, stop).end()
    leftover = s[resume_at:]
    return parsed, leftover

def getDataAsArrayOfDicts():
    """Read the raw data file and return its JSON documents as a list.

    The file at ``full_path`` is expected to contain one or more JSON
    values written back to back (optionally separated by whitespace).
    Each value is decoded with ``grabJSON`` and appended, in file order,
    to the returned list.

    Returns:
        list: the decoded JSON values; an empty list when the file is
        empty or contains only whitespace.
    """
    # JSON text is UTF-8; being explicit avoids depending on the locale.
    with open(full_path, encoding='utf-8') as f:
        s = f.read()

    arrayOfDicts = []
    # Checking before decoding (rather than after) means an empty or
    # whitespace-only file yields [] instead of an opaque decode error.
    while s.strip():
        obj, s = grabJSON(s)
        arrayOfDicts.append(obj)

    return arrayOfDicts

# Load every JSON document of the raw file into memory (one list element per document).
MBMF_raw_data=getDataAsArrayOfDicts()

In [16]:
#@title Create a dictionary and a data frame { form-width: "5%" }
# Re-key every participant's trial list by its integer subject ID
# (the raw records are keyed by another identifier, which is dropped here).
MBMF_Dict = {}
for subData in MBMF_raw_data:
  try:
    subTrials = list(subData.values())[0]
    MBMF_Dict[int(subTrials[0]['subID'])] = subTrials
  except (AttributeError, IndexError, KeyError, TypeError, ValueError):
    # Record without a usable first trial / subID: skip it (best effort),
    # but do not hide unrelated errors such as KeyboardInterrupt.
    continue

# Creating a DF of all data
MBMF_meta = {}
subjectFrames = []  # collected and concatenated once (DataFrame.append no longer exists in pandas >= 2.0)
for key, val in sorted(MBMF_Dict.items()):  # *key is in fact the subject id
  oneSubDF = pd.DataFrame(val)
  oneSubDF['subID'] = key  # put the integer subject id on every row
  if 'experimentCompleted' not in oneSubDF.columns or not (subRange[0] <= key < subRange[1]):
    continue

  # extract meta stuff and post experiment questions
  postTaskRows = oneSubDF[oneSubDF.trial_id == "post task questions"]
  endRows = oneSubDF[oneSubDF.trial_id == "end"]
  # Work on a copy of the first record so the raw data is not modified
  # (otherwise re-running this cell would add the meta keys as columns).
  subMeta = dict(val[0])
  subMeta['post_task_Qs'] = json.loads(postTaskRows.responses.iloc[0])
  subMeta['post_task_Qs']['rt'] = postTaskRows.rt.iloc[0]
  subMeta['endSlide_rt'] = endRows.rt.iloc[0]
  subMeta['credit_var'] = endRows.credit_var.iloc[0]
  subMeta['performance_var'] = endRows.performance_var.iloc[0]
  # NOTE(review): `.empty` is True when there are NO non-instruction rows with
  # focus_shifts != 0, which reads inverted relative to the key name - confirm intent.
  subMeta['focusShiftExceptInInstructions'] = oneSubDF[(oneSubDF.focus_shifts != 0) & (oneSubDF.trial_id != 'instruction')].empty
  MBMF_meta[key] = subMeta

  oneSubDF = oneSubDF[oneSubDF.experimentCompleted != True] # remove the first row (which is meta stuff)
  oneSubDF = oneSubDF[oneSubDF.trial_id != "post task questions"]
  oneSubDF = oneSubDF[oneSubDF.trial_id != "end"]
  subjectFrames.append(oneSubDF)

# create the DF (original per-subject row indices are kept, as before)
MBMF_DF = pd.concat(subjectFrames) if subjectFrames else pd.DataFrame()

In [18]:
#@title Cleaning and assembling the data
# -----------------------------
# Keep only the numbered trials that do not belong to the practice stage.
isTestTrial = MBMF_DF.trial_num.notna() & (MBMF_DF.exp_stage != 'practice')
MBMF_DF = MBMF_DF[isTestTrial]

# Remove unnecessary columns.
columnsToDrop = [
    'experimentCompleted', 'keptFullScreen', 'credit_var', 'performance_var',
    'text', 'timing_post_trial', 'trial_type', 'internal_node_id',
    'exp_id', 'full_screen', 'focus_shifts', 'view_history',
    'possible_responses', 'exp_stage', 'responses', 'coins_collected',
]
MBMF_DF = MBMF_DF.drop(columns=columnsToDrop)

# Replace the raw key codes with readable labels.
isFeedback = MBMF_DF.trial_id == 'feedback_stage'
isChoiceStage = MBMF_DF.trial_id.isin(['first_stage', 'second_stage'])

MBMF_DF.loc[isFeedback, 'key_press'] = None
# NOTE(review): after the line above no feedback row holds 37/39, so every
# feedback row is labelled 'no_response' here, while choice-stage rows without
# a response keep their raw code. Confirm this is the intended outcome.
isArrowKey = (MBMF_DF.key_press == 39) | (MBMF_DF.key_press == 37)
MBMF_DF.loc[~isArrowKey & isFeedback, 'key_press'] = 'no_response'
# 37 / 39 are mapped to 'left' / 'right' for the two choice stages.
MBMF_DF.loc[(MBMF_DF.key_press == 37) & isChoiceStage, 'key_press'] = 'left'
MBMF_DF.loc[(MBMF_DF.key_press == 39) & isChoiceStage, 'key_press'] = 'right'


In [19]:
#@title Print some general info
# Unique subject IDs, in order of first appearance in the data frame.
participantIDs = MBMF_DF.subID.unique()
print('N =', len(participantIDs))
print('Participant list:', participantIDs)
print('Dataframe structure:')
# Last expression of the cell: rendered by the notebook as a table preview.
MBMF_DF.head()

N = 130
Participant list: [101 102 103 105 106 107 108 111 114 115 116 117 119 123 124 125 127 128
 130 131 133 134 135 136 137 138 142 143 146 147 153 154 155 156 157 158
 159 160 161 162 163 164 165 203 204 206 207 208 210 211 212 213 216 218
 219 223 225 226 228 229 230 232 233 236 237 239 240 241 242 244 245 246
 248 250 251 252 253 256 257 258 259 260 261 262 263 264 265 304 305 306
 308 309 312 315 316 318 319 321 325 326 327 332 335 336 338 343 344 346
 349 354 356 358 359 360 361 363 364 366 367 368 371 372 373 374 375 377
 378 380 381 382]
Dataframe structure:


Unnamed: 0,subID,rt,key_press,block_duration,trial_id,trial_index,time_elapsed,stimulus,stim_duration,trial_num,stim_order,stage,stim_selected,stage_transition,feedback,FB_probs
305,101,1344.0,right,1344.0,first_stage,304.0,419382.0,<div class = decision-left style='background:#...,1344.0,0.0,"[1, 0]",0.0,0.0,,,
307,101,824.0,left,824.0,second_stage,306.0,421209.0,<div class = 'decision-top faded' style='backg...,824.0,0.0,"[3, 2]",1.0,3.0,frequent,,
309,101,-1.0,no_response,500.0,feedback_stage,308.0,422717.0,<div class = 'decision-top faded' style='backg...,500.0,0.0,,,,,0.0,"[0.5551259260413833, 0.45057687258959533, 0.53..."
311,101,775.0,left,775.0,first_stage,310.0,424504.0,<div class = decision-left style='background:#...,775.0,1.0,"[0, 1]",0.0,0.0,,,
313,101,1042.0,left,1042.0,second_stage,312.0,426550.0,<div class = 'decision-top faded' style='backg...,1042.0,1.0,"[3, 2]",1.0,3.0,frequent,,


In [20]:
#@title Exclude participants (according to performance criteria){ form-width: "5%" }
"""
Based on Gillan et al. (2015):
"Participants were excluded (but still paid) if they missed more than 10 % of the trials (n = 18),
had implausibly fast reaction times (i.e., ±2 SDs from the mean; n = 2),
or responded with the same key on more than 90 % of trials (n = 1)."
"""
MBMF_DF_after_exclusions = MBMF_DF.copy()
MBMF_DF_after_exclusions = MBMF_DF_after_exclusions.reset_index(drop=True)

# missed trials
# A second-stage row counts as missed when no stimulus was selected (NaN or -1).
# The cut-off of 20 is the original hard-coded threshold behind the "10%" message.
maxMissedTrials = 20
secondStageRows = MBMF_DF_after_exclusions[MBMF_DF_after_exclusions.trial_id == 'second_stage']
isMissed = secondStageRows.stim_selected.isna() | (secondStageRows.stim_selected == -1)
missedPerSub = isMissed.groupby(secondStageRows.subID).sum()
tooManyMisses = []
for sub in MBMF_DF_after_exclusions.subID.unique():
  if missedPerSub.get(sub, 0) > maxMissedTrials:
    tooManyMisses.append(sub)
    print('>>> EXCLUDING sub', sub, '- has more than 10% missed trials')
MBMF_DF_after_exclusions = MBMF_DF_after_exclusions[~MBMF_DF_after_exclusions.subID.isin(tooManyMisses)]

# RT
# Mean RT per subject over rows with a valid selection, z-scored across subjects.
# Only the rt column is averaged: averaging the whole frame fails on the
# non-numeric columns with pandas >= 2.0.
rowsWithSelection = MBMF_DF_after_exclusions[MBMF_DF_after_exclusions.stim_selected > -1]
meanRT_perSub = rowsWithSelection.groupby('subID').rt.mean()
meanRT = meanRT_perSub.mean()
STD_RT_perSub = meanRT_perSub.std()
toRemove = meanRT_perSub[((meanRT_perSub - meanRT) / STD_RT_perSub) < -2].index.to_numpy()
print('>>> EXCLUDING subjects', toRemove, '- RT implausibly fast (-2SD)')
MBMF_DF_after_exclusions = MBMF_DF_after_exclusions[~MBMF_DF_after_exclusions.subID.isin(toRemove)]

# press the same key
# Share of 'right' presses among all valid ('left'/'right') presses per subject.
# Taking the mean of a boolean keeps subjects with zero 'right' presses at 0.0;
# dividing two separate counts left them as NaN, so they were never excluded.
validPresses = MBMF_DF_after_exclusions[MBMF_DF_after_exclusions.key_press.isin(['left', 'right'])]
proportionPressRightOutOfValidPresses = (validPresses.key_press == 'right').groupby(validPresses.subID).mean()
isOneSided = (proportionPressRightOutOfValidPresses < 0.1) | (proportionPressRightOutOfValidPresses > 0.9)
toRemove2 = proportionPressRightOutOfValidPresses[isOneSided].index.to_numpy()
print('>>> EXCLUDING subjects', toRemove2, '- more than 90% pressing on the same key.')
MBMF_DF_after_exclusions = MBMF_DF_after_exclusions[~MBMF_DF_after_exclusions.subID.isin(toRemove2)]



>>> EXCLUDING sub 103 - has more than 10% missed trials
>>> EXCLUDING sub 203 - has more than 10% missed trials
>>> EXCLUDING subjects [102 240] - RT implausibly fast (-2SD)
>>> EXCLUDING subjects [] - more than 90% pressing on the same key.


In [21]:
#@title Assemble the data in a table for the logistics regression & RL model in r (and stan) { form-width: "5%" }
"""
Note:
Trials with no choice in the first stage are useless and thus completely removed.
I also create a version without the trials with a miss in the 2nd stage.
"""

MBMF_DF_for_RL_Model = MBMF_DF_after_exclusions.copy()

MBMF_DF_for_RL_Model=MBMF_DF_for_RL_Model.reset_index(drop=True)
# Object dtype so that each cell can hold a whole stim_order value.
MBMF_DF_for_RL_Model['stimOrder_Stage2'] = None
MBMF_DF_for_RL_Model['stimOrder_Stage2'] = MBMF_DF_for_RL_Model['stimOrder_Stage2'].astype('object')
for sub in MBMF_DF_for_RL_Model.subID.unique():
  isSub = MBMF_DF_for_RL_Model.subID == sub
  for trial in MBMF_DF_for_RL_Model.loc[isSub, 'trial_num'].unique():
    # Construct one line with the relevant data for each trial for each subject.
    # The row mask and the trial's rows are computed once per trial; the columns
    # read from `trialRows` are not modified before they are read.
    inTrial = isSub & (MBMF_DF_for_RL_Model.trial_num == trial)
    trialRows = MBMF_DF_for_RL_Model[inTrial]

    # First non-missing transition label of the trial.
    MBMF_DF_for_RL_Model.loc[inTrial, 'transition'] = trialRows.stage_transition.dropna().iloc[0]
    # Second row of the trial: its stim_order / stim_selected are copied to all rows of the trial.
    secondRowOrder = trialRows.stim_order.iloc[1]
    # `.apply` builds a per-row Series so that a list value lands in each cell intact.
    MBMF_DF_for_RL_Model.loc[inTrial, 'stimOrder_Stage2'] = MBMF_DF_for_RL_Model.loc[inTrial, 'stimOrder_Stage2'].apply(lambda _: secondRowOrder)
    MBMF_DF_for_RL_Model.loc[inTrial, 'action_Stage2'] = trialRows.stim_selected.iloc[1]
    try: # the try is for the case where there is no 3rd line...
      thirdRowFeedback = trialRows.feedback.iloc[2]
    except IndexError:
      continue
    MBMF_DF_for_RL_Model.loc[inTrial, 'feedback'] = thirdRowFeedback

# Leave only one line per trial. Note that GroupBy.first() returns, per column,
# the first non-null value within the group.
MBMF_DF_for_RL_Model = MBMF_DF_for_RL_Model.groupby(['subID','trial_num']).first().reset_index()
# rename columns:
MBMF_DF_for_RL_Model = MBMF_DF_for_RL_Model.rename(columns={'stim_selected':'action_Stage1', 'stim_order':'stimOrder_Stage1', 'feedback':'reward'})
# extracting relevant columns
MBMF_DF_for_RL_Model = MBMF_DF_for_RL_Model.loc[:,['subID', 'trial_num', 'stimOrder_Stage1', 'action_Stage1','transition', 'stimOrder_Stage2', 'action_Stage2', 'reward']]
# Mistakenly when there is an action in stage 1 but not in 2 a reward is indicated. Here I fix this:
MBMF_DF_for_RL_Model.loc[MBMF_DF_for_RL_Model.action_Stage2==-1, 'reward'] = None
# Remove trials with no first-stage choice (action_Stage1 == -1)
MBMF_DF_for_RL_Model = MBMF_DF_for_RL_Model[MBMF_DF_for_RL_Model.action_Stage1!=-1]

# Create a version without the trials with a miss in the 2nd stage:
MBMF_DF_for_RL_Model_NoMissedTrials = MBMF_DF_for_RL_Model[(MBMF_DF_for_RL_Model.action_Stage2!=-1)&(MBMF_DF_for_RL_Model.action_Stage2.notna())]

## RL comp model data

In [22]:
#@title Assemble data for RL computational model in r (and stan) { form-width: "5%" }
def finalize_RL_model_table(DF_for_RL_Model):
  """Build the final per-trial table for the RL computational model.

  Args:
    DF_for_RL_Model: data frame with (at least) the columns 'subID',
      'trial_num', 'action_Stage1', 'action_Stage2', 'stimOrder_Stage2'
      and 'reward'. It is not modified.

  Returns:
    A new data frame with the columns 'subID', 'trial_num',
    'action_Stage1', 'action_Stage2', 'state_Stage2' and 'reward', where
    trial numbers and first-stage actions are shifted by +1,
    second-stage actions 2/3 and 4/5 are recoded to 1/2, and the
    second-stage stimulus order is replaced by a state code (2 or 3).

  Raises:
    ValueError / TypeError (from ``astype``): if 'trial_num',
      'action_Stage1' or 'action_Stage2' hold values that cannot be
      converted to int (e.g. NaN).
  """
  relevantColumns = ['subID', 'trial_num', 'action_Stage1', 'action_Stage2', 'stimOrder_Stage2', 'reward']
  # Explicit copy: everything below edits the new table, never the caller's frame.
  DF_for_RL_Model_FULL = DF_for_RL_Model.loc[:, relevantColumns].copy()

  def _stateCode(stimOrder):
    # give a unique number to the two optional states in stage2 ([2,3]->2, [4,5]->3).
    # Anything that is not a non-empty list (e.g. a missing value) is passed
    # through untouched instead of raising on `stimOrder[0]`.
    if not isinstance(stimOrder, list) or not stimOrder:
      return stimOrder
    if stimOrder[0] <= 3:
      return 2
    if stimOrder[0] >= 4:
      return 3
    return stimOrder

  DF_for_RL_Model_FULL['stimOrder_Stage2'] = DF_for_RL_Model_FULL['stimOrder_Stage2'].map(_stateCode)

  # set the second stage choice to 1 or 2 in each second stage
  # (both masks are taken from the values before any of them is shifted).
  # NOTE(review): values <= 3 include a -1 "missed" marker, which becomes -2 here,
  # as in the previous implementation - confirm this is acceptable downstream.
  isLowPair = DF_for_RL_Model_FULL['action_Stage2'] <= 3
  isHighPair = DF_for_RL_Model_FULL['action_Stage2'] >= 4
  DF_for_RL_Model_FULL.loc[isLowPair, 'action_Stage2'] -= 1
  DF_for_RL_Model_FULL.loc[isHighPair, 'action_Stage2'] -= 3

  # change trial numbers to be from 1 to 200 (rather than 0 to 199):
  DF_for_RL_Model_FULL['trial_num'] = DF_for_RL_Model_FULL['trial_num'] + 1
  # change actions from 0 and 1, to 1 and 2:
  DF_for_RL_Model_FULL['action_Stage1'] = DF_for_RL_Model_FULL['action_Stage1'] + 1
  # convert relevant variables to integers:
  DF_for_RL_Model_FULL = DF_for_RL_Model_FULL.astype({"trial_num": int, "action_Stage1": int, "action_Stage2": int})
  # change name of a column:
  DF_for_RL_Model_FULL = DF_for_RL_Model_FULL.rename(columns={'stimOrder_Stage2': 'state_Stage2'})
  return DF_for_RL_Model_FULL

# Finalize both variants (all trials / without missed second-stage trials) in the same order as before.
MBMF_DF_for_RL_Model_FULL, MBMF_DF_for_RL_Model_NoMissedTrials_FULL = [
    finalize_RL_model_table(trialTable)
    for trialTable in (MBMF_DF_for_RL_Model, MBMF_DF_for_RL_Model_NoMissedTrials)
]
# Last expression of the cell: preview of the table without missed trials.
MBMF_DF_for_RL_Model_NoMissedTrials_FULL.head()


Unnamed: 0,subID,trial_num,action_Stage1,action_Stage2,state_Stage2,reward
0,101,1,1,2,2,0.0
1,101,2,1,2,2,0.0
2,101,3,2,2,2,0.0
3,101,4,2,1,3,0.0
4,101,5,2,1,2,0.0


In [23]:
#@title save data for the RL computational model { form-width: "5%" }
# Write the table without missed trials next to the raw data, without the row index.
rlModelOutputPath = os.path.join(main_path, RL_model_DataFileName)
MBMF_DF_for_RL_Model_NoMissedTrials_FULL.to_csv(rlModelOutputPath, index=False)
print('Note: I use the version with no missed trials at all (not even only on the 2nd stage. For the a version keeping these save the data frame: MBMF_DF_for_RL_Model_FULL_XXX')

Note: I use the version with no missed trials at all (not even only on the 2nd stage. For the a version keeping these save the data frame: MBMF_DF_for_RL_Model_FULL_XXX


## Logistic regression model data

In [26]:
#@title Assemble the data for logistic regression { form-width: "5%" }

MBMF_DF_for_reg = MBMF_DF_for_RL_Model.copy().reset_index(drop=True)

# Bring the previous row's reward and transition onto each row (the first row gets NaN).
# Rows are shifted over the whole table; each subject's first row is removed further down.
previousReward = MBMF_DF_for_reg['reward'].shift(1)
# Recode reward 0 -> -1 (as in the literature) and force float64 so that all
# elements share one dtype (required to input to R later).
MBMF_DF_for_reg['last_trial_reward'] = previousReward.mask(previousReward == 0, -1).astype('float64')
MBMF_DF_for_reg['last_trial_transition_type'] = MBMF_DF_for_reg['transition'].shift(1)

# Stay / switch: was the first-stage action the same as on the previous row?
firstStageChoices = MBMF_DF_for_reg['action_Stage1'].values
repeatedChoice = firstStageChoices[1:] == firstStageChoices[:-1]
# The labels are indexed from the second row on, so the first row stays NaN.
MBMF_DF_for_reg['this_trial_stay_or_switch'] = pd.Series(
    np.where(repeatedChoice, 'stay', 'switch'),
    index=MBMF_DF_for_reg.index[1:],
    dtype=object,
)

# Numeric regression variables, named as in the literature (unmatched labels become NaN).
MBMF_DF_for_reg['Transition'] = MBMF_DF_for_reg['last_trial_transition_type'].map({'frequent': 1, 'infrequent': -1})
MBMF_DF_for_reg['Stay'] = MBMF_DF_for_reg['this_trial_stay_or_switch'].map({'stay': 1, 'switch': 0})

# Remove the first trial of each subject (its "previous trial" is not its own).
isNotFirstRowOfSubject = MBMF_DF_for_reg['subID'].duplicated()
MBMF_DF_for_reg = MBMF_DF_for_reg.loc[isNotFirstRowOfSubject]

# Change column names
MBMF_DF_for_reg = MBMF_DF_for_reg.rename(columns={'last_trial_reward':'Reward', 'stim_order':'stimOrder_Stage1', 'feedback':'reward'})

# Get the final relevant data separately
regressionColumns = ['subID', 'Reward', 'Transition', 'Stay']
logisticRegModelData = MBMF_DF_for_reg[regressionColumns].reset_index(drop=True)
logisticRegModelData.head()


Unnamed: 0,subID,Reward,Transition,Stay
0,101,-1.0,1.0,1.0
1,101,-1.0,1.0,0.0
2,101,-1.0,-1.0,1.0
3,101,-1.0,1.0,1.0
4,101,-1.0,-1.0,0.0


In [27]:
#@title save data for logistic regression { form-width: "5%" }
# NOTE(review): no index=False here, so the row index is written as the first
# (unnamed) CSV column, unlike the RL-model export - confirm the reader expects it.
logisticRegOutputPath = os.path.join(main_path, logisticRegDataFileName)
logisticRegModelData.to_csv(logisticRegOutputPath)
