# Participant Exclusion and Replacement Pipeline

1. The counterbalancing.csv contains every json that needs to be sampled for the discrimination and verbal judgement experiment (includes all durations) 
    - The row number for each sequence corresponds to the url fragment used in the variables file uploaded to Mechanical Turk 
    - This file does not change; instead, the variables file is updated to resample the sequences of excluded participants
2. First, the worker IDs reported by participants need to be matched to the worker IDs reported in the batch data 
    - All data files downloaded from the server need to be matched to a worker ID in batch data
    - Data files that do not have a matched worker ID are moved to a separate folder and are not analyzed 
3. Participant exclusion criteria are pre-registered on OSF (link) - if a participant is excluded, their counterbalanced sequence needs to be replaced in the variables file 
4. All participants who have completed the experiment need to be excluded from completing future HITs (exclude_workers.csv)

In [32]:
import os
import json 
import pandas as pd 
import numpy as np 
import math

## Get all Worker IDs from Batch data 

In [51]:
# Directory that is scanned for batch CSV files (each must have a 'WorkerId' column).
batch_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/all_batch'


In [53]:
# Collect the 'WorkerId' column of every batch CSV found in batch_path.
all_batch_worker_ids = []
for path in os.listdir(batch_path):
    # Match on the file extension (and skip hidden files): the substring test
    # 'csv' in path would also accept names that merely contain "csv".
    if path.lower().endswith('.csv') and not path.startswith('.'):
        batch_data = pd.read_csv(os.path.join(batch_path, path))
        batch_worker_ids = list(batch_data['WorkerId'])
        all_batch_worker_ids += batch_worker_ids
print(len(all_batch_worker_ids))
all_batch_worker_ids

27


['A1AS1H21O9896X',
 'AILT4SWETTAUL',
 'A3D9PJ37U7LCHE',
 'A2MV1R6YYOIO30',
 'A3C4WGE4SOGM1',
 'A3AP9VARTGQH3T',
 'A2GRA6N98ZPIQ1',
 'A2A3FW4JIDUNTB',
 'A3M7KQYOMDA2MA',
 'AZNIEFUIVB2H0',
 'A3O9NIH7BH537H',
 'AH7Z2M3KSQ4DW',
 'A23KAJRDVCVGOE',
 'A1J39RAV7TKEMF',
 'A3G5IPGLH1IIZN',
 'AX8NXTT8QMGHC',
 'A3EIV1GTJ3Z2OG',
 'A3J8UC84NM958L',
 'A22I3XU97YTTTD',
 'A3VRHT6CR7LYCX',
 'A2APG8MSLJ6G2K',
 'A1TXAY6PNBHSPN',
 'A2E6OBTUO5NN2I',
 'AJSOHKPWWDS3G',
 'A2JPO11US5Q4H4',
 'A2S7ZT3VZBU2UA',
 'A1YZDFPLENN1RA']

## Get all Worker IDs from Data Files 


In [54]:
datapath = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/data/125ms'

# One worker ID per participant data file (the first unique 'workerId' value).
worker_ids_from_data = []

for file in os.listdir(datapath):
    # Match on the file extension (and skip hidden files) instead of the
    # substring 'csv', which also accepts names that merely contain "csv".
    if file.lower().endswith('.csv') and not file.startswith('.'):
        path = os.path.join(datapath, file)
        df = pd.read_csv(path, index_col=None, header=0)
        file_worker_ids = df.workerId.unique()
        if len(file_worker_ids) == 0:
            # A file without rows has no worker ID; report it instead of
            # failing with an IndexError on unique()[0].
            print('No worker ID found in data file:', file)
            continue
        worker_ids_from_data.append(file_worker_ids[0])

worker_ids_from_data

['A3J8UC84NM958L',
 'AX8NXTT8QMGHC',
 'A3EIV1GTJ3Z2OG',
 'A3O9NIH7BH537H',
 'AZNIEFUIVB2H0',
 'A23KAJRDVCVGOE',
 'A3G5IPGLH1IIZN',
 'A1J39RAV7TKEMF',
 'AH7Z2M3KSQ4DW']

In [55]:
# Strict equality: True only when the data files and the batch files name exactly the same workers.
set(worker_ids_from_data) == set(all_batch_worker_ids)

False

In [56]:
# Worker IDs that appear in the data files but have no match in the batch data.
# Their data files belong in an archive and must NOT be analyzed.
batchdata_workerIDs = set(all_batch_worker_ids)
missing_wid = []
for data_wid in worker_ids_from_data:
    if data_wid not in batchdata_workerIDs:
        missing_wid.append(data_wid)
missing_wid

[]

# Participant Exclusion

In [87]:
def combineCSVs(datafolder):
    """
    Load participant data into a single pandas DataFrame.

    Args:
        datafolder: path to either a directory of participant CSV files
            (all of them are concatenated) or a single participant CSV file.

    Returns:
        DataFrame with the rows of every CSV that was read. For a directory
        the row index is renumbered from 0 (ignore_index=True).

    Raises:
        FileNotFoundError: datafolder is neither an existing file nor an
            existing directory.
        ValueError: the directory contains no CSV files, or the file given
            is not a CSV file.
    """
    if os.path.isdir(datafolder):
        # Match on the extension and skip hidden files; the substring test
        # 'csv' in name also accepts files that merely contain "csv".
        # sorted() makes the row order independent of os.listdir's arbitrary order.
        csv_names = sorted(
            name for name in os.listdir(datafolder)
            if name.lower().endswith('.csv') and not name.startswith('.')
        )
        if not csv_names:
            raise ValueError('No CSV files found in directory: ' + str(datafolder))

        frames = [
            pd.read_csv(os.path.join(datafolder, name), index_col=None, header=0)
            for name in csv_names
        ]
        input_frame = pd.concat(frames, axis=0, ignore_index=True)

    elif os.path.isfile(datafolder):
        if not str(datafolder).lower().endswith('.csv'):
            raise ValueError('Not a CSV file: ' + str(datafolder))
        input_frame = pd.read_csv(datafolder, index_col=None, header=0)

    else:
        # Previously this fell through to an unbound `input_frame`.
        raise FileNotFoundError('No such file or directory: ' + str(datafolder))

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame

def feet_to_meters(ft):
    """
    Convert a length from feet to meters.

    Args: ft = value in feet

    Returns: the same length expressed in meters (ft multiplied by 0.3048)
    """
    meters_per_foot = 0.3048
    return ft * meters_per_foot

def getUnitConveredData(datafolder):
    '''
    Load the participant data and express every depth estimate in meters.

    Rows whose 'unitSelection' is 'feet' get their 'depth_estimate' converted
    with feet_to_meters. The 'unitSelection' column itself is left as entered.
    '''
    # combine CSVs from all participants
    input_data = combineCSVs(datafolder)

    # labels of the rows that were answered in feet
    feet_rows = input_data.index[input_data['unitSelection'] == 'feet']

    for row_label in feet_rows:
        estim_ft = input_data.at[row_label, 'depth_estimate']
        # overwrite the estimate in place with its value in meters
        input_data.at[row_label, 'depth_estimate'] = feet_to_meters(estim_ft)

    return input_data

def cleanAgeResponses(datafolder):
    '''
    Load the unit-converted data and list participants who report an age < 18.

    Participants on MTurk must be over 18
    - If participants report they are < 18, exclude from analysis

    NEED TO CHANGE HOW AGE IS COLLECTED
    (reports given as a year of birth are currently not converted to an age)

    Args:
        datafolder: directory of participant CSVs or a single CSV file,
            forwarded to getUnitConveredData.

    Returns:
        (input_data, exclude): the loaded DataFrame, unchanged by the age
        check, and a list with the subjID of each under-age participant,
        listed once per participant.
    '''
    input_data = getUnitConveredData(datafolder)

    # The age check looks at every row, and one participant contributes many
    # rows. Collect each ID once so the count below is a number of
    # participants rather than a number of rows.
    # participants must be over 18 so age reports below 18 are junk
    under_age_rows = input_data['age'] < 18
    # CHANGE THIS TO WORKER ID LATER
    exclude = list(input_data.loc[under_age_rows, 'subjID'].unique())

    print('Number of participants excluded due to age: ', len(exclude))
    return input_data, exclude

In [121]:
# Folder with the participant CSVs that go through the exclusion pipeline.
data_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/data/125ms'

# Load the data (feet converted to meters) and start the exclusion list with under-age participants.
age_cleaned_data, exclude = cleanAgeResponses(data_path)

Number of participants before cleaning:  9
Number of participants excluded due to age:  0


In [122]:
def catchTrial_cleaning(df, correct_requirement, catch_stimuli, sequence_count, exclude):
    '''
    Remove catch trials and exclude participants who fail the catch-trial check.

    Participants complete 8 catch trials total to ensure that they are doing
    the task. A participant is excluded when fewer than correct_requirement
    catch trials are correct (an earlier note here gave the cutoff as 7/8; the
    cutoff actually applied is whatever the caller passes).

    A trial is a catch trial when the second '/'-separated component of its
    'stimulus' path is listed in catch_stimuli. It counts as correct when
    'depth_estimate' equals 0 (attention-check version: the image has no target).

    Args:
        df: trial-level DataFrame with 'subjID', 'stimulus', 'depth_estimate'
            and 'sequenceName' columns. It is not modified.
        correct_requirement: minimum number of correct catch trials required
            to keep a participant.
        catch_stimuli: collection of catch-trial stimulus names.
        sequence_count: dict of sequence name -> count; the entry of each
            passing participant's sequence is incremented in place.
        exclude: list of already excluded participant IDs; extended in place.

    Returns:
        (df2, exclude): df2 holds the non-catch trials of the participants
        that passed, grouped by participant in order of first appearance.
        exclude is the same list with the failing participants appended.
    '''
    catch_names = set(catch_stimuli)

    def _is_catch_trial(stim):
        # Non-string stimuli (e.g. NaN) and paths without a second component
        # cannot name a catch image. Previously the latter raised IndexError.
        if not isinstance(stim, str):
            return False
        parts = stim.split('/')
        return len(parts) > 1 and parts[1] in catch_names

    remove = []
    kept_frames = []

    for subj in df.subjID.unique():
        subj_df = df.loc[df['subjID'] == subj]
        is_catch = subj_df['stimulus'].map(_is_catch_trial).astype(bool)
        count_correct = int((is_catch & (subj_df['depth_estimate'] == 0)).sum())

        if count_correct < correct_requirement:
            remove.append(subj)
        else:
            sequence_count[subj_df.sequenceName.unique()[0]] += 1
            # keep this participant's trials, minus the catch trials
            kept_frames.append(subj_df.loc[~is_catch])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still has the input's columns when nobody passed (or df was empty).
    df2 = pd.concat(kept_frames) if kept_frames else df.iloc[0:0].copy()

    print("Number of participants that did not pass the catch trial check:", len(remove))
    print("Participants that were removed:",remove)

    # add the list of participants to be removed to the existing list of excluded participants
    exclude += remove

    return df2, exclude

In [123]:
sequences_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/jsons'
# Start every sequence file at a count of 0, keyed as 'jsons/<file name>'.
sequences_count_dict = {
    'jsons/'+seq_file: 0
    for seq_file in os.listdir(sequences_path)
    if 'json' in seq_file
}


In [124]:
# Names of the 8 catch-trial stimuli; compared against the second '/'-separated
# component of each trial's 'stimulus' path.
all_catch_stim = ['000375_2014-06-08_11-17-29_260595134347_rgbf000133-resize_2',
                  '000569_2014-06-09_22-51-47_260595134347_rgbf000141-resize_3',
                  '000787_2014-06-08_22-33-53_260595134347_rgbf000175-resize_1',
                  '002072_2014-06-24_21-48-06_260595134347_rgbf000115-resize_0',
                  '001170_2014-06-17_15-43-44_260595134347_rgbf000096-resize_6',
                  '001222_2014-06-17_16-24-06_260595134347_rgbf000073-resize_0',
                  '001498_2014-06-19_17-45-14_260595134347_rgbf000129-resize_4',
                  '001540_2014-06-20_17-01-05_260595134347_rgbf000086-resize_2']

In [125]:
# Drop catch trials and exclude participants with fewer than 6 correct catch trials.
# NOTE(review): 8 catch stimuli are defined and the requirement passed here is 6 —
# confirm this matches the pre-registered criterion.
catch_trial_cleaned_data, exclude = catchTrial_cleaning(age_cleaned_data, 6, all_catch_stim, sequences_count_dict, exclude)
len(exclude)

Number of participants that did not pass the catch trial check: 3
Participants that were removed: [959701, 713277, 602202]


3

In [126]:
def removeMissedTrials(df, exclude, num_trials):
    """
    Remove missed trials and exclude participants who missed too many.

    Participants were told that if they missed a trial, to respond '0'.
    Rows whose 'depth_estimate' equals 0 are removed, and the number of missed
    trials per participant is counted. A participant whose count reaches
    floor(num_trials * 0.1) has all remaining rows removed as well.

    Args:
        df: trial-level DataFrame with 'subjID' and 'depth_estimate' columns.
            It is modified in place and also returned.
        exclude: list of already excluded participant IDs; extended in place.
        num_trials: number of meaningful trials per participant.

    Returns:
        (df, exclude): the same DataFrame object without the removed rows, and
        the same list with the newly excluded participants appended.
    """
    missed_mask = df['depth_estimate'] == 0.0

    # Missed-trial count per participant, in order of first missed trial.
    missedTrials_participants = {}
    for subjID in df.loc[missed_mask, 'subjID']:
        missedTrials_participants[subjID] = missedTrials_participants.get(subjID, 0) + 1

    # remove participants data if the participant's missed trial count is 10% or more of num_trials
    # NOTE(review): floor() rounds the cutoff down (e.g. 15 trials for
    # num_trials=156, about 9.6%) — confirm this matches the intended criterion.
    threshold = math.floor(num_trials * 0.1)
    remove_ids = [
        subjID for subjID, n_missed in missedTrials_participants.items()
        if n_missed >= threshold
    ]
    print("Number of participants with 10% or more missed trials: ", len(remove_ids))

    # Drop every affected row in a single call rather than row by row while
    # iterating over the frame that is being modified.
    rows_to_drop = missed_mask | df['subjID'].isin(remove_ids)
    df.drop(index=df.index[rows_to_drop], inplace=True)

    exclude += remove_ids

    return df, exclude

In [127]:
# total number of meaningful trials per participant (excludes catch-trials);
# the 10% exclusion cutoffs in the cleaning steps are derived from this value
num_trials = 156

In [128]:
# Remove trials answered with 0 and exclude participants who missed too many of them.
missed_trial_cleaned_data, exclude = removeMissedTrials(catch_trial_cleaned_data,exclude, num_trials)
len(exclude)

Number of participants with 10% or more missed trials:  0


3

In [129]:
def RT_Cleaning(df, exclude, outlier_range, num_trials):
    """
    Remove trials with out-of-range response times and exclude participants
    who have too many of them.

    A trial is an outlier when its 'trial_RT' is below outlier_range[0] or
    above outlier_range[1]. A participant with floor(num_trials * 0.1) or more
    outlier trials is excluded entirely.

    Args:
        df: trial-level DataFrame with 'subjID' and 'trial_RT' columns.
            It is not modified.
        exclude: list of already excluded participant IDs; extended in place.
        outlier_range: [lower, upper] bounds of the accepted trial_RT values.
        num_trials: number of meaningful trials per participant.

    Returns:
        (df2, exclude): df2 holds the in-range trials of the participants that
        were kept, grouped by participant in order of first appearance.
        exclude is the same list with the removed participants appended.
    """
    lower_bound, upper_bound = outlier_range[0], outlier_range[1]
    # the cutoff does not depend on the participant, so compute it once
    threshold = math.floor(num_trials * 0.1)

    remove = []
    kept_frames = []
    for subj in df.subjID.unique():
        subj_df = df.loc[df['subjID'] == subj]
        trial_rt = subj_df["trial_RT"]
        is_outlier = (trial_rt < lower_bound) | (trial_rt > upper_bound)

        if int(is_outlier.sum()) >= threshold:
            remove.append(subj)
        else:
            kept_frames.append(subj_df.loc[~is_outlier])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still has the input's columns when no participant is left.
    df2 = pd.concat(kept_frames) if kept_frames else df.iloc[0:0].copy()

    print("Number of Participants with 10% or more trials outside their RT range: ", len(remove))

    exclude += remove

    return df2, exclude



In [130]:
# Trials with trial_RT below 250 or above 10000 count as outliers (units presumably ms — confirm).
RT_cleaned_data, exclude = RT_Cleaning(missed_trial_cleaned_data, exclude,[250, 10000], num_trials)
len(exclude)

Number of Participants with 10% or more trials outside their RT range:  1


4

In [131]:
def repeatResponses_Cleaning(df, exclude):
    """
    Exclude participants who gave 'junk data' - the same number repeated for
    many trials.

    For every participant the frequency of each unique 'depth_estimate' is
    counted. A participant is removed when
    - they gave fewer than 6 unique responses, or
    - their most frequent response count (max repeats) lies more than 3
      standard deviations from the mean max repeats across participants.

    NOTE(review): the lower bound also removes participants with unusually
    FEW repeats — confirm that this is intended.
    NOTE(review): earlier notes here referred to 48 images per depth bin —
    confirm whether that still applies to this experiment.

    Args:
        df: trial-level DataFrame with 'subjID' and 'depth_estimate' columns.
            It is modified in place and also returned.
        exclude: list of already excluded participant IDs; extended in place.

    Returns:
        (df, max_repeats_distribution, num_unique_responses_distribution, exclude):
        the same DataFrame object without the removed participants, the max
        repeats and the number of unique responses per participant (in order
        of first appearance), and the extended exclusion list.
    """
    #List unique values in the df['subjID'] column
    all_subjIDs = df.subjID.unique()

    remove = []
    max_repeats_distribution = []
    num_unique_responses_distribution = []
    for subj in all_subjIDs:
        count_depth_estimates = df.loc[df['subjID'] == subj, 'depth_estimate'].value_counts()
        num_unique_responses = len(count_depth_estimates)
        num_unique_responses_distribution.append(num_unique_responses)
        max_repeats_distribution.append(count_depth_estimates.max())
        if num_unique_responses < 6:
            remove.append(subj)

    # Without participants there is no distribution to compare against;
    # skipping avoids NaN bounds and numpy's empty-mean warnings.
    if len(max_repeats_distribution) > 0:
        repeats = np.array(max_repeats_distribution)
        avg_max_repeats = repeats.mean()
        std_max_repeats = repeats.std()
        lower_bound = avg_max_repeats - (3*std_max_repeats)
        upper_bound = avg_max_repeats + (3*std_max_repeats)

        # reuse the per-participant values computed above instead of
        # recounting every participant's responses
        for subj, max_repeats in zip(all_subjIDs, max_repeats_distribution):
            is_outlier = max_repeats < lower_bound or max_repeats > upper_bound
            if is_outlier and subj not in remove:
                remove.append(subj)

    print("Number of participants removed: repeat responses: ", len(remove))

    # Drop all rows of removed participants in a single call rather than row
    # by row while iterating over the frame that is being modified.
    df.drop(index=df.index[df['subjID'].isin(remove)], inplace=True)

    exclude += remove

    return df, max_repeats_distribution, num_unique_responses_distribution, exclude



In [132]:
# Exclude participants whose responses are dominated by a few repeated values.
repeat_resp_cleaned_data, max_repeats_distrib, num_unique_distrib, exclude = repeatResponses_Cleaning(RT_cleaned_data, exclude)
len(exclude)

Number of participants removed: repeat responses:  0


4

In [133]:
def finalTrialCountCheck(df, exclude, num_trials):
    """
    Exclude participants who have too few trials left after cleaning.

    A participant is removed when their remaining trial count is less than or
    equal to num_trials - floor(num_trials * 0.1), i.e. when at least
    floor(10% of num_trials) of their trials were removed earlier.

    Args:
        df: trial-level DataFrame with a 'subjID' column. It is modified in
            place and also returned.
        exclude: list of already excluded participant IDs; extended in place.
        num_trials: number of meaningful trials per participant.

    Returns:
        (df, exclude): the same DataFrame object without the removed
        participants, and the same list with those participants appended.
    """
    # the cutoff does not depend on the participant, so compute it once
    threshold_trials_remaining = num_trials - math.floor(num_trials * 0.1)

    remove = []
    #List unique values in the df['subjID'] column
    for subj in df.subjID.unique():
        count_trials = int((df['subjID'] == subj).sum())
        if count_trials <= threshold_trials_remaining:
            remove.append(subj)

    print("Number of Participants with >= 10% trials removed: ", len(remove))

    # Drop all rows of removed participants in a single call rather than row
    # by row while iterating over the frame that is being modified.
    df.drop(index=df.index[df['subjID'].isin(remove)], inplace=True)

    exclude += remove

    print("Number of participants left: ",len(df.subjID.unique()))
    return df, exclude

In [134]:
# Final pass: drop participants who lost too many trials in the earlier cleaning steps.
cleaned_data, exclude = finalTrialCountCheck(repeat_resp_cleaned_data, exclude, num_trials)
len(exclude)

Number of Participants with >= 10% trials removed:  0
Number of participants left:  5


4

In [135]:
def get_sequences_for_replacement(exclude, datafolder):
    """
    Look up the sequence of every excluded participant.

    Args:
        exclude: participant IDs (subjID values) that were excluded. Repeated
            IDs are only counted once.
        datafolder: directory of participant CSVs or a single CSV file; the
            original data is re-read from here with combineCSVs.

    Returns:
        List with one 'sequenceName' per excluded participant, in order of
        each participant's first appearance in exclude.
    """
    og_data = combineCSVs(datafolder)

    # The exclusion list can name the same participant more than once (the
    # cleaning steps append to it without checking for existing entries).
    # dict.fromkeys drops the repeats while keeping first-seen order, so each
    # participant's sequence is replaced exactly once.
    unique_excluded = list(dict.fromkeys(exclude))

    seqs_to_be_replaced = []
    for subjID in unique_excluded:
        subjdf = og_data.loc[og_data['subjID'] == subjID]
        seqs_to_be_replaced.append(subjdf.sequenceName.unique()[0])
    print('Number of Sequences to be replaced:', len(seqs_to_be_replaced))
    return seqs_to_be_replaced

In [136]:
# Sequence of every excluded participant (re-reads the original CSVs from data_path).
sequences_to_replace = get_sequences_for_replacement(exclude, data_path)

Number of participants before cleaning:  9
Number of Sequences to be replaced: 4


In [137]:
# Counterbalancing table: its 'Path' column holds the sequence JSON paths.
counterbalancing_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/counterbalancing.csv'
counterbalancing_df = pd.read_csv(counterbalancing_path)
counterbalancing_df

Unnamed: 0,Path,Sampled
0,jsons/VE125_randls_0_rotated.json,0
1,jsons/0908_VE250_randls_0_rotated.json,0
2,jsons/0914_VE1000_randls_0_rotated.json,0


In [138]:
# Peek at the first sequence to replace (raises IndexError when nothing was excluded).
sequences_to_replace[0]

'jsons/VE125_randls_0_rotated.json'

In [139]:
# URL fragment of a sequence = position of its row in counterbalancing_df + 1.
# NOTE(review): assumes counterbalancing_df keeps the default 0-based row index — confirm.
url_fragments = []
for sequence in sequences_to_replace:
    matching_rows = counterbalancing_df.index[counterbalancing_df['Path']==sequence]
    if len(matching_rows) == 0:
        # Fail with the offending name instead of a bare IndexError.
        raise ValueError('Sequence not found in counterbalancing file: ' + str(sequence))
    url_fragments.append(matching_rows[0] + 1)

In [143]:
# number for the NEXT batch; used as the suffix of the variables CSV file name
batch = 1

In [141]:
dest_variables_csv = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/mturk_batch_variables/'

base_url = 'http://54.235.29.9/FacialAge/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/v2_DepthDuration_HTML.html#'

# One experiment URL per sequence to re-sample: the base URL followed by the fragment.
variables = {
    'experiment_url': [base_url + str(url_fragment) for url_fragment in url_fragments]
}

variables_df = pd.DataFrame(variables)

In [142]:
# Write the URLs for the next batch, e.g. depth_duration_variables_1.csv when batch is 1.
variables_df.to_csv(dest_variables_csv + 'depth_duration_variables' + '_' + str(batch) + '.csv', index=False)
