# Discrimination: Participant Exclusion and Replacement Pipeline

1. The counterbalancing.csv contains every json that needs to be sampled for the discrimination and verbal judgement experiment (includes all durations) 
    - The row number for each sequence corresponds to the url fragment used in the variables file uploaded to Mechanical Turk 
    - This file does not change; the variables file is updated to resample sequences that get excluded
2. First, the worker IDs reported by participants need to be matched to the worker IDs reported in the batch data
    - All data files downloaded from the server need to be matched to a worker ID in batch data
    - Data files that do not have a matched worker ID are moved to a separate folder and are not analyzed
3. Participant exclusion criteria are pre-registered on OSF (https://osf.io/28vjd) - if a participant is excluded, their counterbalanced sequence needs to be replaced in the variables file
4. All participants who have completed the experiment need to be excluded from completing future HITs (exclude_workers.csv)

In [402]:
import os
import json 
import pandas as pd 
import numpy as np 
import math
import shutil

## Get all Worker IDs from Batch data 

In [403]:
batch_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/all_batch'


In [404]:
# Gather the WorkerId column of every batch file into one flat list.
all_batch_worker_ids = []
for path in os.listdir(batch_path):
    if 'csv' not in path:
        continue  # skip entries whose name does not contain 'csv'
    batch_data = pd.read_csv(batch_path + '/' + path)
    batch_worker_ids = list(batch_data['WorkerId'])
    all_batch_worker_ids.extend(batch_worker_ids)

# Report how many distinct worker IDs appear across all batch files.
print(len(set(all_batch_worker_ids)))


1061


## Get all Worker IDs from Data Files 


In [405]:
datapath = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/data'

# For every data file, record the first workerId it contains, both as a flat
# list and as a workerId -> filename lookup.
worker_ids_from_data = []
workerid_filename_dict = {}

for file in os.listdir(datapath):
    if 'csv' not in file:
        continue  # skip entries whose name does not contain 'csv'
    path = datapath + "/" + file
    df = pd.read_csv(path, index_col=None, header=0)
    first_worker_id = df.workerId.unique()[0]
    worker_ids_from_data.append(first_worker_id)
    # NOTE: a later file with the same workerId overwrites the earlier entry
    workerid_filename_dict[first_worker_id] = file



In [406]:
set(worker_ids_from_data) == set(all_batch_worker_ids)

False

In [407]:
# Worker IDs that appear in the data files but NOT in the batch data.
# The matching data files should be moved to an archive and NOT analyzed.
batchdata_workerIDs = set(all_batch_worker_ids)
missing_wid = {wid for wid in worker_ids_from_data if wid not in batchdata_workerIDs}
print(len(missing_wid))

# Look up the data file that belongs to each unmatched worker ID.
move_files = [workerid_filename_dict[wid] for wid in missing_wid]
print(len(move_files))

2
2


In [80]:
# current_dir = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/data'
# dest_dir = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/data_archive'

# for file in move_files:
#     shutil.move(current_dir + '/' + file, dest_dir + '/' + file)

# Participant Exclusion

In [408]:
def combineCSVs(datafolder, num_discrim_trials):
    """
    Combine participant data into one pandas df
    OR
    Create df for single participant file

    If `datafolder` is a directory, every entry whose name contains 'csv' is
    read (in sorted filename order, so the result is reproducible). If it is a
    single file whose path contains 'csv', only that file is read. In both
    cases only the first `num_discrim_trials` rows of each file are kept.

    args:
        datafolder: path to a directory of participant CSVs or to one CSV file
        num_discrim_trials: number of leading rows to keep from each file

    returns:
        combined dataframe of the retained rows, re-indexed from 0

    raises:
        FileNotFoundError: `datafolder` is neither an existing file nor directory
        ValueError: no CSV file was found at `datafolder`
    """
    if os.path.isdir(datafolder):
        csv_paths = [datafolder + "/" + filename
                     for filename in sorted(os.listdir(datafolder))
                     if 'csv' in filename]
    elif os.path.isfile(datafolder):
        csv_paths = [datafolder] if 'csv' in datafolder else []
    else:
        raise FileNotFoundError("No such file or directory: " + str(datafolder))

    if not csv_paths:
        # pd.concat would fail with a less helpful message on an empty list
        raise ValueError("No CSV files found at: " + str(datafolder))

    discrim_data = []
    for path in csv_paths:
        df = pd.read_csv(path, index_col=None, header=0)
        discrim_data.append(df[0:num_discrim_trials])

    return pd.concat(discrim_data, axis=0, ignore_index=True)

In [409]:
data_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/data'
num_total_trials = 86

In [410]:
raw_discrim = combineCSVs(data_path, num_total_trials)


In [411]:
# raw_discrim

In [412]:
all_subjIDs = raw_discrim.subjID.unique()
len(all_subjIDs)

1054

In [413]:
all_ages = raw_discrim.age.unique()
all_ages

array([  24,   28,   31,   38,   45,   33,   35,   34,   55,   27,   32,
         40,   51,   58,   29,   36,   46,   25,   42,   48,   30,   37,
         41,   26,   53,   50,   64, 1997,   61,   57,   47,   22,   23,
         70, 1970, 1994,   56,   59,   52,   44,   21,   43,   69, 1974,
         62,   49, 1990,   65, 1989,   39, 1960, 1965,   60, 1976,   63,
         19,   54, 1985,   67,   20, 1971, 1963,   66, 1980,   77,   76,
         71, 1973,   68, 1966, 1959, 1964])

## <font color='red'> Data Cleaning </font> 



In [414]:
def catchTrial_cleaning(df, correct_requirement, catch_stimuli):
    '''
    Participants complete catch trials to ensure that they are doing the task
    (8 per participant in this experiment). If fewer than `correct_requirement`
    catch trials are correct (e.g. fewer than 6 of 8), the participant is
    excluded.

    A trial counts as a catch trial when stimulus_0 is a string and the second
    '/'-separated component of stimulus_0 or stimulus_1 is in `catch_stimuli`.
    A catch trial is correct when discrim_choice == 3 (catch trials show the
    same image twice; presumably 3 is the "same" response - confirm against
    the experiment code).

    args:
        df: trial-level dataframe with columns subjID, stimulus_0, stimulus_1
            and discrim_choice
        correct_requirement: minimum number of correct catch trials to pass
        catch_stimuli: names of the catch-trial stimuli

    returns:
        new dataframe containing only participants who passed, with their
        catch trials removed; rows are grouped by participant in order of first
        appearance and keep their original index labels
    '''
    catch_names = set(catch_stimuli)

    def _stim_name(stim_path):
        # Rows hit by the PHP data-saving problem hold a non-string (NaN) here,
        # and a malformed path may have no second component: neither has a name.
        if not isinstance(stim_path, str):
            return None
        parts = stim_path.split('/')
        return parts[1] if len(parts) > 1 else None

    # Flag catch trials once, positionally, so that duplicate index labels in
    # df can never cause another participant's rows to be dropped.
    is_catch = np.array(
        [isinstance(stim0, str)
         and (_stim_name(stim0) in catch_names or _stim_name(stim1) in catch_names)
         for stim0, stim1 in zip(df['stimulus_0'], df['stimulus_1'])],
        dtype=bool)
    is_correct = is_catch & (df['discrim_choice'] == 3).to_numpy()

    subj_ids = df['subjID']
    remove = []
    kept = []
    for subj in subj_ids.unique():
        subj_rows = (subj_ids == subj).to_numpy()
        if is_correct[subj_rows].sum() < correct_requirement:
            remove.append(subj)
            continue
        # keep this participant's trials, minus the catch trials
        kept.append(df[subj_rows & ~is_catch])

    print("Number of participants that did not pass the catch trial check:", len(remove))

    if not kept:
        # nobody passed (or df was empty): return an empty frame, same columns
        return df.iloc[0:0].copy()
    return pd.concat(kept)
    
    

In [415]:
sequences_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/discrim_jsons'

# Start every entry in sequences_path whose name contains 'json' at a count of zero.
sequences_count_dict = {
    'discrim_jsons/' + seq_file: 0
    for seq_file in os.listdir(sequences_path)
    if 'json' in seq_file
}


In [416]:
all_catch_stim = ['000375_2014-06-08_11-17-29_260595134347_rgbf000133-resize_2',
                  '000569_2014-06-09_22-51-47_260595134347_rgbf000141-resize_3',
                  '000787_2014-06-08_22-33-53_260595134347_rgbf000175-resize_1',
                  '002072_2014-06-24_21-48-06_260595134347_rgbf000115-resize_0',
                  '001170_2014-06-17_15-43-44_260595134347_rgbf000096-resize_6',
                  '001222_2014-06-17_16-24-06_260595134347_rgbf000073-resize_0',
                  '001498_2014-06-19_17-45-14_260595134347_rgbf000129-resize_4',
                  '001540_2014-06-20_17-01-05_260595134347_rgbf000086-resize_2']

In [417]:
catch_cleaned_discrim = catchTrial_cleaning(raw_discrim, 6, all_catch_stim)

Number of participants that did not pass the catch trial check: 574


In [418]:
step1_cleaned_data = catch_cleaned_discrim.copy(deep=True)

In [419]:
cleaned_ages = catch_cleaned_discrim.age.unique()
cleaned_ages

array([  28,   31,   38,   45,   33,   35,   34,   27,   40,   51,   58,
         29,   36,   42,   55,   37,   25,   26,   50,   53,   61,   22,
         46,   24, 1970,   30,   41,   56,   52,   44,   32,   43,   69,
         47,   48,   62,   49,   64, 1989,   39,   21,   60,   63,   65,
         57,   20,   67,   59,   66, 1980,   77,   54,   76,   23,   68])

In [420]:
def RT_cleaning(df, outlier_range, num_trials):
    """
    Remove trials whose RT falls outside `outlier_range`, and exclude any
    participant who loses too many trials that way.

    A trial is an outlier when trial_RT < outlier_range[0] or
    trial_RT > outlier_range[1]. A participant is excluded when their number
    of outlier trials is >= floor(num_trials * 0.1).

    args:
        df: trial-level dataframe with columns subjID and trial_RT
        outlier_range: [minimum RT, maximum RT], same units as trial_RT
        num_trials: number of trials each participant is expected to have

    returns:
        new dataframe with outlier trials and excluded participants removed;
        rows are grouped by participant in order of first appearance and keep
        their original index labels
    """
    min_RT, max_RT = outlier_range[0], outlier_range[1]
    trial_RTs = df['trial_RT']
    # Positional mask: safe even if df carries duplicate index labels.
    # A missing (NaN) RT compares False on both sides, so it is kept.
    is_outlier = ((trial_RTs < min_RT) | (trial_RTs > max_RT)).to_numpy()

    threshold = math.floor(num_trials * 0.1)

    subj_ids = df['subjID']
    remove = []
    kept = []
    for subj in subj_ids.unique():
        subj_rows = (subj_ids == subj).to_numpy()
        if is_outlier[subj_rows].sum() >= threshold:
            remove.append(subj)
            continue
        kept.append(df[subj_rows & ~is_outlier])

    print("Number of Participants with 10% or more trials outside their RT range:", len(remove))

    if not kept:
        # everyone was excluded (or df was empty): empty frame, same columns
        return df.iloc[0:0].copy()
    return pd.concat(kept)


In [421]:
cleaned_discrim = RT_cleaning(catch_cleaned_discrim, [250,10000], 78)

step2_cleaned_data = cleaned_discrim.copy(deep=True)

Number of Participants with 10% or more trials outside their RT range: 15


In [422]:
def finalTrialCountCheck(df, num_trials):
    """
    Remove participants who have too few trials left after cleaning.

    A participant is removed when their remaining trial count is
    <= num_trials - floor(num_trials * 0.1), i.e. when at least
    floor(num_trials * 0.1) of the expected trials are missing
    (for num_trials = 78: removed when 71 or fewer trials remain).

    The input dataframe is not modified; a filtered copy is returned.

    args:
        df: trial-level dataframe with a subjID column
        num_trials: number of trials each participant is expected to have

    returns:
        copy of df without the removed participants (index labels preserved)
    """
    threshold_trials_remaining = num_trials - math.floor(num_trials * 0.1)

    # number of rows left for each participant
    trials_per_subj = df['subjID'].value_counts(sort=False)
    remove = list(trials_per_subj.index[trials_per_subj <= threshold_trials_remaining])
    print("Number of Participants with >= 10% trials removed:", len(remove))

    # Filter with a positional mask rather than dropping by index label, so
    # duplicate labels cannot remove rows of participants who should be kept.
    keep_rows = ~df['subjID'].isin(remove).to_numpy()
    kept = df[keep_rows].copy()

    print("Number of participants left:",len(kept.subjID.unique()))

    return kept

In [424]:
final_discrim = finalTrialCountCheck(cleaned_discrim, 78)
final_data = final_discrim.copy(deep=True)

Number of Participants with >= 10% trials removed: 0
Number of participants left: 465


In [425]:
final_participant_count = len(final_data.subjID.unique())
final_participant_count

465

## Update Sequence Sampling 

In [426]:
# set the version for the sequence tracking 

prev_version = 'v9'
new_version = 'v10'

# select path for the last previous sequence tracking file 

sequence_sampling_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/sequence_tracking/'+ prev_version + '_Discrim_master_sequence_tracking.json'


In [427]:
# new_sequence_sampling

In [428]:

# Load the previous version's tracking file: a dictionary mapping each
# sequence file name to the list of subject IDs recorded for it.
# The context manager guarantees the file handle is closed.
with open(sequence_sampling_path) as f:
    sequence_sampling = json.load(f)

# print number of sequences that have been sampled by the previous batch
prev_sampled_count = 0
for seq in sequence_sampling:
    if len(sequence_sampling[seq]) > 0:
        prev_sampled_count += 1
print('Number of sequences previously sampled: ', prev_sampled_count)

# Copy the dictionary AND each per-sequence list, so that updating the new
# record does not silently rewrite the previous version's record.
new_sequence_sampling = {seq: list(subj_ids) for seq, subj_ids in sequence_sampling.items()}

# update sequence sampling dictionary
for subj in final_data.subjID.unique():
    subj_df = final_data.loc[final_data['subjID'] == subj]
    # sequenceName looks like 'discrim_jsons/<file>.json'; the tracking keys
    # are the bare file names
    subj_seq = subj_df.sequenceName.unique()[0].split('/')[1]
    # add subj to list for its corresponding sequence
    new_sequence_sampling[subj_seq].append(str(subj))

# total number of sequences expected in the tracking file
expected_total_sequences = 468

sampled_count = 0
unsampled_count = 0
for seq in new_sequence_sampling:
    if len(new_sequence_sampling[seq]) > 0:
        # remove duplicates of the same id; dict.fromkeys keeps first-seen
        # order, so the written JSON is identical from run to run
        new_sequence_sampling[seq] = list(dict.fromkeys(new_sequence_sampling[seq]))
        sampled_count += 1
    else:
        unsampled_count += 1
        new_sequence_sampling[seq] = []

print('Number of sequences sampled now: ', sampled_count, len(final_data.sequenceName.unique()))

print('Number of sequences to be sampled: ', unsampled_count)

print('Number sampled + to be sampled = 468: ', sampled_count + unsampled_count == expected_total_sequences)


seq_track_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/sequence_tracking/'

# write the updated record under the new version name
with open(seq_track_path + new_version + "_Discrim_master_sequence_tracking.json", "w") as outfile:
    json.dump(new_sequence_sampling, outfile)
    

Number of sequences previously sampled:  464
Number of sequences sampled now:  465 465
Number of sequences to be sampled:  3
Number sampled + to be sampled = 468:  True


In [358]:
# new_sequence_sampling

## Find sequences to replace

In [429]:
# Sequences with no subject recorded against them still need to be run.
sequences_to_replace = [
    seq_key
    for seq_key, sampled_by in new_sequence_sampling.items()
    if len(sampled_by) == 0
]

# sanity check: the list length should equal the unsampled count from above
len(sequences_to_replace), unsampled_count == len(sequences_to_replace)

(3, True)

## Create new batch variables file

In [430]:
counterbalancing_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/counterbalancing.csv'
counterbalancing_df = pd.read_csv(counterbalancing_path)
counterbalancing_df

Unnamed: 0,Path,Sampled
0,discrim_jsons/Discrim250_randls_70_rotated.json,0
1,discrim_jsons/Discrim250_randls_25_rotated.json,0
2,discrim_jsons/Discrim125_randls_17.json,0
3,discrim_jsons/Discrim125_randls_5_rotated.json,0
4,discrim_jsons/Discrim1000_randls_20.json,0
...,...,...
463,discrim_jsons/Discrim250_randls_44_rotated.json,0
464,discrim_jsons/Discrim250_randls_71.json,0
465,discrim_jsons/Discrim1000_randls_58.json,0
466,discrim_jsons/Discrim250_randls_16_rotated.json,0


In [431]:
# cross-check with server console log
sequences_to_replace

['Discrim250_randls_0.json',
 'Discrim125_randls_73_rotated.json',
 'Discrim125_randls_19_rotated.json']

In [432]:
# Tally the sequences to replace by the duration tag in their file name.
count_1000 = sum('Discrim1000' in seq for seq in sequences_to_replace)
count_250 = sum('Discrim250' in seq for seq in sequences_to_replace)
count_125 = sum('Discrim125' in seq for seq in sequences_to_replace)

count_1000, count_250, count_125

(0, 1, 2)

## Indexing Notes

The row number in the counterbalancing csv does NOT match the url fragment, since the csv's row numbering includes the header row (the row containing `Path`).

The url fragment is the counterbalancing df index + 1 --> this has been validated in the console log of the experiment

To backtrack from a url fragment to the corresponding row of the counterbalancing csv: row = url_fragment + 1 (equivalently, counterbalancing df index = url_fragment - 1)

In [433]:
# url fragment = counterbalancing df index of the sequence's Path + 1
counterbalancing_paths = counterbalancing_df['Path']
url_fragments = []
for sequence in sequences_to_replace:
    matching_rows = counterbalancing_df.index[counterbalancing_paths == 'discrim_jsons/' + sequence]
    url_fragments.append(matching_rows[0] + 1)

In [434]:
# number for the NEXT batch 
batch = 13

In [435]:
dest_variables_csv = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/mturk_batch_variables/'

base_url = 'http://54.235.29.9/FacialAge/BNav_EC2/DepthDuration/v2_depth_discrimination_MTurk/v2_DepthDiscrim_HTML.html#'

# One experiment URL per sequence that needs resampling: base URL + fragment.
variables = {
    'experiment_url': [base_url + str(fragment) for fragment in url_fragments],
}

variables_df = pd.DataFrame(variables)

In [436]:
variables_df.to_csv(dest_variables_csv + 'depth_discrimination_variables' + '_b' + str(batch) + '.csv', index=False)
