# VE Participant Exclusion and Sequence Re-sampling

In [1]:
import os
import json 
import numpy as np 
import pandas as pd
import math 
import scipy 

In [2]:
def combineCSVs(datafolder):
    """
    Combine all participant data into one pandas df
    OR
    create a df from a single participant file.

    Args:
        datafolder = path to a directory containing participant CSV files,
                     or path to a single CSV file

    returns:
        input_frame = df with the rows of every CSV that was read
                      (directory case: rows are concatenated and re-indexed 0..n-1)

    raises:
        FileNotFoundError if datafolder is neither an existing directory nor file
        ValueError if datafolder holds no CSV file / is not a CSV file
    """
    folder = os.fspath(datafolder)

    if os.path.isdir(folder):
        # Match on the extension: a substring test ('csv' in name) would also pick up
        # files such as 'csv_notes.txt'. Sorted so the row order is reproducible,
        # since os.listdir returns names in arbitrary order.
        csv_names = sorted(name for name in os.listdir(folder)
                           if name.lower().endswith('.csv'))
        if not csv_names:
            raise ValueError('No CSV files found in directory: ' + folder)

        data = [pd.read_csv(os.path.join(folder, name), index_col=None, header=0)
                for name in csv_names]
        input_frame = pd.concat(data, axis=0, ignore_index=True)

    elif os.path.isfile(folder):
        if not folder.lower().endswith('.csv'):
            raise ValueError('Expected a CSV file, got: ' + folder)
        input_frame = pd.read_csv(folder, index_col=None, header=0)

    else:
        # previously this case fell through to an UnboundLocalError on input_frame
        raise FileNotFoundError('No such file or directory: ' + folder)

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame


def feet_to_meters(ft):
    """
    Convert a length from feet to meters.

    Args:
        ft = value in feet

    returns:
        the value multiplied by 0.3048 (meters per foot)
    """
    meters_per_foot = 0.3048
    return ft * meters_per_foot

def getUnitConveredData(datafolder):
    """
    Load the participant data and express every depth estimate in meters.

    Args:
        datafolder = directory of participant CSVs or a single CSV (see combineCSVs)

    returns:
        input_data = df in which 'depth_estimate' has been converted to meters for
                     every row whose 'unitSelection' is 'feet'. The 'unitSelection'
                     column itself is left as it was.
    """
    input_data = combineCSVs(datafolder) # combine CSVs from all participants

    # rows where the estimate was made in feet
    in_feet = input_data['unitSelection'] == 'feet'

    if in_feet.any():
        # converted values are fractional, so an integer column has to become float
        # before they are written back
        if pd.api.types.is_integer_dtype(input_data['depth_estimate']):
            input_data['depth_estimate'] = input_data['depth_estimate'].astype(float)

        # convert all 'feet' rows in one vectorized step instead of row by row
        input_data.loc[in_feet, 'depth_estimate'] = feet_to_meters(
            input_data.loc[in_feet, 'depth_estimate'])

    return input_data

def cleanAgeResponses(datafolder):
    """
    Load the unit-converted participant data.

    Age responses are currently passed through unchanged: this function only
    forwards the result of getUnitConveredData.

    Args:
        datafolder = directory of participant CSVs or a single CSV (see combineCSVs)

    returns:
        input_data = df returned by getUnitConveredData
    """
    # NOTE: an age clean-up used to run here and is disabled. It replaced ages > 100
    # (treated as a year of birth) with 2022 - age, and set ages between 90 and 100
    # to None. The loop that remained only read row['age'] and discarded it, so it
    # has been removed; re-introduce the clean-up here if it is needed again.
    return getUnitConveredData(datafolder)

def catchTrial_cleaning(datafolder, correct_requirement, catch_stimuli):
    '''
    Participants complete catch trials to ensure that they are doing the task.
    A catch trial counts as correct when its depth estimate is 0.
    If less than correct_requirement catch trials are correct, the participant is excluded.
    Catch trials themselves are removed from the returned data.

    Args:
        datafolder = directory of participant CSVs or a single CSV (see combineCSVs)
        correct_requirement = minimum number of correct catch trials needed to keep a participant
        catch_stimuli = collection of catch stimulus names; a trial is a catch trial when
                        the second '/'-separated part of its 'stimulus' value is in this collection

    returns:
        df2 = df without catch trials and without excluded participants
              (rows grouped by participant, original index labels kept)
    '''
    df = cleanAgeResponses(datafolder)

    def _is_catch(stim):
        # non-string stimuli (e.g. NaN) and stimuli without a '/' cannot be catch trials
        if not isinstance(stim, str):
            return False
        parts = stim.split('/')
        return len(parts) > 1 and parts[1] in catch_stimuli

    is_catch = df['stimulus'].map(_is_catch).astype(bool)
    ####### VERSION WHERE CATCH TRIALS ARE ATTENTION CHECK: IMAGE HAS NO TARGET
    is_correct_catch = is_catch & (df['depth_estimate'] == 0)

    remove = []
    df2_list = []
    for subj in df.subjID.unique():
        subj_rows = df['subjID'] == subj
        count_correct = int((subj_rows & is_correct_catch).sum())

        if count_correct < correct_requirement:
            remove.append(subj)
        else:
            # keep the participant, minus their catch trials
            df2_list.append(df.loc[subj_rows & ~is_catch])

    print("Number of participants that did not pass the catch trial check:", len(remove))

    if not df2_list:
        # nobody passed (or there was no data): return an empty frame with the same columns
        return df.iloc[0:0].copy()

    return pd.concat(df2_list)
    

In [3]:
# Names of the 8 catch-trial stimuli. Passed as catch_stimuli to catchTrial_cleaning,
# which compares them against the second '/'-separated part of each trial's 'stimulus'.
all_catch_stim = ['000375_2014-06-08_11-17-29_260595134347_rgbf000133-resize_2',
                  '000569_2014-06-09_22-51-47_260595134347_rgbf000141-resize_3',
                  '000787_2014-06-08_22-33-53_260595134347_rgbf000175-resize_1',
                  '002072_2014-06-24_21-48-06_260595134347_rgbf000115-resize_0',
                  '001170_2014-06-17_15-43-44_260595134347_rgbf000096-resize_6',
                  '001222_2014-06-17_16-24-06_260595134347_rgbf000073-resize_0',
                  '001498_2014-06-19_17-45-14_260595134347_rgbf000129-resize_4',
                  '001540_2014-06-20_17-01-05_260595134347_rgbf000086-resize_2']

In [163]:
# Location of this batch's participant data: a directory of CSVs or a single CSV file,
# as accepted by combineCSVs. Machine-specific absolute path - edit before running elsewhere.
data_path = '/Users/pmahableshwarkar/Downloads/v2_depth_duration_MTurk/temp_b3'

In [164]:
# Step 1: load the data, then drop catch trials and participants with fewer than 6 correct catch trials
catch_trial_cleaned_data = catchTrial_cleaning(data_path, 6, all_catch_stim)
# deep copy so that the next cleaning step works on its own frame
step1_cleaned_data = catch_trial_cleaned_data.copy(deep=True)

Number of participants before cleaning:  134
Number of participants that did not pass the catch trial check: 54


In [165]:
def removeMissedTrials(input_data, num_trials=156, max_missed_fraction=0.1):
    """
    Participants were told that if they missed a trial, to respond '0'.
    This function removes those trials (depth_estimate == 0) and then removes every
    participant whose number of missed trials is at least
    floor(num_trials * max_missed_fraction).

    Run the catch trial check FIRST: correct catch trials also have an estimate of 0.

    Args:
        input_data = df with 'subjID' and 'depth_estimate' columns
        num_trials = number of trials each participant completes (default 156)
        max_missed_fraction = fraction of num_trials at which a participant is removed (default 0.1)

    returns:
        input_data = the SAME df object, with the rows dropped in place
    """
    missed = input_data['depth_estimate'] == 0.0

    # number of missed trials per participant
    # ** If the participant had no missed trials, then ID will not show up here
    missedTrials_participants = input_data.loc[missed, 'subjID'].value_counts()

    # remove participants data if the participant's missed trial count reaches the threshold
    threshold = math.floor(num_trials * max_missed_fraction)
    remove_ids = missedTrials_participants[missedTrials_participants >= threshold].index
    print(f"Number of participants with {max_missed_fraction:.0%} or more missed trials: ", len(remove_ids))

    from_removed_participant = input_data['subjID'].isin(remove_ids)

    # one in-place drop for both the missed trials and the removed participants,
    # instead of dropping row by row while iterating over the frame
    input_data.drop(index=input_data.index[missed | from_removed_participant], inplace=True)

    return input_data

In [166]:
# Step 2: drop missed trials (depth_estimate == 0) and participants with too many of them.
# NOTE: removeMissedTrials drops the rows from step1_cleaned_data in place and returns that same object.
missed_trial_cleaned_data = removeMissedTrials(step1_cleaned_data)
# deep copy so that the next cleaning step works on its own frame
step2_cleaned_data = missed_trial_cleaned_data.copy(deep=True)

Number of participants with 10% or more missed trials:  5


In [167]:
def RT_Cleaning(df, outlier_range, num_trials=156, max_outlier_fraction=0.1):
    """
    Remove trials whose 'trial_RT' lies outside outlier_range, and remove every
    participant who has at least floor(num_trials * max_outlier_fraction) such trials.

    Args:
        df = df with 'subjID' and 'trial_RT' columns (not modified)
        outlier_range = [lowest accepted RT, highest accepted RT]; both bounds are kept
        num_trials = number of trials each participant completes (default 156)
        max_outlier_fraction = fraction of num_trials at which a participant is removed (default 0.1)

    returns:
        df2 = new df with the remaining trials
              (rows grouped by participant, original index labels kept)
    """
    lowest_RT, highest_RT = outlier_range[0], outlier_range[1]

    # a trial with a missing RT is neither below nor above the range, so it is kept
    is_outlier = (df["trial_RT"] < lowest_RT) | (df["trial_RT"] > highest_RT)
    threshold = math.floor(num_trials * max_outlier_fraction)

    remove = []
    df2_list = []
    for subj in df.subjID.unique():
        subj_rows = df['subjID'] == subj
        count = int((subj_rows & is_outlier).sum())

        if count >= threshold:
            remove.append(subj)
        else:
            # keep the participant, minus their outlier trials
            df2_list.append(df.loc[subj_rows & ~is_outlier])

    print(f"Number of Participants with {max_outlier_fraction:.0%} or more trials outside their RT range: ", len(remove))

    if not df2_list:
        # nobody left (or there was no data): return an empty frame with the same columns
        return df.iloc[0:0].copy()

    return pd.concat(df2_list)


In [168]:
# Step 3: drop trials with trial_RT below 250 or above 10000, and participants with too many such trials
RT_cleaned_data = RT_Cleaning(step2_cleaned_data, [250, 10000])
# deep copy so that the next cleaning step works on its own frame
step3_cleaned_data = RT_cleaned_data.copy(deep=True)

Number of Participants with 10% or more trials outside their RT range:  10


In [169]:
def repeatResponses_Cleaning(df, min_unique_responses=6, n_std=3):
    """
    Some participants give 'junk data' - same number repeated for many trials.
    Count the frequency of unique responses entered by each participant and remove
    a participant when either
    (1) they gave fewer than min_unique_responses different depth estimates, or
    (2) the count of their most repeated estimate is more than n_std standard
        deviations away from the mean of that count over all participants.

    Args:
        df = df with 'subjID' and 'depth_estimate' columns
        min_unique_responses = minimum number of different estimates required (default 6)
        n_std = width of the accepted range for the max repeat count, in standard deviations (default 3)

    returns:
        df = the SAME df object, with removed participants dropped in place
        max_repeats_distribution = per participant, count of their most frequent estimate
        num_unique_responses_distribution = per participant, number of different estimates
        (both lists are in order of first appearance of the participant in df and are
         computed before any participant is removed)
    """
    #List unique values in the df['subjID'] column
    all_subjIDs = df.subjID.unique()

    max_repeats_distribution = []
    num_unique_responses_distribution = []
    for subj in all_subjIDs:
        count_depth_estimates = df.loc[df['subjID'] == subj, 'depth_estimate'].value_counts()
        num_unique_responses_distribution.append(len(count_depth_estimates))
        max_repeats_distribution.append(count_depth_estimates.max())

    remove = []
    if len(all_subjIDs) > 0:
        # the accepted range is the same for every participant, so compute it once
        avg_max_repeats = np.array(max_repeats_distribution).mean()
        std_max_repeats = np.array(max_repeats_distribution).std()
        outlierrange = [avg_max_repeats - (n_std*std_max_repeats), avg_max_repeats + (n_std*std_max_repeats)]

        for subj, num_unique_responses, max_repeats in zip(
                all_subjIDs, num_unique_responses_distribution, max_repeats_distribution):
            too_few_unique = num_unique_responses < min_unique_responses
            outside_range = max_repeats < outlierrange[0] or max_repeats > outlierrange[1]
            if too_few_unique or outside_range:
                remove.append(subj)

    print("Number of participants removed: repeat responses: ", len(remove))

    # one in-place drop instead of dropping row by row while iterating over the frame
    df.drop(index=df.index[df['subjID'].isin(remove)], inplace=True)

    return df, max_repeats_distribution, num_unique_responses_distribution


In [170]:
# Step 4: drop participants who repeat the same response too often.
# NOTE: repeatResponses_Cleaning drops the rows from step3_cleaned_data in place and returns that same object,
# together with the per-participant max-repeat counts and unique-response counts.
repeat_resp_cleaned_data, max_repeats_distrib, num_unique_distrib = repeatResponses_Cleaning(step3_cleaned_data)
# deep copy so that the next cleaning step works on its own frame
step4_cleaned_data = repeat_resp_cleaned_data.copy(deep=True)

Number of participants removed: repeat responses:  14


In [173]:
def finalTrialCountCheck(df, num_trials=156, max_missing_fraction=0.1):
    """
    If 10% or more of a participant's trials have been removed, remove the participant.

    A participant is removed when the number of trials they have left is at most
    num_trials - floor(num_trials * max_missing_fraction)
    (with the defaults: 141 or fewer trials left).

    Args:
        df = df with a 'subjID' column, one row per remaining trial
        num_trials = number of trials each participant completes (default 156)
        max_missing_fraction = fraction of num_trials at which a participant is removed (default 0.1)

    returns:
        df = the SAME df object, with removed participants dropped in place
    """
    threshold_trials_remaining = num_trials - math.floor(num_trials * max_missing_fraction)

    # number of trials left per participant
    count_trials = df['subjID'].value_counts()
    remove = count_trials[count_trials <= threshold_trials_remaining].index

    print(f"Number of Participants with >= {max_missing_fraction:.0%} trials removed: ", len(remove))

    # one in-place drop instead of dropping row by row while iterating over the frame
    df.drop(index=df.index[df['subjID'].isin(remove)], inplace=True)

    print("Number of participants left: ",len(df.subjID.unique()))

    return df


In [174]:
# Step 5: drop participants who lost too many trials over the previous steps.
# NOTE: finalTrialCountCheck drops the rows from step4_cleaned_data in place and returns that same object.
cleaned_data = finalTrialCountCheck(step4_cleaned_data)

Number of Participants with >= 10% trials removed:  0
Number of participants left:  51


In [175]:
# independent copy of the fully cleaned data; used for the sequence tracking update below
final_data = cleaned_data.copy(deep=True)

In [176]:
# number of participants left after all cleaning steps; used in the sequence count check below
final_participant_count = len(final_data.subjID.unique())
final_participant_count

51

## Update Sequence Sampling 

In [177]:
# versions of the sequence tracking file: the one to read and the one to write
prev_version, new_version = 'v2', 'v3'

# path of the tracking file written for the previous version
sequence_sampling_path = (
    '/Users/pmahableshwarkar/Downloads/v2_depth_duration_MTurk/sequence_tracking/'
    f'{prev_version}_VE_master_sequence_tracking.json'
)


In [178]:

# Load the previous sequence tracking file: a dictionary that maps each sequence name
# to the list of participants recorded for it. The with-block closes the file again.
with open(sequence_sampling_path) as f:
    sequence_sampling = json.load(f)

# print number of sequences that have been sampled by the previous batch
c = 0
for seq in sequence_sampling:
    if len(sequence_sampling[seq]) > 0:
        c += 1
print('Number of sequences previously sampled: ', c)


# copy every list, so that updating the new dictionary leaves the loaded one untouched
new_sequence_sampling = {seq: list(subjects) for seq, subjects in sequence_sampling.items()}
# update sequence sampling dictionary
for subj in final_data.subjID.unique():
    subj_df = final_data.loc[final_data['subjID'] == subj]
    # the sequence file name is the second '/'-separated part of sequenceName
    subj_seq = subj_df.sequenceName.unique()[0].split('/')[1]
    # IDs are stored as strings, so the duplicate check has to use the string form as well;
    # otherwise a non-string ID is appended again every time this cell is re-run
    subj_id = str(subj)
    # add subj to list for its corresponding sequence
    if subj_id not in new_sequence_sampling[subj_seq]:
        new_sequence_sampling[subj_seq].append(subj_id)

c1 = 0
for seq in new_sequence_sampling:
    if len(new_sequence_sampling[seq]) > 0:
        c1 += 1
print('Number of sequences sampled now: ', c1)

# this check only holds when every participant of the batch ran a different,
# previously unsampled sequence
print("Check that # prev sampled + batch final participant count = # sequences sampled now: ", c + final_participant_count == c1)


seq_track_path = '/Users/pmahableshwarkar/Downloads/v2_depth_duration_MTurk/sequence_tracking/'

with open(seq_track_path + new_version + "_VE_master_sequence_tracking.json", "w") as outfile:
    json.dump(new_sequence_sampling, outfile)

Number of sequences previously sampled:  59
Number of sequences sampled now:  110
Check that # prev sampled + batch final participant count = # sequences sampled now:  True


## Find sequences to replace

In [161]:
# sequences that still have no participant recorded in the updated tracking dictionary
sequences_to_replace = [
    name for name, subjects in new_sequence_sampling.items() if len(subjects) == 0
]

len(sequences_to_replace)

409

## Create new batch variables file

In [139]:
# Counterbalancing table; its 'Path' column is matched against 'jsons/<sequence name>'
# further down to find the url fragment of each sequence.
counterbalancing_path = '/Users/pmahableshwarkar/Downloads/v2_depth_duration_MTurk/counterbalancing.csv'
counterbalancing_df = pd.read_csv(counterbalancing_path)


In [140]:
# show the first sequence without a recorded participant, to cross-check with the server console log
sequences_to_replace[0]

'v1_VE250_randls_59.json'

In [141]:
# number of sequences to replace whose name contains each of the three 'VE...' tags
count_1000 = sum('VE1000' in seq_name for seq_name in sequences_to_replace)
count_250 = sum('VE250' in seq_name for seq_name in sequences_to_replace)
count_125 = sum('VE125' in seq_name for seq_name in sequences_to_replace)

count_1000, count_250, count_125

(156, 119, 156)

## Indexing Notes

The row number in the counterbalancing CSV does NOT match the URL fragment, because the CSV's row numbering also counts the header row (the `Path` column name).

The url fragment is the counterbalancing df index + 1 --> this has been validated in the console log of the experiment

To backtrack from the url fragments to the corresponding row of the counterbalancing csv: row = url_fragment + 1

In [None]:
# url fragment of a sequence = index of its first matching row in the counterbalancing df, plus 1
url_fragments = [
    counterbalancing_df.index[counterbalancing_df['Path'] == 'jsons/' + sequence][0] + 1
    for sequence in sequences_to_replace
]

In [None]:
# number of the NEXT batch; it becomes the '_b<batch>' suffix of the variables csv file name
batch = 10

In [None]:
# folder that receives the batch variables csv
dest_variables_csv = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/mturk_batch_variables/'

# experiment page; the url fragment is appended after the '#'
base_url = 'http://54.235.29.9/FacialAge/BNav_EC2/DepthDuration/v2_depth_duration_MTurk/v2_DepthDuration_HTML.html#'

# one experiment url per fragment, in a single 'experiment_url' column
variables = {
    'experiment_url': [base_url + str(fragment) for fragment in url_fragments],
}

variables_df = pd.DataFrame(variables)

In [None]:
# write the urls for the next batch, without the df index column
variables_csv_name = 'depth_duration_variables' + '_b' + str(batch) + '.csv'
variables_df.to_csv(dest_variables_csv + variables_csv_name, index=False)
