# Participant Exclusion and Replacement Pipeline

1. The counterbalancing.csv contains every json that needs to be sampled for the discrimination and verbal judgement experiment (includes all durations) 
    - The row number for each sequence corresponds to the url fragment used in the variables file uploaded to Mechanical Turk 
    - This file does not change; instead, the variables file is updated to resample sequences that get excluded
2. First, the worker IDs reported by participants need to be matched to the worker IDs recorded in the batch data 
    - All data files downloaded from the server need to be matched to a worker ID in batch data
    - Data files that do not have a matched worker ID are moved to a separate folder and are not analyzed 
3. Participant exclusion criteria are pre-registered on OSF (link) - if a participant is excluded, the counterbalanced sequence needs to be replaced in the variables file 
4. All participants who have completed a HIT need to be excluded from completing future HITs (exclude_workers.csv)

In [1]:
import os
import json 
import pandas as pd 
import numpy as np 

## Get all Worker IDs from Batch data 

In [18]:
# Directory holding the batch data files; the CSVs in it are read for their 'WorkerId' column
batch_path = '/Users/prachi/Documents/depth_duration/verbal_judgement_analysis/counterbalanced_data_collection_pipeline/batch_data'


In [19]:
# Gather the 'WorkerId' column of every batch file into one flat list
all_batch_worker_ids = []
for path in os.listdir(batch_path):
    # skip anything whose name does not mention csv
    if 'csv' not in path:
        continue
    batch_data = pd.read_csv(batch_path + '/' + path)
    batch_worker_ids = list(batch_data['WorkerId'])
    all_batch_worker_ids.extend(batch_worker_ids)


## Get all Worker IDs from Data Files 

Worker ID is saved in the name of the data file 

# THIS PART STILL NEEDS TO BE BUILT IN

# Participant Exclusion

In [36]:
def combineCSVs(datafolder):
    """
    Load participant data into one pandas df.

    Args: datafolder = path to a directory of participant CSV files
                       (all of them are concatenated) OR path to a single
                       participant CSV file

    Returns: input_frame = df holding the rows of every CSV that was read;
                           for a directory the index is reset to 0..n-1

    Raises: FileNotFoundError if datafolder is neither a file nor a directory
            ValueError if the directory contains no CSV files, or if the
            single file is not a CSV
    """
    if os.path.isdir(datafolder):
        # Match on the extension (not a substring) and sort the names so the
        # row order does not depend on os.listdir's arbitrary ordering
        csv_names = sorted(
            name for name in os.listdir(datafolder)
            if name.lower().endswith('.csv')
        )
        if not csv_names:
            raise ValueError(
                'No CSV files found in directory: {}'.format(datafolder))

        frames = [
            pd.read_csv(os.path.join(datafolder, name), index_col=None, header=0)
            for name in csv_names
        ]
        input_frame = pd.concat(frames, axis=0, ignore_index=True)

    elif os.path.isfile(datafolder):
        if not str(datafolder).lower().endswith('.csv'):
            raise ValueError('Not a CSV file: {}'.format(datafolder))
        input_frame = pd.read_csv(datafolder, index_col=None, header=0)

    else:
        # Previously this fell through and failed later with an unbound
        # input_frame; report the real problem instead
        raise FileNotFoundError(
            'Path is neither a file nor a directory: {}'.format(datafolder))

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame

def feet_to_meters(ft):
    """
    Convert a length expressed in feet to meters.

    Args: ft = value in feet

    Returns: the same length in meters (ft multiplied by 0.3048)
    """
    meters_per_foot = 0.3048
    return ft * meters_per_foot

def getUnitConveredData(datafolder):
    '''
    Load the participant data and express every depth estimate in meters.

    Rows whose 'unitSelection' is 'feet' get their 'depth_estimate' converted
    with feet_to_meters. The 'unitSelection' column itself is left untouched,
    so it still records the unit the participant originally chose.
    '''
    combined = combineCSVs(datafolder)  # one df for all participants

    # select the responses that were given in feet
    in_feet = combined['unitSelection'] == 'feet'
    if in_feet.any():
        # overwrite those estimates in place with their value in meters
        combined.loc[in_feet, 'depth_estimate'] = feet_to_meters(
            combined.loc[in_feet, 'depth_estimate']
        )

    return combined

def cleanAgeResponses(datafolder):
    '''
    Participants on MTurk must be over 18
    - If participants report they are < 18, exclude from analysis 
    
    NEED TO CHANGE HOW AGE IS COLLECTED
    - 'age' is compared as-is; a year of birth (e.g. 1990) is NOT converted
      to an age, so such responses are never flagged here

    Args: datafolder = directory of participant CSVs or a single CSV
                       (passed through to getUnitConveredData)

    Returns: input_data = unit-converted df of all responses (no rows removed)
             exclude = list of subjIDs that reported an age below 18, each
                       participant listed once, in order of first appearance
    '''
    input_data = getUnitConveredData(datafolder)

    # participants must be over 18 so age reports below 18 are junk
    # (missing ages compare False and are therefore not flagged)
    under_age = input_data['age'] < 18

    # One entry per participant rather than one per trial row, so the count
    # printed below really is a number of participants
    # CHANGE THIS TO WORKER ID LATER
    exclude = input_data.loc[under_age, 'subjID'].unique().tolist()

    print('Number of participants excluded due to age: ', len(exclude))
    return input_data, exclude

In [37]:
# Participant data to clean (absolute path on the author's machine)
data_path = '/Users/prachi/Documents/depth_duration/target_at_center/january2022_data/VE_data'

# Load the data, convert feet responses to meters, and collect the subjIDs that reported age < 18
age_cleaned_data, exclude = cleanAgeResponses(data_path)

Number of participants before cleaning:  125
Number of participants excluded due to age:  0


In [35]:
def _is_catch_stimulus(stim, catch_stimuli):
    '''Return True if stim is a path whose second '/'-separated part is in catch_stimuli.'''
    if not isinstance(stim, str):
        return False
    parts = stim.split('/')
    # a stimulus without a '/' has no second part and cannot be a catch trial
    return len(parts) > 1 and parts[1] in catch_stimuli


def catchTrial_cleaning(df, correct_requirement, catch_stimuli, sequences_dict, exclude):
    '''
    Participants complete 8 catch trials total to ensure that they are doing the task.
    If less than 7/8 catch trials are correct, the participant is excluded.
    (The threshold actually applied is whatever is passed as correct_requirement.)

    VERSION WHERE CATCH TRIALS ARE ATTENTION CHECK: IMAGE HAS NO TARGET
    - a catch trial counts as correct when its 'depth_estimate' equals 0

    Args: df = trial-level data with 'subjID', 'sequenceName', 'stimulus'
               and 'depth_estimate' columns
          correct_requirement = minimum number of correct catch trials a
                                participant needs in order to be kept
          catch_stimuli = collection of catch stimulus names, compared against
                          the second '/'-separated part of 'stimulus'
          sequences_dict = currently unused
          exclude = currently unused

    Returns: df2 = rows of the participants that passed, with the catch
                   trials themselves removed; rows are grouped by participant
                   in order of first appearance. Empty (same columns as df)
                   when no participant passes.
    '''
    remove = []
    kept_frames = []
    # NOTE: these two tallies are not returned yet; they are kept for the
    # sequence-replacement step and no longer assume one hard-coded sequence
    subj_sequence = {}
    sequence_count = {}

    for subj in df['subjID'].unique():
        subj_df = df.loc[df['subjID'] == subj]
        sequence = subj_df['sequenceName'].unique()[0]
        subj_sequence[subj] = sequence

        # boolean mask of this participant's catch trials
        is_catch = subj_df['stimulus'].map(
            lambda stim: _is_catch_stimulus(stim, catch_stimuli)
        ).astype(bool)
        count_correct = int((is_catch & (subj_df['depth_estimate'] == 0)).sum())

        if count_correct < correct_requirement:
            remove.append(subj)
            continue

        sequence_count[sequence] = sequence_count.get(sequence, 0) + 1
        # Filter with the mask instead of dropping by index label, so repeated
        # index labels can never remove another row by accident
        kept_frames.append(subj_df.loc[~is_catch])

    print("Number of participants that did not pass the catch trial check:", len(remove))
    print("Participants that were removed:",remove)

    if not kept_frames:
        # nobody passed (or df had no rows): return an empty frame, same columns
        return df.iloc[0:0].copy()

    df2 = pd.concat(kept_frames)
    return df2

0