# MX Familiar Size Participant Exclusion

1. The counterbalancing.csv contains every json that needs to be sampled for the familiar size experiment 
    - The row number for each sequence corresponds to the url fragment used in the variables file uploaded to the Heroku server (see "Indexing Notes" below for the exact offset)
    - This file does not change - only the variables file is updated to resample sequences that get excluded
2. Participant exclusion criteria are pre-registered on OSF (https://osf.io/u79qe) - if a participant is excluded, their counterbalanced sequence needs to be replaced in the variables file 
3. Keep a log of participant IDs that complete the experiment 

In [1]:
import os 
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import scipy 
import scipy.stats as stats
from scipy import stats
from statsmodels.stats.anova import AnovaRM
import copy
import datetime
import json
import random

In [2]:
num_trials = 54

In [3]:
def combineCSVs(datafolder):
    """
    Load participant data from a folder of CSVs or from a single CSV file.

    Args:
        datafolder = path to a folder of per-participant CSV files,
                     or path to one CSV file

    Returns:
        df of all participant data (folder): only files whose experimentName
        column holds the single value 'final-intermixed-textured' are kept,
        and the combined frame gets a fresh 0..n-1 index
        OR
        df for a single participant (file)

    Raises:
        ValueError if the path is neither a folder nor a .csv file, or if the
        folder holds no CSV from the expected experiment
    """
    experiment_name = 'final-intermixed-textured'
    datafolder = str(datafolder)

    if os.path.isdir(datafolder):
        data = []
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the combined frame reproducible between runs
        for filename in sorted(os.listdir(datafolder)):
            # match on the extension, not on 'csv' appearing anywhere in the name
            if not filename.lower().endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(datafolder, filename), index_col=None, header=0)
            # keep the file only when every row belongs to the expected experiment
            names = df['experimentName'].unique()
            if len(names) == 1 and names[0] == experiment_name:
                data.append(df)

        if not data:
            raise ValueError("No CSV files from experiment '" + experiment_name
                             + "' found in: " + datafolder)
        input_frame = pd.concat(data, axis=0, ignore_index=True)

    elif os.path.isfile(datafolder) and datafolder.lower().endswith('.csv'):
        input_frame = pd.read_csv(datafolder, index_col=None, header=0)

    else:
        raise ValueError("Expected a folder of CSV files or a single .csv file, got: "
                         + datafolder)

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame


def feet_to_meters(ft):
    """
    Convert a length given in feet to meters.

    Args: 
        ft = length in feet 
        
    returns:
        the same length expressed in meters (1 ft = 0.3048 m)
    """
    meters_per_foot = 0.3048
    return ft * meters_per_foot

def getUnitConveredData(datafolder):
    '''
    Load the participant data and express every depth estimate in meters.

    Args: 
        datafolder = path to data (passed straight to combineCSVs)
        
    returns:
        df in which rows answered in feet (unitSelection == 'feet') have had
        depth_estimate converted to meters; all other rows are untouched
    '''
    frame = combineCSVs(datafolder)  # one frame holding every participant's trials

    # labels of the rows whose estimate was reported in feet
    reported_in_feet = frame.index[frame['unitSelection'] == 'feet']

    for label in reported_in_feet:
        # overwrite the estimate in place with its metric equivalent
        frame.at[label, 'depth_estimate'] = feet_to_meters(frame.at[label, 'depth_estimate'])

    return frame

def cleanAgeResponses(datafolder, min_birth_year=1900, reference_year=None):
    '''
    Load the unit-converted data and turn reported birth years into ages.

    Some participants typed their year of birth into the age field. Any
    'age' value greater than min_birth_year is treated as a birth year and
    replaced by reference_year - value.

    Args: 
        datafolder = path to data  
        min_birth_year = values above this are treated as a year of birth
                         (default 1900, so that birth years before 2000 are
                         converted as well)
        reference_year = year the ages are computed against; defaults to the
                         current calendar year. Pass the year of data
                         collection to get ages that do not change depending
                         on when the analysis is run.
        
    returns:
        df with cleaned reported age 
    '''
    input_data = getUnitConveredData(datafolder)

    # resolve the reference year once instead of on every row
    if reference_year is None:
        reference_year = datetime.date.today().year

    for idx, row in input_data.iterrows():
        age = row['age']
        # if year of birth was given, convert to age
        if age > min_birth_year:
            actual_age = reference_year - age
            # update age in existing dataframe
            input_data.at[idx, 'age'] = actual_age
            print(row['subjID'])
            print(actual_age, age)

    return input_data
 

def catchTrial_cleaning(path, correct_requirement, sequence_count):
    '''
    Score each participant's catch trials, strip the catch trials from the
    data, and exclude participants who got too few of them right.

    A catch trial is any row whose stimulus path starts with 'catch_stimuli'.
    It counts as correct when depth_estimate equals 0.

    Args:
        path = path to data (passed to cleanAgeResponses)
        correct_requirement = minimum number of correct catch trials a
                              participant needs in order to be kept
        sequence_count = dict keyed by sequenceName; the entry of every
                         participant that passes the check is incremented
                         in place

    returns:
        df without catch trials and without the excluded participants
    '''
    all_trials = cleanAgeResponses(path)

    failed = []
    per_participant = []

    for participant in all_trials.subjID.unique():
        trials = all_trials.loc[all_trials['subjID'] == participant]
        # a participant's sequence is taken from their first sequenceName value
        sequence = trials.sequenceName.unique()[0]

        # catch trials live under the 'catch_stimuli' folder of the stimulus path
        is_catch = trials['stimulus'].map(lambda stim: stim.split('/')[0] == 'catch_stimuli')
        # attention check: the catch image has no target, so the expected answer is 0
        n_correct = int((is_catch & (trials['depth_estimate'] == 0)).sum())

        if n_correct >= correct_requirement:
            sequence_count[sequence] += 1
        else:
            failed.append(participant)
            print(n_correct)

        # keep an independent copy of the participant's non-catch trials
        per_participant.append(trials.loc[~is_catch].copy(deep=True))

    cleaned = pd.concat(per_participant)
    print("Number of participants that did not pass the catch trial check:", len(failed))
    print("Participants that were removed:",failed)

    for participant in failed:
        stale_rows = cleaned.index[cleaned['subjID'] == participant]
        cleaned.drop(index=stale_rows, inplace=True)

    return cleaned
    

def removeMissedTrials(input_data, num_trials):
    """
    Drop missed trials and the participants who missed too many of them.

    Participants were told to respond '0' when they missed a trial, so every
    row with depth_estimate == 0 is treated as a missed trial and removed.
    While doing so the function tallies missed trials per participant, per
    duration and per sequence (only the per-participant tally is used).

    A participant is removed entirely when their number of missed trials is
    at least floor(num_trials * 0.1).

    Args:
        input_data = df of trials; it is modified IN PLACE
        num_trials = number of trials each participant was expected to complete

    Returns:
        the same df object, after the removals
    """
    def bump(tally, key):
        # add one to tally[key], starting from zero for unseen keys
        tally[key] = tally.get(key, 0) + 1

    missed_by_participant = {}
    missed_by_duration = {}
    missed_by_sequence = {}

    missed = input_data.loc[input_data['depth_estimate'] == 0.0]
    for _, trial in missed.iterrows():
        bump(missed_by_participant, trial['subjID'])
        bump(missed_by_duration, trial['duration'])
        bump(missed_by_sequence, trial['sequenceName'])

    # remove trials with depth estimate = 0
    input_data.drop(index=missed.index, inplace=True)

    # participants at or above the cutoff lose all of their remaining trials
    cutoff = math.floor(num_trials * 0.1)
    flagged = [subj for subj, n_missed in missed_by_participant.items() if n_missed >= cutoff]
    print("Number of participants with 10% or more missed trials: ", len(flagged))

    for subj in flagged:
        stale_rows = input_data.index[input_data['subjID'] == subj]
        input_data.drop(index=stale_rows, inplace=True)

    # Participants without any missed trial never appear in the tallies.
    return input_data


In [4]:
path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/data_125'

In [5]:
# folder that holds the sequence json files
sequences_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/jsons'

# one counter per 'VE125' sequence file, keyed as 'jsons/<filename>', starting at 0;
# catchTrial_cleaning increments the counter of each participant that passes
sequences_count_dict = {}
for seq in os.listdir(sequences_path):
    if 'VE125' in seq:
        sequences_count_dict['jsons/'+seq] = 0


In [6]:
catch_trial_cleaned_data = catchTrial_cleaning(path, 4, sequences_count_dict)

Number of participants before cleaning:  733
1
3
2
0
0
0
1
2
3
0
2
0
0
3
2
0
3
1
0
1
0
0
2
3
1
1
2
1
3
0
1
3
2
2
3
1
0
0
2
1
3
2
1
3
3
0
0
0
3
3
0
3
0
2
1
1
3
3
1
0
1
1
3
3
2
3
3
3
2
0
2
3
3
3
0
1
3
2
0
3
2
0
0
2
3
3
1
3
2
2
2
0
3
3
2
3
1
2
1
0
2
1
0
3
2
3
Number of participants that did not pass the catch trial check: 106
Participants that were removed: [455899, 313957, 891218, 669007, 434211, 426912, 150061, 476720, 983302, 781281, 407894, 441932, 628935, 287953, 127653, 990130, 180312, 385787, 107172, 189455, 746171, 853823, 231949, 412500, 286799, 149874, 989279, 474890, 577026, 655411, 967606, 401339, 275128, 985512, 494791, 146352, 296801, 301416, 882924, 354107, 435676, 177475, 540574, 966193, 408375, 568594, 226873, 266280, 586958, 913805, 876498, 968474, 729588, 500033, 359091, 797955, 348720, 702231, 747017, 822880, 182937, 764914, 300757, 737362, 896209, 691000, 527716, 678104, 724538, 703554, 397914, 529019, 636507, 892017, 155073, 912132, 339174, 955174, 327643, 857330, 51

In [7]:
# add the image name as a column in the df 
catch_trial_cleaned_data['imageName'] = catch_trial_cleaned_data.apply(lambda row: row.stimulus.split('/')[1].split('_')[0], axis = 1)


In [8]:
missed_trial_cleaned_data = removeMissedTrials(catch_trial_cleaned_data, num_trials)

Number of participants with 10% or more missed trials:  138


In [9]:
def RT_Cleaning(df, outlier_range, num_trials):
    """
    Remove trials with out-of-range response times, then remove participants
    who had too many such trials.

    Args:
        df = df of trials with 'subjID' and 'trial_RT' columns (not modified)
        outlier_range = [min_RT, max_RT]; trials with trial_RT strictly below
                        min_RT or strictly above max_RT are dropped
        num_trials = number of trials each participant was expected to
                     complete; a participant with at least
                     floor(num_trials * 0.1) dropped trials is removed

    Returns:
        new df holding the remaining trials of the remaining participants
    """
    fastest_allowed = outlier_range[0]
    slowest_allowed = outlier_range[1]
    cutoff = math.floor(num_trials * 0.1)

    flagged = []
    kept_frames = []
    for participant in df.subjID.unique():
        trials = df.loc[df['subjID'] == participant]
        rts = trials["trial_RT"]

        # strict comparisons: values exactly on a bound are kept
        out_of_range = (rts < fastest_allowed) | (rts > slowest_allowed)
        if int(out_of_range.sum()) >= cutoff:
            flagged.append(participant)

        # independent copy of the in-range trials
        kept_frames.append(trials.loc[~out_of_range].copy(deep=True))

    result = pd.concat(kept_frames)

    print("Number of Participants with 10% or more trials outside their RT range: ", len(flagged))

    for participant in flagged:
        stale_rows = result.index[result['subjID'] == participant]
        result.drop(index=stale_rows, inplace=True)
        print(participant)

    return result

In [10]:
RT_cleaned_data = RT_Cleaning(missed_trial_cleaned_data, [250, 10000], num_trials)

Number of Participants with 10% or more trials outside their RT range:  22
980451
360003
732060
793348
661457
113884
259208
930635
929565
189929
226161
827902
710832
489331
243355
982221
732544
744970
662595
567758
373801
951415


In [11]:
def repeatResponses_Cleaning(df, min_unique_responses):
    """
    Remove participants who gave 'junk data' - the same number repeated for
    many trials.

    Two checks are applied per participant:
    (1) the number of distinct depth estimates must be at least
        min_unique_responses
    (2) the count of their most frequent estimate ("max repeats") must lie
        within 3 standard deviations of the mean max-repeats across all
        participants in df

    Args:
        df = df of trials with 'subjID' and 'depth_estimate' columns;
             it is modified IN PLACE
        min_unique_responses = minimum number of distinct estimates required

    Returns:
        (df, max_repeats_distribution, num_unique_responses_distribution)
        df is the same object after the removals; the two lists hold one
        entry per participant that was in df before any removal, in order
        of first appearance
    """
    #List unique values in the df['subjID'] column
    all_subjIDs = df.subjID.unique()

    remove = []
    max_repeats_distribution = []
    num_unique_responses_distribution = []
    for subj in all_subjIDs:
        # frequency of each distinct estimate entered by this participant
        count_depth_estimates = df.loc[df['subjID'] == subj, 'depth_estimate'].value_counts()
        num_unique_responses = len(count_depth_estimates)
        num_unique_responses_distribution.append(num_unique_responses)
        max_repeats_distribution.append(count_depth_estimates.max())
        if num_unique_responses < min_unique_responses:
            remove.append(subj)
    # report the threshold that was actually applied
    print('Number of participants with less than', min_unique_responses,
          'unique responses:', len(remove))

    # the outlier bounds are the same for every participant, so compute them once
    repeats = np.array(max_repeats_distribution)
    avg_max_repeats = repeats.mean()
    std_max_repeats = repeats.std()
    outlierrange = [avg_max_repeats - (3*std_max_repeats), avg_max_repeats + (3*std_max_repeats)]

    already_flagged = set(remove)
    # reuse the per-participant max repeats gathered in the first pass
    for subj, max_repeats in zip(all_subjIDs, max_repeats_distribution):
        is_outlier = max_repeats < outlierrange[0] or max_repeats > outlierrange[1]
        if is_outlier and subj not in already_flagged:
            remove.append(subj)
            already_flagged.add(subj)

    print("Number of total participants removed: repeat responses: ", len(remove))

    for subj in remove:
        df.drop(df[df['subjID'] == subj].index, inplace = True) 

    return df, max_repeats_distribution, num_unique_responses_distribution



In [12]:
repeat_resp_cleaned_data, max_repeats_distrib, num_unique_distrib = repeatResponses_Cleaning(RT_cleaned_data, 6)

Number of participants with less than 6 unique responses: 108
Number of total participants removed: repeat responses:  109


In [13]:
def finalTrialCountCheck(df, num_trials):
    """
    Remove participants who have too few trials left after cleaning.

    A participant is removed when their remaining trial count is less than
    or equal to num_trials - floor(num_trials * 0.1).

    Args:
        df = df of cleaned trials with a 'subjID' column; modified IN PLACE
        num_trials = number of trials each participant was expected to complete

    Returns:
        the same df object, after the removals
    """
    # a participant has to keep MORE than this many trials to stay in the data
    max_insufficient = num_trials - math.floor(num_trials * 0.1)

    subj_column = df['subjID']
    flagged = [
        subj
        for subj in subj_column.unique()
        if len(df.loc[subj_column == subj].index) <= max_insufficient
    ]

    print("Number of Participants with >= 10% trials removed: ", len(flagged))

    for subj in flagged:
        stale_rows = df.index[df['subjID'] == subj]
        df.drop(index=stale_rows, inplace=True)

    print("Number of participants left: ",len(df.subjID.unique()))
    return df

In [14]:
cleaned_data = finalTrialCountCheck(repeat_resp_cleaned_data, num_trials)

Number of Participants with >= 10% trials removed:  34
Number of participants left:  324


In [403]:
# for subj in cleaned_data.subjID.unique():
#     subj_df = cleaned_data.loc[cleaned_data['subjID']==subj]
#     print(subj_df.sequenceName.unique())

In [404]:
final_data = cleaned_data.copy(deep=True)

In [405]:
len(final_data.sequenceName.unique())

324

### Create sequence sampling file: 125 ms ONLY

In [300]:
# folder that holds the sequence json files
jsons_dir = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/jsons'

# folder the sequence tracking files are written to
file_dest = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/sequence_tracking/'

# initial (v0) tracking record: every 'VE125' sequence file mapped to an empty list
sequence_sampling_dict = {}
for seq_name in os.listdir(jsons_dir):
    if 'VE125' in seq_name:
        sequence_sampling_dict[seq_name] = []

# Convert and write JSON object to file
# NOTE: opening with "w" overwrites any existing v0 tracking file at this location
v0_filename = 'v0_MX_125_master_sequence_tracking.json'
with open(file_dest + v0_filename, "w") as outfile: 
    json.dump(sequence_sampling_dict, outfile)

In [301]:
print(len(sequence_sampling_dict.keys()))

324


## Update Sequence Sampling 


In [325]:
# set the version for the sequence tracking 

prev_version = 'v10'
new_version = 'v11'

# select path for the last previous sequence tracking file 

sequence_sampling_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/sequence_tracking/'+ prev_version + '_MX_125_master_sequence_tracking.json'
print(sequence_sampling_path)

# Load the previous tracking file as a dictionary; the with-block closes
# the file handle as soon as the JSON has been read
with open(sequence_sampling_path) as f:
    sequence_sampling = json.load(f)

# sequence_sampling

/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/sequence_tracking/v10_MX_125_master_sequence_tracking.json


In [326]:
# print number of sequences that have been sampled by the previous batch
prev_sampled_count = 0
for seq in sequence_sampling:
    if len(sequence_sampling[seq]) > 0:
        prev_sampled_count += 1
print('Number of sequences previously sampled: ', prev_sampled_count)       

# Work on an independent copy: appending to an alias would also change
# sequence_sampling, so re-running this cell would report the updated
# record as the "previous" one.
new_sequence_sampling = copy.deepcopy(sequence_sampling)
# update sequence sampling dictionary
for subj in final_data.subjID.unique():
    subj_df = final_data.loc[final_data['subjID'] == subj]
    # sequenceName looks like 'jsons/<file>'; the tracking keys are the bare file names
    subj_seq = subj_df.sequenceName.unique()[0].split('/')[1]
    # add subj to list for its corresponding sequence
    new_sequence_sampling[subj_seq].append(str(subj))

    
sampled_count = 0
unsampled_count = 0
for seq in new_sequence_sampling:
    if len(new_sequence_sampling[seq]) > 0:
        # remove duplicates of the same id; sorting keeps the written file
        # identical between runs (set iteration order is arbitrary)
        new_sequence_sampling[seq] = sorted(set(new_sequence_sampling[seq]), key=str)
        sampled_count += 1
    else:
        unsampled_count += 1
        new_sequence_sampling[seq] = []
        
print('Number of sequences sampled now: ', sampled_count, '/', len(final_data.sequenceName.unique()))

print('Number of sequences to be sampled: ', unsampled_count)

print('Number sampled + to be sampled = 324: ', sampled_count + unsampled_count==324)


seq_track_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/sequence_tracking/'

# write the updated record under the new version name
with open(seq_track_path + new_version + "_MX_125_master_sequence_tracking.json", "w") as outfile:
    json.dump(new_sequence_sampling, outfile)

Number of sequences previously sampled:  323


Number of sequences sampled now:  324 / 323
Number of sequences to be sampled:  0
Number sampled + to be sampled = 324:  True


## Find sequences to replace

In [327]:
# sequences whose list of participant IDs in the tracking record is still empty
sequences_to_replace = []

for seq_key in new_sequence_sampling:
    if len(new_sequence_sampling[seq_key]) == 0:
        sequences_to_replace.append(seq_key)

# sanity check: the count should agree with unsampled_count from the update step
len(sequences_to_replace), unsampled_count == len(sequences_to_replace)

(0, True)

In [406]:
# Rebuild the sequence -> participants mapping from final_data alone
# (independent of the tracking file) and list the sequences nobody covers.
sequences_to_replace = []
seq_sampling_dict = {}
for seq_name in os.listdir(jsons_dir):
    if 'VE125' in seq_name:
        seq_sampling_dict[seq_name] = []

for subj in final_data.subjID.unique():
    subj_seq = final_data.loc[final_data['subjID']==subj].sequenceName.unique()[0]
    # sequenceName looks like 'jsons/<file>'; the dict keys are the bare file names
    key = subj_seq.split('/')[1]
    seq_sampling_dict[key].append(subj)

# a sequence with no participant in final_data has to be sampled again
for key in seq_sampling_dict:
    if len(seq_sampling_dict[key]) < 1:
        sequences_to_replace.append(key)
        print(key)

## Create new batch variables file

In [383]:
counterbalancing_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/counterbalancing.csv'
counterbalancing_df = pd.read_csv(counterbalancing_path)
counterbalancing_df

Unnamed: 0,Path,Sampled
0,jsons/MX_seq41_1_VE125_flipped.json,0
1,jsons/MX_seq53_2_VE125_flipped.json,0
2,jsons/MX_seq24_0_VE125.json,0
3,jsons/MX_seq38_1_VE125_flipped.json,0
4,jsons/MX_seq19_2_VE125.json,0
...,...,...
319,jsons/MX_seq45_0_VE125.json,0
320,jsons/MX_seq11_2_VE125_flipped.json,0
321,jsons/MX_seq17_0_VE125.json,0
322,jsons/MX_seq42_0_VE125.json,0


## Indexing Notes

The row number in the counterbalancing csv does NOT match the url fragment, because the csv's row numbering also counts the path (header) row.

The url fragment is the counterbalancing df index + 1 --> this has been validated in the console log of the experiment

To backtrack from a url fragment to the corresponding row of the counterbalancing csv: row = url_fragment + 1 (equivalently: counterbalancing df index = url_fragment - 1)

In [384]:
# url fragment of every sequence that has to be sampled again
url_fragments = []
for sequence in sequences_to_replace:
    # the Path column of the counterbalancing csv stores 'jsons/<file>'
    seq_p = 'jsons/' + sequence
    # fragment = df index of the first matching row + 1
    # (indexing [0] raises IndexError when the sequence is not in the csv)
    url_fragments.append(counterbalancing_df.index[counterbalancing_df['Path']==seq_p][0] + 1)

In [385]:
# number for the NEXT batch 
 
batch = 15

In [386]:
# folder the per-batch variables files are written to
dest_variables_csv = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/batch_variables/125/'

# experiment page; the url fragment appended after '#' selects the sequence
base_url = 'http://54.235.29.9/FacialAge/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/MX_fS_HTML.html#'

# one row per sequence to resample: its experiment url, marked 'unsampled'
variables = {'experiment_url': [], 'sampled': []}

for fragment in url_fragments:
     variables['experiment_url'].append(base_url + str(fragment))
     variables['sampled'].append('unsampled')



variables_df = pd.DataFrame(variables)

In [387]:
variables_df.to_csv(dest_variables_csv + 'MX_125_variables' + '_B' + str(batch) + '.csv', index=False)


In [388]:
server_dest = '/Users/prachimahableshwarkar/Documents/GW/spatial_perception/app/'

variables_df.to_csv(server_dest + 'variables.csv', index=False)
