# Participant Exclusion based on Outlier Checks

In [143]:
import numpy as np
import os
import pandas as pd
import math
import matplotlib.pyplot as plt
from scipy.stats import sem
import json
import csv
import random

### Exclusion Criteria
 
 Pre-registered on OSF: https://osf.io/fuhpm

In [144]:
def combineCSVs(datafolder):
    """
    Combine all participant data into one pandas df
    OR
    Create df for single participant file

    Parameters
    ----------
    datafolder : str
        Path to a directory of participant CSV files, or to a single CSV file.

    Returns
    -------
    pandas.DataFrame
        Directory input: the row-wise concatenation (fresh 0..n-1 index) of
        every CSV whose rows all have experimentName 'Gabor-Discrimination'
        and versionName 'v2'. File input: the CSV as read, without filtering.

    Raises
    ------
    FileNotFoundError
        If datafolder is neither an existing directory nor an existing file.
    ValueError
        If a file path does not end in '.csv', or a directory contains no
        CSV that passes the experiment/version filter.
    """
    # subjIDs to leave out when combining a directory (none at the moment)
    exclude = []

    if os.path.isdir(datafolder):
        data = []
        # sorted() keeps the row order independent of the filesystem's listing order
        for filename in sorted(os.listdir(datafolder)):
            # match on the extension; a substring test would also accept e.g. 'csv_notes.txt'
            if not filename.lower().endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(datafolder, filename), index_col=None, header=0)
            if df.empty:
                continue
            # compare every row: `unique() == value` is an array and is ambiguous
            # inside `if` as soon as a file holds more than one value
            if not (df['experimentName'] == 'Gabor-Discrimination').all():
                continue
            if not (df['versionName'] == 'v2').all():
                continue
            subjID = df.subjID.unique()[0]
            if subjID not in exclude:
                data.append(df)

        if not data:
            raise ValueError(
                "No 'Gabor-Discrimination' v2 participant CSVs found in " + str(datafolder))
        input_frame = pd.concat(data, axis=0, ignore_index=True)

    elif os.path.isfile(datafolder):
        if not datafolder.lower().endswith('.csv'):
            raise ValueError("Expected a .csv file, got " + str(datafolder))
        input_frame = pd.read_csv(datafolder, index_col=None, header=0)

    else:
        raise FileNotFoundError("No such file or directory: " + str(datafolder))

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame

In [145]:

# Location of the participant CSV files; passed to combineCSVs in the next cell.
# NOTE(review): absolute, machine-specific path -- has to be edited to run this notebook elsewhere.
data_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_Exp2_MTurk/data'

In [146]:
# Build the combined participant DataFrame from data_path (see combineCSVs above).
input_data = combineCSVs(data_path)

Number of participants before cleaning:  294


In [147]:
# Passed as num_trials to the cleaning functions below; Accuracy_Cleaning divides
# each participant's summed accuracy by this value.
num_trials = 12

In [148]:
def cleanbyPracticeTries(df, num_allowed_tries):
    """
    Remove participants who needed more than num_allowed_tries practice tries.

    A participant's practice-try count is the first value of their
    'pracTries' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial-level data with 'subjID' and 'pracTries' columns.
    num_allowed_tries : number
        Largest practice-try count a participant may have and still be kept.

    Returns
    -------
    df2 : pandas.DataFrame
        Rows of the kept participants, grouped per participant in order of
        first appearance. Empty (same columns) when nobody is kept.
    prac_too_many_dist : list
        Practice-try counts of the removed participants.
    """
    kept_frames = []
    prac_too_many_dist = []

    # sort=False keeps participants in order of first appearance
    for _, subj_df in df.groupby('subjID', sort=False):
        subj_num_practice_tries = subj_df['pracTries'].iloc[0]

        if subj_num_practice_tries > num_allowed_tries:
            prac_too_many_dist.append(subj_num_practice_tries)
        else:
            kept_frames.append(subj_df.copy(deep=True))

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if kept_frames:
        df2 = pd.concat(kept_frames)
    else:
        df2 = df.iloc[0:0].copy()

    print('Number of participants with more than ' + str(num_allowed_tries) + ' practice tries:',
          len(prac_too_many_dist))

    return df2, prac_too_many_dist
    

In [149]:
# Exclude participants with more than 3 practice tries; prac_dist receives the
# practice-try counts of the excluded participants.
pracTries_cleaned_data, prac_dist = cleanbyPracticeTries(input_data, 3)

Number of participants with more than 3 practice tries: 18


In [150]:
# Number of participants remaining after the practice-tries exclusion.
len(pracTries_cleaned_data.subjID.unique())

276

In [151]:
def Accuracy_Cleaning(df, accuracy_threshold, num_trials):
    """
    Remove participants with overall accuracy below the accuracy threshold (e.g. 0.7)

    A participant's accuracy is the sum of their 'accuracy' column divided by
    num_trials.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial-level data with 'subjID', 'accuracy' and 'keyPress' columns.
    accuracy_threshold : float
        Participants whose accuracy is strictly below this value are removed.
    num_trials : int
        Denominator used to turn the summed accuracy into a proportion.

    Returns
    -------
    df2 : pandas.DataFrame
        Data of participants that passed the accuracy threshold, grouped per
        participant in order of first appearance.
    list_trials_correct : list of int
        Number of trials with accuracy == 1 for every participant, including
        the removed ones.
    keypresses : list
        'keyPress' values of the not-correct trials of the LAST participant
        processed (empty list when df has no participants).
    """
    # a removed participant with at least this many 'none' keypresses among
    # their not-correct trials is counted as a non-responder
    min_none_responses = 4

    remove = []
    kept_frames = []
    list_trials_correct = []
    # number of removed participants that are non-responders
    c = 0
    keypresses = []

    # sort=False keeps participants in order of first appearance
    for subj, subj_df in df.groupby('subjID', sort=False):
        is_correct = (subj_df['accuracy'] == 1).to_numpy()
        keypresses = subj_df.loc[~is_correct, 'keyPress'].tolist()
        list_trials_correct.append(int(is_correct.sum()))

        subj_acc = np.sum(subj_df['accuracy'].to_numpy()) / num_trials

        if subj_acc < accuracy_threshold:
            remove.append(subj)
            if keypresses.count('none') >= min_none_responses:
                c += 1
        else:
            # Keep whole participants by group rather than dropping rows by index
            # label afterwards: label-based drops also delete rows of other
            # participants whenever index labels repeat.
            kept_frames.append(subj_df.copy(deep=True))

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if kept_frames:
        df2 = pd.concat(kept_frames)
    else:
        df2 = df.iloc[0:0].copy()

    # report the threshold actually used (prints "70%" for 0.7)
    print("Number of Participants with accuracy below {:.0%}: ".format(accuracy_threshold), len(remove))

    print('Number of participants that did not respond for 4 or more trials:', c)

    print('Number of participants left: ', len(df2.subjID.unique()))

    return df2, list_trials_correct, keypresses

In [152]:
# Exclude participants whose accuracy (summed accuracy / num_trials) is below 0.7.
Accuracy_cleaned_data, correct_trials_distribution, keypresses = Accuracy_Cleaning(pracTries_cleaned_data, 0.7, num_trials)


Number of Participants with accuracy below 70%:  35
Number of participants that did not respond for 4 or more trials: 0
Number of participants left:  241


In [153]:
def RT_Cleaning(df, outlier_range, num_trials):
    """
    Remove trials where trial RT is outside of the defined outlier range

    Trials with RT < outlier_range[0] or RT > outlier_range[1] are removed;
    RTs equal to either bound are kept. Prints the participant count before
    and after cleaning (a participant disappears when all of their trials
    are removed).

    Parameters
    ----------
    df : pandas.DataFrame
        Trial-level data with 'subjID' and 'RT' columns.
    outlier_range : sequence of two numbers
        [lowest allowed RT, highest allowed RT].
    num_trials : int
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    df2 : pandas.DataFrame
        Dataframe with outlier RT trials removed, grouped per participant in
        order of first appearance.
    list_trialRT : list
        All RTs (outliers included), in the same per-participant order.
    """
    print(len(df.subjID.unique()))

    lowest_rt, highest_rt = outlier_range[0], outlier_range[1]

    kept_frames = []
    list_trialRT = []
    # sort=False keeps participants in order of first appearance
    for _, subj_df in df.groupby('subjID', sort=False):
        subj_rts = subj_df['RT']
        list_trialRT.extend(subj_rts.tolist())

        is_outlier = ((subj_rts < lowest_rt) | (subj_rts > highest_rt)).to_numpy()
        # Positional boolean mask instead of drop-by-label: dropping a label
        # removes every row that shares it, not only the outlier trial.
        kept_frames.append(subj_df.loc[~is_outlier].copy())

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if kept_frames:
        df2 = pd.concat(kept_frames)
    else:
        df2 = df.iloc[0:0].copy()
    print(len(df2.subjID.unique()))

    return df2, list_trialRT

In [154]:
# Drop trials with RT below 250 or above 5000.
# NOTE(review): RT units are not shown in this notebook -- presumably milliseconds; confirm.
RT_cleaned_data, trialRTs_distribution = RT_Cleaning(Accuracy_cleaned_data, [250, 5000], num_trials)


241
240


In [155]:
def finalTrialCountCheck(df, num_trials, min_trials):
    """
    Remove participants that have fewer than min_trials trials left.

    The input frame is not modified; a filtered copy is returned, so the cell
    calling this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial-level data with a 'subjID' column.
    num_trials : int
        Not used by this function; kept so existing calls keep working.
    min_trials : int
        Minimum number of rows a participant needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows of the participants with at least min_trials trials, in their
        original order.
    """
    trial_counts = df['subjID'].value_counts(sort=False)
    remove = list(trial_counts[trial_counts < min_trials].index)

    print("Number of Participants with >= 10% trials removed: ", len(remove))

    # Filter by participant id rather than dropping by index label: label-based
    # drops also delete rows of other participants whenever labels repeat.
    is_removed = df['subjID'].isin(remove).to_numpy()
    kept = df.loc[~is_removed].copy()

    print("Number of participants left: ", len(kept.subjID.unique()))
    return kept

In [156]:
# Remove participants left with fewer than 11 trials after RT cleaning.
finalTrialCount_data = finalTrialCountCheck(RT_cleaned_data, num_trials, 11)

Number of Participants with >= 10% trials removed:  18
Number of participants left:  222


In [157]:
# Independent deep copy that the remaining cells use as the final dataset.
raw_final_data = finalTrialCount_data.copy(deep=True)
# Number of participants in the final dataset.
len(raw_final_data.subjID.unique())

222

In [158]:
# Print, for each sequence file, how many distinct participants completed it.
sequence_column = raw_final_data['sequenceName']
for sequence_name in sequence_column.unique():
    in_sequence = sequence_column == sequence_name
    n_participants = len(raw_final_data.loc[in_sequence, 'subjID'].unique())
    print(sequence_name, n_participants)

jsons/s3_even.json 8
jsons/s0_odd.json 10
jsons/s8_even.json 10
jsons/s1_even.json 10
jsons/s4_even.json 8
jsons/s8_odd.json 9
jsons/s5_odd.json 8
jsons/s9_odd.json 11
jsons/s11_even.json 8
jsons/s4_odd.json 11
jsons/s2_odd.json 10
jsons/s1_odd.json 9
jsons/s2_even.json 8
jsons/s3_odd.json 8
jsons/s9_even.json 10
jsons/s11_odd.json 11
jsons/s10_odd.json 9
jsons/s5_even.json 9
jsons/s6_odd.json 10
jsons/s10_even.json 8
jsons/s7_even.json 11
jsons/s6_even.json 9
jsons/s7_odd.json 10
jsons/s0_even.json 8


## Create Sequence Tracking File

In [89]:
# NOTE(review): absolute, machine-specific paths -- edit before running elsewhere.
jsons_dir = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_Exp2_MTurk/jsons'

file_dest = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_Exp2_MTurk/sequence_tracking/'

# Every sequence file found in jsons_dir starts with an empty list of participant ids.
sequence_sampling_dict = {
    seq_name: []
    for seq_name in os.listdir(jsons_dir)
    if '.json' in seq_name
}

# Write the empty tracking dictionary as the initial (b0) tracking file.
v0_filename = 'b0_GD_master_sequence_tracking.json'
v0_path = file_dest + v0_filename
with open(v0_path, "w") as outfile:
    json.dump(sequence_sampling_dict, outfile)

print(len(sequence_sampling_dict))

24


In [139]:
# set the version for the sequence tracking 

prev_version = 'b4'
new_version = 'b5'

# select path for the last previous sequence tracking file 

sequence_sampling_path = file_dest + prev_version + '_GD_master_sequence_tracking.json'
print(sequence_sampling_path)

# Load the tracking JSON as a dictionary (sequence file name -> list of participant ids).
# The with-block closes the file handle once it has been read.
with open(sequence_sampling_path) as f:
    sequence_sampling = json.load(f)

sequence_sampling

/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_Exp2_MTurk/sequence_tracking/b4_GD_master_sequence_tracking.json


{'s5_even.json': ['738830',
  '287865',
  '287219',
  '268532',
  '393692',
  '959033',
  '887273',
  '528857',
  '697760'],
 's8_even.json': ['589443',
  '840050',
  '676893',
  '102824',
  '659411',
  '563726',
  '232153',
  '247187'],
 's0_odd.json': ['215484',
  '830815',
  '775834',
  '447827',
  '706514',
  '308283',
  '891679',
  '830827',
  '569628',
  '896945'],
 's1_odd.json': ['209897',
  '235416',
  '533065',
  '563135',
  '641993',
  '568669',
  '931030'],
 's3_even.json': ['693191',
  '630325',
  '720363',
  '830531',
  '466141',
  '989662',
  '502903',
  '932818'],
 's11_even.json': ['992898',
  '860542',
  '395635',
  '224395',
  '601680',
  '171324',
  '975033',
  '148661'],
 's4_even.json': ['712649',
  '346283',
  '395009',
  '227570',
  '177433',
  '166575',
  '462302',
  '798676'],
 's7_odd.json': ['664257',
  '284853',
  '623609',
  '442290',
  '688024',
  '709908',
  '819964',
  '320153',
  '770423',
  '457056'],
 's6_odd.json': ['536270',
  '436808',
  '585235',

In [140]:
# print number of sequences that have been sampled by the previous batch
prev_sampled_count = 0
for seq in sequence_sampling:
    if len(sequence_sampling[seq]) > 0:
        prev_sampled_count += 1
print('Number of sequences previously sampled: ', prev_sampled_count)       

# Work on a copy (including each id list) so the dictionary loaded from the
# previous tracking file is left untouched and this cell can be re-run safely.
new_sequence_sampling = {seq: list(ids) for seq, ids in sequence_sampling.items()}

# update sequence sampling dictionary
for subj in raw_final_data.subjID.unique():
    subj_df = raw_final_data.loc[raw_final_data['subjID'] == subj]
    # sequenceName looks like 'jsons/<file>.json'; the tracking keys are the file part
    subj_seq = subj_df.sequenceName.unique()[0].split('/')[1]
    # add subj to list for its corresponding sequence
    new_sequence_sampling[subj_seq].append(str(subj))


sampled_count = 0
unsampled_count = 0
for seq in new_sequence_sampling:
    if len(new_sequence_sampling[seq]) > 0:
        # remove duplicates of the same id; dict.fromkeys keeps first-seen order,
        # so the written file is stable from run to run (a set has no fixed order)
        new_sequence_sampling[seq] = list(dict.fromkeys(new_sequence_sampling[seq]))
        sampled_count += 1
    else:
        unsampled_count += 1
        new_sequence_sampling[seq] = []


# Save the updated tracking dictionary under the new version name.
with open(file_dest + new_version + "_GD_master_sequence_tracking.json", "w") as outfile:
    json.dump(new_sequence_sampling, outfile)

Number of sequences previously sampled:  24


In [141]:
# Number of tracked sequences times 8 (the same value as goal_participants_per_seq further down).
len(new_sequence_sampling.keys()) * 8

192

In [142]:
# Total number of participants still missing across sequences that have fewer than 8.
count_to_replace = 0
for key, sampled_ids in new_sequence_sampling.items():
    shortfall = 8 - len(sampled_ids)
    if shortfall > 0:
        print(key, len(sampled_ids))
        count_to_replace += shortfall

count_to_replace

0

## Find Sequences to Replace

In this design, there need to be eight participants per sequence (24 total sequences, i.e. 192 participants in total)

In [94]:
goal_participants_per_seq = 8
sequences_to_replace = []

# List each under-sampled sequence once per participant it still needs.
total = 0
for seq_key, sampled_ids in new_sequence_sampling.items():
    needed = goal_participants_per_seq - len(sampled_ids)
    if needed > 0:
        total += needed
        sequences_to_replace.extend([seq_key] * needed)

len(set(sequences_to_replace)), len(sequences_to_replace), total

(0, 0, 0)

## Create new variables file and new counterbalancing file

Cannot have multiple rows with the same url in the variables file right now, which is why this is necessary

### Make Counterbalancing File

In [66]:

# One counterbalancing row per entry of sequences_to_replace.
# The loop variable must not be called `json`: that name would rebind the
# imported json module and break every later (or re-run) json.dump / json.load.
paths = [{'Path': 'jsons/' + seq_file, 'Sampled': 0} for seq_file in sequences_to_replace]

In [67]:
# Column order of the counterbalancing file.
fieldnames = ['Path', 'Sampled']

# One dict per row, built in the previous cell.
rows = paths

dest = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_Exp2_MTurk/'
counterbalancing_csv_path = dest + 'counterbalancing.csv'

# Header first, then the rows one at a time.
with open(counterbalancing_csv_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

In [68]:
# Read the counterbalancing file back in to check what was written.
counterbalancing_df = pd.read_csv(dest + 'counterbalancing.csv')

counterbalancing_df

Unnamed: 0,Path,Sampled
0,jsons/s1_odd.json,0
1,jsons/s3_odd.json,0
2,jsons/s5_odd.json,0


### Make Variables File

The row number in the counterbalancing csv does NOT match the url fragment, because the csv's row numbering also counts the header row (Path, Sampled).

The url fragment is the counterbalancing df index + 1 --> this has been validated in the console log of the experiment

To backtrack from the url fragments to the corresponding row of the counterbalancing csv: row = url_fragment + 1

In [69]:
# Fragments are 1-based: one per entry of sequences_to_replace.
url_fragments = list(range(1, len(sequences_to_replace) + 1))

# should be equal
len(url_fragments), len(set(url_fragments))

(3, 3)

In [70]:
# number for the NEXT batch 
# Only used in the name of the variables CSV ('-B<batch>') written at the end of the notebook.
batch = 4

In [71]:
dest_variables_csv = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_Exp2_MTurk/batch_variables/'

base_url = 'http://54.235.29.9/FacialAge/OSS_Exp2_MTurk/Gab_OSS_HTML.html#'

# One experiment URL per fragment, each initially marked 'unsampled'.
variables = {
    'experiment_url': [base_url + str(fragment) for fragment in url_fragments],
    'sampled': ['unsampled' for _ in url_fragments],
}

variables_df = pd.DataFrame(variables)

In [72]:
# Display the table for a visual check before it is written to disk.
variables_df

Unnamed: 0,experiment_url,sampled
0,http://54.235.29.9/FacialAge/OSS_Exp2_MTurk/Ga...,unsampled
1,http://54.235.29.9/FacialAge/OSS_Exp2_MTurk/Ga...,unsampled
2,http://54.235.29.9/FacialAge/OSS_Exp2_MTurk/Ga...,unsampled


In [73]:
# Batch-specific copy of the variables file.
batch_csv_name = 'gab-discrim-variables' + '-B' + str(batch) + '.csv'
variables_df.to_csv(dest_variables_csv + batch_csv_name, index=False)

# Second copy, written as 'variables.csv' into the app folder.
server_dest = '/Users/prachimahableshwarkar/Documents/GW/spatial_perception/app/'
server_csv_path = server_dest + 'variables.csv'

variables_df.to_csv(server_csv_path, index=False)