# Participant Exclusion

In [83]:
import os
import json 
import pandas as pd 
import numpy as np 
import math

In [84]:
def combineCSVs(datafolder, exclude=None):
    """
    Load participant data into one pandas DataFrame.

    Parameters
    ----------
    datafolder : str
        Either a directory holding participant CSV files (all of them are
        combined) or the path of a single participant file.
    exclude : iterable, optional
        Subject IDs to leave out when combining a directory. Defaults to
        excluding nobody. Not applied when `datafolder` is a single file.

    Returns
    -------
    pandas.DataFrame
        The combined rows. For a directory the index is renumbered
        (`ignore_index=True`); for a single file it is whatever
        `pd.read_csv` produced.

    Raises
    ------
    FileNotFoundError
        If `datafolder` is neither an existing directory nor an existing file.
    ValueError
        If a directory yields no participant data to combine.
    """
    excluded_ids = list(exclude) if exclude is not None else []

    if os.path.isdir(datafolder):
        frames = []
        # os.listdir order is arbitrary; sorting keeps the row order reproducible
        for filename in sorted(os.listdir(datafolder)):
            # match the extension, not any name that merely contains "csv"
            if not filename.lower().endswith('.csv'):
                continue

            df = pd.read_csv(os.path.join(datafolder, filename), index_col=None, header=0)
            if df.empty:
                # a file without rows has no subject ID and nothing to contribute
                continue

            # the first subject ID found in the file is treated as the file's ID
            subjID = df.subjID.unique()[0]
            if subjID not in excluded_ids:
                frames.append(df)

        if not frames:
            raise ValueError('No participant CSV data found in: ' + str(datafolder))

        input_frame = pd.concat(frames, axis=0, ignore_index=True)

    elif os.path.isfile(datafolder):
        # an explicitly named file is read as CSV regardless of its name
        input_frame = pd.read_csv(datafolder, index_col=None, header=0)

    else:
        raise FileNotFoundError('Not a file or directory: ' + str(datafolder))

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame

In [85]:
# Earlier data location, kept for reference:
# data_path = '/Users/prachi/Documents/object_scene_scaling_data/pilot2'

# Path handed to combineCSVs in the next cell.
# NOTE(review): absolute, machine-specific path -- must be edited to run elsewhere.
data_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_MTurk/data'

In [86]:
# Load the participant data found at data_path into one DataFrame.
input_data = combineCSVs(data_path)

Number of participants before cleaning:  114


In [87]:
# input_data

In [88]:
# Passed as `num_trials` to the cleaning functions below; Accuracy_Cleaning
# divides each participant's summed accuracy by this value.
num_trials = 24

In [89]:
def cleanbyPracticeTries(df, num_allowed_tries):
    """
    Remove participants who needed more than `num_allowed_tries` practice tries.

    A participant's practice-try count is the first unique value found in
    their `pracTries` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial data with `subjID` and `pracTries` columns.
    num_allowed_tries : int
        Largest practice-try count a participant may have and still be kept.

    Returns
    -------
    df2 : pandas.DataFrame
        Rows of the kept participants, grouped by participant in order of
        first appearance. Empty (same columns as `df`) when nobody is kept.
    prac_too_many_dist : list
        Practice-try count of every removed participant.
    exclude : list
        Subject IDs of the removed participants.
    """
    remove = []
    kept_frames = []
    prac_too_many_dist = []

    for subj in df.subjID.unique():
        subj_df = df.loc[df['subjID'] == subj]
        subj_num_practice_tries = subj_df.pracTries.unique()[0]

        if subj_num_practice_tries > num_allowed_tries:
            prac_too_many_dist.append(subj_num_practice_tries)
            remove.append(subj)
        else:
            # deep copy so later edits never write through to `df`
            kept_frames.append(subj_df.copy(deep=True))

    # pd.concat raises on an empty list, so fall back to an empty frame
    # with the same columns when every participant was removed
    if kept_frames:
        df2 = pd.concat(kept_frames)
    else:
        df2 = df.iloc[0:0].copy()

    print('Number of participants with more than ' + str(num_allowed_tries) + ' practice tries:', len(remove))

    exclude = list(remove)

    return df2, prac_too_many_dist, exclude
    

In [90]:
# Remove participants with more than 3 practice tries; `exclude` starts the
# list of removed subject IDs that the later cleaning steps extend.
pracTries_cleaned_data, prac_dist, exclude = cleanbyPracticeTries(input_data, 3)

Number of participants with more than 3 practice tries: 4


In [91]:
# Number of participants remaining after the practice-tries exclusion.
len(pracTries_cleaned_data.subjID.unique())

110

In [92]:
def Accuracy_Cleaning(df, accuracy_threshold, num_trials, exclude):
    """
    Remove participants with overall accuracy below the accuracy threshold (e.g. 0.7).

    A participant's accuracy is the sum of their `accuracy` column divided by
    `num_trials`.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial data with `subjID`, `accuracy` and `keyPress` columns.
    accuracy_threshold : float
        Minimum accuracy (as a proportion) a participant needs to be kept.
    num_trials : int
        Denominator used to turn summed accuracy into a proportion.
    exclude : list
        Running list of excluded subject IDs. It is extended IN PLACE with the
        participants removed here and also returned.

    Returns
    -------
    df2 : pandas.DataFrame
        Rows of participants that passed the threshold, grouped by participant
        in order of first appearance.
    list_trials_correct : list of int
        Number of trials with accuracy == 1 for every participant in `df`
        (removed participants included).
    keypresses : list
        `keyPress` values of the not-correct trials of the LAST participant
        processed (empty if `df` has no participants).
        NOTE(review): looks like this may have been meant to cover all
        participants -- confirm before relying on it.
    exclude : list
        The same list object that was passed in, now including the removed IDs.
    """
    remove = []
    kept_frames = []
    list_trials_correct = []
    # removed participants with 4 or more 'none' key presses on not-correct trials
    no_response_count = 0
    # defined up front so the return statement works when df has no participants
    keypresses = []

    for subj in df.subjID.unique():
        subj_df = df.loc[df['subjID'] == subj]
        accuracy = subj_df['accuracy']
        is_correct = (accuracy == 1).to_numpy()

        subj_num_correct_trials = int(is_correct.sum())
        keypresses = subj_df.loc[~is_correct, 'keyPress'].tolist()

        # plain numpy sum: a missing accuracy value makes the result NaN,
        # and NaN < threshold is False, so such a participant is kept
        subj_acc = np.sum(accuracy.to_numpy()) / num_trials

        if subj_acc < accuracy_threshold:
            remove.append(subj)
            if sum(1 for key in keypresses if key == 'none') >= 4:
                no_response_count += 1
        else:
            # only passing participants are collected, so no rows have to be
            # dropped by index label afterwards (label-based dropping removes
            # unrelated rows when index labels repeat)
            kept_frames.append(subj_df.copy(deep=True))

        list_trials_correct.append(subj_num_correct_trials)

    # pd.concat raises on an empty list
    if kept_frames:
        df2 = pd.concat(kept_frames)
    else:
        df2 = df.iloc[0:0].copy()

    # report the threshold actually used instead of a hard-coded 70%
    print("Number of Participants with accuracy below {:g}%: ".format(accuracy_threshold * 100), len(remove))

    print('Number of participants that did not respond for 4 or more trials:', no_response_count)

    print('Number of participants left: ', len(df2.subjID.unique()))

    exclude += remove

    return df2, list_trials_correct, keypresses, exclude

In [93]:
# Remove participants whose accuracy (summed accuracy / num_trials) is below 0.7
# and add their IDs to the running `exclude` list.
Accuracy_cleaned_data, correct_trials_distribution, keypresses, exclude = Accuracy_Cleaning(pracTries_cleaned_data, 0.7, num_trials, exclude)


Number of Participants with accuracy below 70%:  4
Number of participants that did not respond for 4 or more trials: 2
Number of participants left:  106


In [94]:
def RT_Cleaning(df, outlier_range, num_trials, exclude):
    """
    Remove trials where trial RT is outside of the defined outlier range.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial data with `subjID` and `RT` columns.
    outlier_range : sequence
        `outlier_range[0]` is the lower bound and `outlier_range[1]` the upper
        bound. Trials with RT strictly below the lower bound or strictly above
        the upper bound are removed; RTs equal to a bound are kept.
    num_trials : int
        Not used by this function; kept so the signature matches the other
        cleaning steps.
    exclude : list
        Running list of excluded subject IDs. No participants are removed
        here, so it is returned unchanged.

    Returns
    -------
    df2 : pandas.DataFrame
        `df` without the outlier trials, grouped by participant in order of
        first appearance.
    list_trialRT : list
        RT of every trial in `df`, outliers included.
    exclude : list
        The list that was passed in.
    """
    all_subjIDs = df.subjID.unique()
    print(len(all_subjIDs))

    lower_bound = outlier_range[0]
    upper_bound = outlier_range[1]

    kept_frames = []
    list_trialRT = []
    for subj in all_subjIDs:
        subj_df = df.loc[df['subjID'] == subj]
        subj_rts = subj_df['RT']
        list_trialRT.extend(subj_rts.tolist())

        # positional boolean mask: rows are selected by position, so repeated
        # index labels cannot cause a valid trial to be removed with an outlier.
        # Comparisons with a missing RT are False, so such trials are kept.
        is_outlier = ((subj_rts < lower_bound) | (subj_rts > upper_bound)).to_numpy()
        kept_frames.append(subj_df.loc[~is_outlier].copy(deep=True))

    # pd.concat raises on an empty list
    if kept_frames:
        df2 = pd.concat(kept_frames)
    else:
        df2 = df.iloc[0:0].copy()
    print(len(df2.subjID.unique()))

    return df2, list_trialRT, exclude

In [95]:
# Drop individual trials with RT below 250 or above 5000
# (presumably milliseconds -- TODO confirm the unit of the RT column).
RT_cleaned_data, trialRTs_distribution, exclude = RT_Cleaning(Accuracy_cleaned_data, [250, 5000], num_trials, exclude)


106
106


In [96]:
def finalTrialCountCheck(df, num_trials, min_trials, exclude):
    """
    Remove participants that have fewer than `min_trials` trials left.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial data with a `subjID` column. It is NOT modified; a filtered copy
        is returned instead.
    num_trials : int
        Not used by this function; the cut-off is given directly by
        `min_trials`.
    min_trials : int
        Minimum number of rows a participant must have to be kept.
    exclude : list
        Running list of excluded subject IDs. It is extended IN PLACE with the
        participants removed here and also returned.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` without the rows of the removed participants; the row
        order and index labels of the remaining rows are unchanged.
    list
        The same `exclude` list object, now including the removed IDs.
    """
    remove = []
    # positional mask of rows to discard; selecting by position rather than by
    # index label keeps other participants' rows safe when labels repeat
    drop_rows = np.zeros(len(df), dtype=bool)

    for subj in df.subjID.unique():
        subj_rows = (df['subjID'] == subj).to_numpy()
        count_trials = int(subj_rows.sum())
        if count_trials < min_trials:
            remove.append(subj)
            drop_rows |= subj_rows

    print("Number of Participants with >= 10% trials removed: ", len(remove))

    # copy so the caller's frame is left untouched and the result can be
    # modified later without a SettingWithCopyWarning
    kept_df = df.loc[~drop_rows].copy()

    print("Number of participants left: ",len(kept_df.subjID.unique()))

    exclude += remove
    return kept_df, exclude

In [97]:
# Remove participants left with fewer than 22 trials after RT cleaning.
finalTrialCount_data, exclude = finalTrialCountCheck(RT_cleaned_data, num_trials, 22, exclude)

Number of Participants with >= 10% trials removed:  10
Number of participants left:  96


In [98]:
# Deep copy, so later changes to raw_final_data do not affect finalTrialCount_data.
raw_final_data = finalTrialCount_data.copy(deep=True)
# Number of participants in the final data set.
len(raw_final_data.subjID.unique())

96

In [99]:
# Count how many distinct participants remain for each sequence.
seqs = raw_final_data.sequenceName.unique()
# maps sequenceName -> number of unique subjIDs with that sequence
seq_track = {}
for seq in seqs:
    sq_df = raw_final_data.loc[raw_final_data['sequenceName']==seq]
    seq_track[seq] = len(sq_df.subjID.unique())

# display the per-sequence counts
seq_track

{'jsons/s7.json': 4,
 'jsons/s2.json': 4,
 'jsons/s13.json': 4,
 'jsons/s1.json': 4,
 'jsons/s17.json': 4,
 'jsons/s4.json': 4,
 'jsons/s12.json': 4,
 'jsons/s3.json': 4,
 'jsons/s0.json': 4,
 'jsons/s19.json': 4,
 'jsons/s23.json': 4,
 'jsons/s20.json': 4,
 'jsons/s21.json': 4,
 'jsons/s14.json': 4,
 'jsons/s6.json': 4,
 'jsons/s10.json': 4,
 'jsons/s5.json': 4,
 'jsons/s8.json': 4,
 'jsons/s18.json': 4,
 'jsons/s9.json': 4,
 'jsons/s22.json': 4,
 'jsons/s11.json': 4,
 'jsons/s16.json': 4,
 'jsons/s15.json': 4}

In [76]:
# Sequences for which experiment URLs are generated in the cells below.
# NOTE(review): hand-entered list -- confirm it against the seq_track counts.
sequences_to_replace = ['jsons/s10.json', 'jsons/s11.json']

In [77]:
# CSV whose 'Path' column is searched for each sequence when building URL fragments.
# NOTE(review): absolute, machine-specific path -- must be edited to run elsewhere.
counterbalancing_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_MTurk/counterbalancing.csv'
counterbalancing_df = pd.read_csv(counterbalancing_path)
# display the table
counterbalancing_df

Unnamed: 0,Path
0,jsons/s0.json
1,jsons/s1.json
2,jsons/s2.json
3,jsons/s3.json
4,jsons/s4.json
5,jsons/s5.json
6,jsons/s6.json
7,jsons/s7.json
8,jsons/s8.json
9,jsons/s9.json


In [78]:
# Show the first sequence that will be replaced.
sequences_to_replace[0]

'jsons/s10.json'

In [79]:
# For every sequence to replace, take the index label of its first matching row
# in counterbalancing_df and add 1; that number becomes the URL fragment.
# The [0] lookup raises IndexError if a sequence has no row in the 'Path' column.
url_fragments = []
for sequence in sequences_to_replace:
    url_fragments.append(counterbalancing_df.index[counterbalancing_df['Path']==sequence][0] + 1)

In [80]:
# Number for the NEXT batch; it is embedded in the name of the CSV written at the end.
batch = 3

In [81]:
# Folder prefix for the output CSV; the file name is appended to it by plain
# string concatenation, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path -- must be edited to run elsewhere.
dest_variables_csv = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/OSS_MTurk/batch_variables/'

# Experiment page; each URL fragment is appended after the '#'.
base_url = 'http://54.235.29.9/FacialAge/OSS_MTurk/OSS_HTML_e5v2.html#'

# single-column table: one experiment URL per sequence being replaced
variables = {'experiment_url': []}

for fragment in url_fragments:
     variables['experiment_url'].append(base_url + str(fragment))

variables_df = pd.DataFrame(variables)

In [82]:
# Write the URL table as <dest_variables_csv>e5v2_b<batch>.csv, without the index column.
variables_df.to_csv(dest_variables_csv + 'e5v2' + '_b' + str(batch) + '.csv', index=False)
