# Process and Save Cleaned Data

In [1]:
import os 
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import scipy 
import scipy.stats as stats
from scipy import stats
from statsmodels.stats.anova import AnovaRM
import copy
import datetime

In [2]:
# Number of trials expected per participant. It is passed as `num_trials` to the
# cleaning functions below, which derive their exclusion cut-offs from
# floor(num_trials * 0.1).
num_trials = 54

In [3]:
def combineCSVs(datafolder):
    """
    Load participant data from a folder of CSV files or from a single CSV file.

    Folder: every '*.csv' file is read and kept only when all of its rows belong
    to the 'final-intermixed-textured' experiment and its first sequenceName
    contains 'VE1000'. The kept files are concatenated with a fresh 0..n-1 index.
    Files are visited in sorted name order so the row order is reproducible.

    File: the CSV is read as-is (no experiment / sequence filtering).

    Args:
        datafolder = path to data (a folder of CSVs or one participant's CSV)
    Returns:
        df of all participant data
        OR
        df for single participant
    Raises:
        FileNotFoundError if datafolder is neither an existing file nor a folder
        ValueError if the file is not a CSV, or no CSV in the folder qualifies
    """
    expected_experiment = 'final-intermixed-textured'
    expected_sequence_tag = 'VE1000'

    if os.path.isdir(datafolder):
        data = []
        for filename in sorted(os.listdir(datafolder)):
            # match on the extension: a bare substring test also picks up names
            # such as 'csv_notes.txt'
            if not filename.lower().endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(datafolder, filename), index_col=None, header=0)

            # a participant file must belong to exactly one experiment; comparing
            # the whole unique() array to a string is ambiguous when it holds
            # several values
            experiment_names = df['experimentName'].unique()
            if len(experiment_names) != 1 or experiment_names[0] != expected_experiment:
                continue

            # str() guards against a missing (NaN) sequence name
            if expected_sequence_tag in str(df['sequenceName'].iloc[0]):
                data.append(df)

        if not data:
            raise ValueError('No matching participant CSV files found in: ' + str(datafolder))
        input_frame = pd.concat(data, axis=0, ignore_index=True)

    elif os.path.isfile(datafolder):
        if not str(datafolder).lower().endswith('.csv'):
            raise ValueError('Expected a .csv file, got: ' + str(datafolder))
        input_frame = pd.read_csv(datafolder, index_col=None, header=0)

    else:
        raise FileNotFoundError('No such file or directory: ' + str(datafolder))

    print('Number of participants before cleaning: ', len(input_frame.subjID.unique()))

    return input_frame


def feet_to_meters(ft):
    """
    Convert a length from feet to meters.

    Args:
        ft = value in feet

    returns:
        the same length in meters (ft multiplied by 0.3048)
    """
    meters_per_foot = 0.3048
    return ft * meters_per_foot

def getUnitConveredData(datafolder):
    '''
    Load the participant data and convert depth estimates given in feet to meters.

    Rows whose 'unitSelection' is 'feet' have their 'depth_estimate' converted
    with feet_to_meters; all other rows are left untouched (presumably they are
    already in meters -- confirm against the task's unit options).

    Args: 
        datafolder = path to data  
        
    returns:
        df with all estimates converted to meters      
    '''
    input_data = combineCSVs(datafolder) # combine CSVs from all participants 

    in_feet = input_data['unitSelection'] == 'feet'
    if in_feet.any():
        # an integer-typed column cannot hold the fractional meter values, so
        # widen it before writing them
        if pd.api.types.is_integer_dtype(input_data['depth_estimate']):
            input_data['depth_estimate'] = input_data['depth_estimate'].astype(float)

        # one masked assignment instead of a per-row iterrows()/.at loop
        input_data.loc[in_feet, 'depth_estimate'] = feet_to_meters(
            input_data.loc[in_feet, 'depth_estimate']
        )

    return input_data

def cleanAgeResponses(datafolder):
    '''
    Load the unit-converted data and repair 'age' entries given as a birth year.

    Any 'age' value of 1900 or more is treated as a year of birth and replaced
    by (current year - value). The previous cut-off of "> 2000" left a birth
    year of exactly 2000, and every 19xx birth year, unconverted.
    Each affected participant is printed once: the subjID, then the computed age
    and the value that was entered.

    NOTE: the age is computed against today's date, so the result depends on
    when this is run, not on when the data were collected.

    Args: 
        datafolder = path to data  
        
    returns:
        df with cleaned reported age 
    '''
    input_data = getUnitConveredData(datafolder)

    # the current year is the same for every row, so look it up once
    current_year = datetime.date.today().year
    earliest_birth_year = 1900  # no real age reaches this value

    is_birth_year = input_data['age'] >= earliest_birth_year
    if is_birth_year.any():
        reported = input_data.loc[is_birth_year, ['subjID', 'age']].drop_duplicates()
        for subj, birth_year in zip(reported['subjID'], reported['age']):
            print(subj)
            print(current_year - birth_year, birth_year)

        # update age in existing dataframe
        input_data.loc[is_birth_year, 'age'] = current_year - input_data.loc[is_birth_year, 'age']

    return input_data
 

def catchTrial_cleaning(path, correct_requirement, sequence_count):
    '''
    Exclude participants who fail the catch-trial attention check and strip the
    catch trials from everyone else's data.

    A catch trial is a row whose 'stimulus' path starts with 'catch_stimuli/'.
    The catch image has no target, so the correct response is a depth estimate
    of 0. Participants with fewer than correct_requirement correct catch trials
    are excluded. (The original notes describe 6 catch trials per participant
    with a 4/6 requirement.)

    Args:
        path = path to data (forwarded to cleanAgeResponses)
        correct_requirement = minimum number of correct catch trials to keep a participant
        sequence_count = dict of sequenceName -> count; incremented IN PLACE for the
                         sequence of each participant that passes. A sequence
                         that is not yet a key is added with a count of 1.

    returns:
        df of passing participants without catch-trial rows, grouped by
        participant in order of first appearance
    '''
    df = cleanAgeResponses(path)

    remove = []
    kept_frames = []

    for subj, subj_df in df.groupby('subjID', sort=False):
        # .str yields NaN (-> False) for a missing stimulus instead of raising
        is_catch = subj_df['stimulus'].str.split('/').str[0] == 'catch_stimuli'

        ####### VERSION WHERE CATCH TRIALS ARE ATTENTION CHECK: IMAGE HAS NO TARGET
        count_correct = int((is_catch & (subj_df['depth_estimate'] == 0)).sum())

        if count_correct < correct_requirement:
            remove.append(subj)
            print(count_correct)
            continue

        sequence = subj_df['sequenceName'].iloc[0]
        sequence_count[sequence] = sequence_count.get(sequence, 0) + 1

        # keep the participant, minus the catch trials
        kept_frames.append(subj_df.loc[~is_catch])

    print("Number of participants that did not pass the catch trial check:", len(remove))
    print("Participants that were removed:",remove)

    if not kept_frames:
        # nobody passed: return an empty frame with the same columns
        return df.iloc[0:0].copy()

    return pd.concat(kept_frames)
    

def removeMissedTrials(input_data, num_trials):
    """
    Participants were told that if they missed a trial, to respond '0'.
    This function removes those trials (depth_estimate == 0), then removes every
    participant whose number of missed trials reaches the threshold.

    NOTE: the threshold is floor(num_trials * 0.1) and the comparison is ">=",
    so with num_trials = 54 a participant is removed from 5 missed trials
    (about 9.3%), slightly below a strict 10%. This matches the other cleaning
    steps and is kept unchanged here.

    Args:
        input_data = df with 'subjID' and 'depth_estimate' columns; MODIFIED IN PLACE
        num_trials = number of trials per participant

    returns:
        the same df object, without missed trials and without removed participants
    """
    is_missed = input_data['depth_estimate'] == 0.0
    missed_rows = input_data.loc[is_missed]

    # Missed trials per participant. Participants without missed trials do not
    # appear here. Per-duration / per-sequence tallies, if wanted for
    # inspection, are missed_rows['duration'].value_counts() and
    # missed_rows['sequenceName'].value_counts().
    missedTrials_participants = missed_rows['subjID'].value_counts().to_dict()

    # remove participants data if the participant's missed trial count reaches the threshold
    threshold = math.floor(num_trials * 0.1)
    remove_ids = [subj for subj, count in missedTrials_participants.items() if count >= threshold]
    print("Number of participants with 10% or more missed trials: ", len(remove_ids))

    # one in-place drop for both the missed trials and the removed participants,
    # instead of dropping row by row while iterating the same frame
    to_drop = is_missed | input_data['subjID'].isin(remove_ids)
    input_data.drop(input_data.index[to_drop.to_numpy()], inplace=True)

    return input_data


In [5]:
# Data location passed to catchTrial_cleaning below (machine-specific absolute path).
path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/data_1000'

In [6]:
# Folder listing the sequence files (machine-specific absolute path).
sequences_path = '/Users/prachimahableshwarkar/Documents/GW/FacialAge/FacialAge_MTurk/BNav_EC2/DepthDuration/MX_fS_VE_MTurk/jsons'

# One zeroed counter per 'VE1000' sequence file, keyed as 'jsons/<file name>';
# catchTrial_cleaning increments these for participants that pass.
sequences_count_dict = {}
for seq in os.listdir(sequences_path):
    if 'VE1000' not in seq:
        continue
    sequences_count_dict['jsons/' + seq] = 0


In [7]:
# Load the data (unit conversion and age cleanup happen inside the call chain),
# then drop participants with fewer than 4 correct catch trials.
# sequences_count_dict is updated in place with the passing participants' sequences.
catch_trial_cleaned_data = catchTrial_cleaning(path, 4, sequences_count_dict)

Number of participants before cleaning:  434
0
2
3
Number of participants that did not pass the catch trial check: 3
Participants that were removed: [779583, 728446, 667592]


In [8]:
# add the image name as a column in the df:
# the second '/'-separated part of the stimulus path, up to its first underscore.
# Series.map works on the one column that is needed instead of building a row
# object for every trial with DataFrame.apply(axis=1).
catch_trial_cleaned_data['imageName'] = catch_trial_cleaned_data['stimulus'].map(
    lambda stimulus: stimulus.split('/')[1].split('_')[0]
)


In [9]:
# Remove missed trials (estimate of 0) and participants with too many of them.
# removeMissedTrials edits its argument in place, so catch_trial_cleaned_data and
# missed_trial_cleaned_data refer to the same, already-cleaned frame afterwards.
missed_trial_cleaned_data = removeMissedTrials(catch_trial_cleaned_data, num_trials)

Number of participants with 10% or more missed trials:  21


In [10]:
def RT_Cleaning(df, outlier_range, num_trials):
    """
    Remove trials whose 'trial_RT' falls outside a fixed range, and remove
    participants who have too many such trials.

    A participant is removed when their number of out-of-range trials is at
    least floor(num_trials * 0.1).

    Args:
        df = trial data with 'subjID' and 'trial_RT' columns (not modified)
        outlier_range = [lowest accepted RT, highest accepted RT]; values equal to
                        a bound are kept (presumably milliseconds -- confirm)
        num_trials = number of trials per participant

    returns:
        new df of the remaining participants' in-range trials, grouped by
        participant in order of first appearance
    """
    lowest_rt, highest_rt = outlier_range[0], outlier_range[1]
    threshold = math.floor(num_trials * 0.1)

    remove = []
    kept_frames = []
    for subj, subj_df in df.groupby('subjID', sort=False):
        rt = subj_df["trial_RT"]
        # a missing RT compares False on both sides and is therefore kept
        is_outlier = (rt < lowest_rt) | (rt > highest_rt)

        if int(is_outlier.sum()) >= threshold:
            remove.append(subj)
            continue

        kept_frames.append(subj_df.loc[~is_outlier])

    print("Number of Participants with 10% or more trials outside their RT range: ", len(remove))

    for subj in remove:
        print(subj)

    if not kept_frames:
        # every participant was removed: return an empty frame with the same columns
        return df.iloc[0:0].copy()

    return pd.concat(kept_frames)

In [11]:
# Drop trials with trial_RT below 250 or above 10000, and participants with too
# many such trials.
RT_cleaned_data = RT_Cleaning(missed_trial_cleaned_data, [250, 10000], num_trials)

Number of Participants with 10% or more trials outside their RT range:  46
208835
815873
806885
242290
196862
527411
933118
450327
546987
906353
530371
589274
545695
784848
778635
291051
672462
209842
331115
670523
472642
910910
386642
348842
516929
811476
347207
209320
455535
328794
898096
229024
716907
151804
669687
767364
629621
307932
783637
444494
295403
872205
604916
272069
742321
616166


In [12]:
def repeatResponses_Cleaning(df, min_unique_responses):
    """
    Some participants gave 'junk data' - same number repeated for many trials.
    Count the frequency of unique responses entered by each participant and
    remove participants that either
      (1) gave fewer than min_unique_responses distinct depth estimates, or
      (2) have a most-repeated-response count more than 3 standard deviations
          (population std across participants) away from the mean.

    Args:
        df = trial data with 'subjID' and 'depth_estimate' columns; MODIFIED IN PLACE
        min_unique_responses = minimum number of distinct estimates required

    returns:
        (df, max_repeats_distribution, num_unique_responses_distribution)
        the same df object with the participants removed, followed by two lists
        holding one value per participant present BEFORE removal, in order of
        first appearance
    """
    #List unique values in the df['subjID'] column
    all_subjIDs = df.subjID.unique()

    remove = []
    max_repeats_distribution = []
    num_unique_responses_distribution = []
    for subj in all_subjIDs:
        count_depth_estimates = df.loc[df['subjID'] == subj, 'depth_estimate'].value_counts()
        num_unique_responses = len(count_depth_estimates)
        num_unique_responses_distribution.append(num_unique_responses)
        max_repeats_distribution.append(count_depth_estimates.max())
        if num_unique_responses < min_unique_responses:
            remove.append(subj)
    # report the threshold that was actually applied (this used to say "6"
    # regardless of min_unique_responses)
    print(f'Number of participants with less than {min_unique_responses} unique responses:', len(remove))

    # the outlier range is the same for every participant, so compute it once
    repeats = np.array(max_repeats_distribution)
    avg_max_repeats = repeats.mean()
    std_max_repeats = repeats.std()
    lowest_allowed = avg_max_repeats - (3*std_max_repeats)
    highest_allowed = avg_max_repeats + (3*std_max_repeats)

    # reuse the per-participant maxima from the first pass instead of recounting
    already_removed = set(remove)
    for subj, max_repeats in zip(all_subjIDs, max_repeats_distribution):
        is_outlier = max_repeats < lowest_allowed or max_repeats > highest_allowed
        if is_outlier and subj not in already_removed:
            remove.append(subj)
            already_removed.add(subj)

    print("Number of total participants removed: repeat responses: ", len(remove))

    for subj in remove:
        df.drop(df[df['subjID'] == subj].index, inplace = True) 

    return df, max_repeats_distribution, num_unique_responses_distribution



In [13]:
# Remove participants with fewer than 6 distinct depth estimates or an outlying
# number of repeated responses. RT_cleaned_data is edited in place by this call.
repeat_resp_cleaned_data, max_repeats_distrib, num_unique_distrib = repeatResponses_Cleaning(RT_cleaned_data, 6)

Number of participants with less than 6 unique responses: 29
Number of total participants removed: repeat responses:  29


In [14]:
def finalTrialCountCheck(df, num_trials):
    """
    Remove participants that have too few trials left after the earlier
    cleaning steps.

    A participant is removed when their remaining trial count is at or below
    num_trials - floor(num_trials * 0.1). df is edited in place and returned.
    """
    # the cut-off is the same for every participant
    allowed_loss = math.floor(num_trials * 0.1)
    threshold_trials_remaining = num_trials - allowed_loss

    # participants whose remaining row count does not exceed the cut-off
    too_few_trials = [
        participant
        for participant in df.subjID.unique()
        if len(df.loc[df['subjID'] == participant].index) <= threshold_trials_remaining
    ]

    print("Number of Participants with >= 10% trials removed: ", len(too_few_trials))

    for participant in too_few_trials:
        participant_rows = df[df['subjID'] == participant].index
        df.drop(participant_rows, inplace=True)

    participants_left = len(df.subjID.unique())
    print("Number of participants left: ", participants_left)
    return df

In [15]:
# Final pass: drop participants left with too few trials after all earlier steps.
# The frame is edited in place by this call.
cleaned_data = finalTrialCountCheck(repeat_resp_cleaned_data, num_trials)

Number of Participants with >= 10% trials removed:  11
Number of participants left:  324


In [16]:
# Independent deep copy, so later steps do not touch cleaned_data.
final_data = cleaned_data.copy(deep=True)

In [17]:
# Number of distinct sequences among the remaining participants.
len(final_data.sequenceName.unique())

324

In [18]:
# Collect each participant's standard deviation of depth estimates
# (pandas .std(), i.e. sample standard deviation), in order of first appearance.
all_stds = []
for subj in final_data['subjID'].unique():
    subj_df = final_data.loc[final_data['subjID'] == subj]
    subj_estimates = subj_df['depth_estimate']
    avg_response = subj_estimates.mean()
    std_response = subj_estimates.std()
    all_stds.append(std_response)

# Mean and population standard deviation of the per-participant values.
stds_array = np.array(all_stds)
avg_stds = np.mean(stds_array)
std_stds = np.std(stds_array)

In [19]:
# Participant IDs flagged by the 3-standard-deviation check in this cell;
# df_filtered is later built by excluding them.
values_to_drop = []
def is_outside_X_std(numbers, number_to_check, X):
    """
    Tell whether number_to_check lies more than X standard deviations away from
    the mean of numbers.

    The population standard deviation (divide by len(numbers)) is used. A value
    exactly on a bound counts as inside.
    """
    n = len(numbers)
    center = sum(numbers) / n
    variance = sum((value - center) ** 2 for value in numbers) / n
    spread = variance ** 0.5

    below = number_to_check < center - X * spread
    above = number_to_check > center + X * spread
    return below or above

# Flag (and print) every participant whose spread of depth estimates lies more
# than 3 standard deviations from the mean spread across participants.
for subj in final_data.subjID.unique():
    subj_df = final_data.loc[final_data['subjID'] == subj]
    subj_std = subj_df['depth_estimate'].std()
    if not is_outside_X_std(all_stds, subj_std, 3):
        continue
    values_to_drop.append(subj)
    print(subj)


713930
894847
830607


In [20]:


# Keep only the rows whose 'subjID' is NOT in values_to_drop
# (i.e. exclude the participants flagged by the 3-standard-deviation check).
df_filtered = final_data[~final_data['subjID'].isin(values_to_drop)]

## Z-Score Outcomes

In [21]:
def zscored_outcomes(df):
    '''
    z-score depth estimates, actual depths and RTs within each participant:
        for each subj calculate their avg and std (pandas .std())
        zscored = (value - subj avg)/subj std

    Adds the columns 'zs_depth_estimates', 'zs_actual_depth' and 'zs_trial_RT'.
    The input df is not modified; the result is a new df grouped by participant
    in order of first appearance.
    '''
    # source column -> name of the z-scored column derived from it
    column_pairs = (
        ("depth_estimate", 'zs_depth_estimates'),
        ("actual_depth", 'zs_actual_depth'),
        ("trial_RT", 'zs_trial_RT'),
    )

    scored_frames = []
    for subj in df.subjID.unique():
        subj_rows = df.loc[df['subjID'] == subj]
        scored = subj_rows.copy(deep=True) # prevent setting with copy warning

        for source_col, zs_col in column_pairs:
            column = subj_rows[source_col]
            raw_values = np.array(list(column))
            scored[zs_col] = (raw_values - column.mean())/column.std()

        scored_frames.append(scored)

    return pd.concat(scored_frames)

In [22]:
# z-score the full cleaned dataset
zscored_final_data = zscored_outcomes(final_data)

# z-score the dataset without the participants listed in values_to_drop
filtered_zscored_final_data = zscored_outcomes(df_filtered)

In [23]:
# Create a new column 'condition' based on the content of 'stimulus', for both
# the full and the filtered dataset: 'BC' when the stimulus contains 'BC',
# otherwise 'FS'.
for frame in (zscored_final_data, filtered_zscored_final_data):
    frame['condition'] = frame['stimulus'].apply(lambda x: 'BC' if 'BC' in x else 'FS')

In [24]:
# Output folder (machine-specific absolute path) and base file name.
dest = '/Users/prachimahableshwarkar/Documents/GW/Depth_MTurk/familiarSize/data/'
label = 'tx-MX-data-1000ms.csv'

# Write both datasets, index column included; the filtered one gets a
# 'filtered-' prefix on its file name.
full_output_path = f"{dest}{label}"
filtered_output_path = f"{dest}filtered-{label}"

zscored_final_data.to_csv(full_output_path, index=True)
filtered_zscored_final_data.to_csv(filtered_output_path, index=True)
