Preparation of participants' data for the analysis of response times (first cell).

*1) Reading participants' files*\
*2) Checking their overall accuracy*\
*3) Defining outliers (2.5 standard deviations above or below the mean) per condition per participant*

In [1]:
import os

import pandas as pd

# Every participant's CSV file is read into dict_all_participant (one dataframe per participant).
# The overall accuracy per participant is checked in the following step.
path = 'path_to_your_data'

dict_all_participant = {}
number_of_participants = 49

# NOTE(review): the files are read as subject-0.csv ... subject-48.csv but stored under the keys
# df1 ... df49 -- confirm that the file numbering really starts at 0.
for i in range(number_of_participants):
    filename = f"subject-{i}.csv"
    # os.path.join inserts the directory separator when `path` has no trailing one,
    # so both "data" and "data/" resolve to the same file.
    df = pd.read_csv(os.path.join(path, filename))
    dict_all_participant[f"df{i + 1}"] = df

# Keep only the participants whose overall accuracy (the mean of the
# correct_keyboard_response_1 column) is 70 % or higher.
dict_participants_sufficient_accuracy = {
    participant_key: participant_df
    for participant_key, participant_df in dict_all_participant.items()
    if participant_df["correct_keyboard_response_1"].mean() >= 0.70
}


list_upper_and_lower_values = []
# Per-participant dataframes are collected here and concatenated once after the loop,
# which avoids re-copying all accumulated rows on every iteration.
frames_all_participants = []

# Working with participants with sufficient overall accuracy, we are excluding outliers per participant per condition.
# We only take correctly answered word trials, as we are preparing a file for the analysis of response times.
for key, df in dict_participants_sufficient_accuracy.items():

    df_word_correct = df[
        (df["correct_keyboard_response_1"] == 1) & (df["target_type"] == "word")
    ]
    # Positional access works regardless of the index labels and for single-row files.
    # Assumes subject_nr is constant within one participant's file -- TODO confirm.
    subject_number = int(df["subject_nr"].iloc[0])

    # Flat layout: [subject_nr, condition, upper, lower, condition, upper, lower, ...]
    participant_boundaries = [subject_number]
    frames_conditions = []

    # Looping through the flanker conditions and excluding outliers (more than 2.5 standard deviations
    # above or below the mean), with mean and standard deviation calculated per condition per participant.
    for flanker_condition, df_condition in df_word_correct.groupby("flanker_condition"):

        response_times = df_condition["response_time_keyboard_response_1"]
        cond_time_mean = response_times.mean()
        # NOTE(review): std() is NaN when a condition holds a single trial; both boundaries are
        # then NaN and that trial is dropped. Confirm this is the intended handling.
        cond_time_std = response_times.std()

        upper_boundary = cond_time_mean + 2.5 * cond_time_std
        lower_boundary = cond_time_mean - 2.5 * cond_time_std

        # Strict inequalities: a trial lying exactly on a boundary is excluded.
        df_cond_without_outliers = df_condition[
            (response_times > lower_boundary) & (response_times < upper_boundary)
        ]

        participant_boundaries.extend(
            [flanker_condition, upper_boundary, lower_boundary]
        )
        frames_conditions.append(df_cond_without_outliers)

    # Upper and lower cut-off boundaries per participant per condition are saved into a list,
    # as they will be applied for defining outliers for the analysis of accuracies as well.
    list_upper_and_lower_values.append(participant_boundaries)

    # This dataframe contains data without outliers of a single participant.
    if frames_conditions:
        # The conditions are stacked last-group-first, which is the row order of the saved file.
        df_subject_without_outliers = pd.concat(
            frames_conditions[::-1], ignore_index=True
        )
        frames_all_participants.append(df_subject_without_outliers)
    else:
        # The participant has no correctly answered word trials.
        df_subject_without_outliers = pd.DataFrame()

# This dataframe contains data without outliers for all the participants with the sufficient overall accuracy.
df_all_people_without_outliers = (
    pd.concat(frames_all_participants, ignore_index=True)
    if frames_all_participants
    else pd.DataFrame()
)


# Write all correctly answered word trials (outliers removed) to disk; this file is the
# input for the analysis of response times.
df_all_people_without_outliers.to_csv(
    path_or_buf="file_with_data_without_outliers_response_times.csv"
)

Preparing a file with all of the participants' trials and their upper and lower cut-off boundaries that will be used for the analysis of accuracies.

*1) Creating a dataframe with participants' upper and lower boundaries from a previously created list*\
*2) Merging this dataframe with a dataframe that holds all of the trials of participants that were accurate enough.*

In [2]:
# The same upper and lower cut-off boundaries per participant per condition as for the response times analysis
# will be applied to define outliers. First, a dataframe with subject numbers and the upper and lower boundary
# per condition is built from the previously created list_upper_and_lower_values.
# Each entry of that list is flat: [subject_nr, condition, upper, lower, condition, upper, lower, ...].
participants = []

for subject_id, *flat_limits in list_upper_and_lower_values:
    # Regroup the flat tail into (condition, upper, lower) triples.
    triples = zip(flat_limits[0::3], flat_limits[1::3], flat_limits[2::3])
    participants.extend(
        [subject_id, condition, upper_value, lower_value]
        for condition, upper_value, lower_value in triples
    )


df_limits = pd.DataFrame(
    participants,
    columns=["subject_nr", "flanker_condition", "upper_boundary", "lower_boundary"],
)

# Creating a dataframe with all trials from participants with the sufficient overall accuracy. It is merged
# afterwards with the dataframe holding the upper and lower boundaries per condition per participant.
# subject_nr is cast to int and the result is kept, so the merge key is an integer column in this dataframe too.
frames_all_trials = [
    # assign() returns a new dataframe, so the ones stored in the dictionary stay untouched.
    participant_df.assign(subject_nr=participant_df["subject_nr"].astype(int))
    for participant_df in dict_participants_sufficient_accuracy.values()
]

# A single concatenation avoids re-copying all accumulated rows for every participant.
df_all_data_with_all_trials = (
    pd.concat(frames_all_trials, ignore_index=True)
    if frames_all_trials
    else pd.DataFrame()
)


# Inner join on participant and flanker condition: every trial receives the cut-off
# boundaries of its own participant and condition.
df_all_data_with_boundaries = pd.merge(
    df_all_data_with_all_trials,
    df_limits,
    on=["subject_nr", "flanker_condition"],
    how="inner",
)

# Write the merged trials to disk. The next Jupyter Notebook file uses this dataframe to define
# outliers among the incorrectly answered trials for the analysis of accuracies.
df_all_data_with_boundaries.to_csv(path_or_buf="all_data_with_boundaries.csv")