In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm


In [4]:
import os

main_dir = os.getcwd()
print(main_dir)

# Participant IDs run from 101 to 140; the IDs listed here are skipped.
skip_user_ids = {128, 134, 138}

# Column order of every per-participant output CSV.
OUTPUT_COLUMNS = [
    'PID', 'Age', 'Gender', 'Ethnicity', 'Language', 'Class_attending', 'Q8_Code_experience_1_to_10',
    'Q10_years_coding_1_to_10', 'Q11_OOP_experience_1_to_10', 'Q12_Java_experience_1_to_10',
    'Author_impact_Y_N', 'Urgency_impact_Y_N', 'Task_difficulty', 'Task', 'bug_number', 'AOI',
    'AOI_No', 'Priority_Low_High', 'Expertise_Expert_Novice', 'Accuracy_0_1', 'Coding_Style_1_to_5',
    'Readability_1_to_5', 'Summary_1_to_5', 'Functionality_1_to_5', 'Patch_Quality_1_to_5',
    'Accept_Y_N', 'Trust_1_5', 'FxCount', 'FxRate', 'TotalFixationTime', 'AvgFixationDuration', 'FirstFixation'
]

# Zero-based column positions read from the i-th (per-task) questionnaire sheet.
TASK_SURVEY_COLUMNS = {
    'Coding_Style_1_to_5': 2,
    'Readability_1_to_5': 3,
    'Summary_1_to_5': 4,
    'Functionality_1_to_5': 5,
    'Patch_Quality_1_to_5': 6,
    'Accept_Y_N': 7,
    'Trust_1_5': 8,
}

# The post questionnaire is the 7th entry of the sorted survey file list
# (this relies on how the files in the surveys folder are named -- TODO confirm).
POST_SURVEY_INDEX = 6

# Zero-based column positions read from the post questionnaire sheet.
POST_SURVEY_COLUMNS = {
    'Age': 3,
    'Gender': 4,
    'Ethnicity': 5,
    'Language': 6,
    'Class_attending': 8,
    'Q8_Code_experience_1_to_10': 10,
    'Q10_years_coding_1_to_10': 12,
    'Q11_OOP_experience_1_to_10': 13,
    'Q12_Java_experience_1_to_10': 14,
    'Task_difficulty': 17,
    'Urgency_impact_Y_N': 20,
    'Author_impact_Y_N': 21,
}


def get_file_names(directory):
    """Return the sorted base names of every file found under ``directory``.

    The tree is walked recursively; only file names (not paths) are returned.
    """
    return sorted(name for _root, _dirs, files in os.walk(directory) for name in files)


def survey_answers(survey, participant_id, column_positions):
    """Read one participant's answers from a survey sheet by column position.

    Parameters
    ----------
    survey : pandas.DataFrame
        Survey sheet containing a 'Participant ID' column.
    participant_id : int
        Value matched against the 'Participant ID' column.
    column_positions : dict
        Maps an output field name to the zero-based column position to read.

    Returns
    -------
    dict
        ``{field: value}`` taken from the first row matching the participant.

    Raises
    ------
    IndexError
        If the sheet has no row for ``participant_id``.
    """
    participant_rows = survey[survey['Participant ID'] == participant_id]
    return {
        field: participant_rows.iloc[0, position]
        for field, position in column_positions.items()
    }


def within_lines(fixations, first_line, last_line):
    """Return the fixations whose 'source_file_line' lies in [first_line, last_line]."""
    line = fixations['source_file_line']
    return fixations[(line >= first_line) & (line <= last_line)]


def fixation_metrics(aoi_fixations, total_fix_count):
    """Summarise the fixations that fall inside one area of interest (AOI).

    Parameters
    ----------
    aoi_fixations : pandas.DataFrame
        Fixations inside the AOI; must provide 'duration' and 'start' columns.
    total_fix_count : int
        Number of fixations in the whole recording (denominator of FxRate).

    Returns
    -------
    dict
        FxCount, FxRate, TotalFixationTime, AvgFixationDuration and
        FirstFixation. Values that cannot be computed are the string 'NA'.
    """
    count = len(aoi_fixations)
    total_time = aoi_fixations['duration'].sum()
    return {
        'FxCount': count,
        # An empty recording used to raise ZeroDivisionError here; report 'NA'
        # instead, like the other metrics that cannot be computed.
        'FxRate': 'NA' if total_fix_count == 0 else count / total_fix_count,
        'TotalFixationTime': total_time,
        'AvgFixationDuration': 'NA' if count == 0 else total_time / count,
        'FirstFixation': 'NA' if count == 0 else aoi_fixations.iloc[0]['start'],
    }


# Inputs shared by every participant are read once instead of once per participant.
java_data = pd.read_excel(main_dir + '/java-info.xlsx')

survey_dir = main_dir + '/surveys'
survey_names = get_file_names(survey_dir)
q_data = [pd.read_excel(survey_dir + '/' + survey_name) for survey_name in survey_names]

# Folder receiving one CSV per participant; created if it does not exist.
folder_path = "generated_fixations_surveys/"
os.makedirs(folder_path, exist_ok=True)

for user_id in range(101, 141):
    if user_id in skip_user_ids:
        continue

    user_id = str(user_id)
    participant_id = int(user_id)

    user_dir = main_dir + '/raw_data/p' + user_id
    file_names = get_file_names(user_dir)

    # Read user info data
    user_data = pd.read_csv(user_dir + '/info.csv')

    print(len(q_data))

    # One fixation table per recording CSV (every .csv whose name does not contain "info").
    folder_data = {}
    for file_name in file_names:
        if file_name.endswith(".csv") and "info" not in file_name:
            task_key = file_name.replace(".csv", "")
            print('here:', task_key)
            fix_data = pd.read_csv(user_dir + '/' + file_name)
            # 'start' is the running sum of the preceding durations, seeded with 1 for the first row.
            fix_data['start'] = fix_data['duration'].shift(fill_value=1).cumsum()
            folder_data[task_key] = fix_data

    # Rows are collected as dicts and turned into a DataFrame once per
    # participant rather than concatenating onto a frame inside the loop.
    rows = []
    for i, (key, fixation_data) in enumerate(folder_data.items()):
        # The single character right after the first "-" in the key is the task number.
        task_number = 'T' + key[key.index("-") + 1]
        # The i-th recording is paired with the i-th entry of info.csv's 'order' column.
        bug_number = user_data['order'][i]
        bug_name = 'Bug-' + str(bug_number) + '.txt'
        # First java-info row whose [min, max] range contains this bug number.
        bug_general = java_data[(bug_number >= java_data['min']) & (bug_number <= java_data['max'])].iloc[0]
        total_fix_count = len(fixation_data)

        # Both are written as 0 for every row by this cell.
        expertise = 0
        urgency = 0

        # Fields that are identical on every AOI row of this task.
        shared = {
            'PID': user_id,
            'Task': task_number,
            'bug_number': bug_number,
            'Priority_Low_High': urgency,
            'Expertise_Expert_Novice': expertise,
            'Accuracy_0_1': bug_general['quality'],
        }
        # 6 task questionnaires: the i-th sheet belongs to the i-th recording.
        shared.update(survey_answers(q_data[i], participant_id, TASK_SURVEY_COLUMNS))
        # 1 post questionnaire.
        shared.update(survey_answers(q_data[POST_SURVEY_INDEX], participant_id, POST_SURVEY_COLUMNS))

        target = fixation_data['fixation_target']
        on_bug_report = target == str(bug_name)
        on_src_class = target == (bug_general['src_class'] + '.java')
        on_test_class = target == (bug_general['test_class'] + '.java')

        src_class_fixations = fixation_data[on_src_class]
        test_class_fixations = fixation_data[on_test_class]

        aoi_fixations = [
            # A1: the bug report file
            ('BugReport', 'A1', fixation_data[on_bug_report]),
            # A2: the relevant source file
            ('RelevantCodeClass', 'A2', src_class_fixations),
            # A3: inside the relevant method of the source file
            ('RelevantCodeMethod', 'A3', within_lines(
                src_class_fixations, bug_general['src_method_start'], bug_general['src_method_end'])),
            # A4: the relevant test file
            ('RelevantTestClass', 'A4', test_class_fixations),
            # A5: inside the relevant method of the test file
            ('RelevantTestMethod', 'A5', within_lines(
                test_class_fixations, bug_general['test_method_start'], bug_general['test_method_end'])),
            # A6: everything that is not the bug file, source file or test file
            ('Others', 'A6', fixation_data[~(on_bug_report | on_src_class | on_test_class)]),
        ]

        # A0: all fixations of the recording; it has no FirstFixation value.
        total_time = fixation_data['duration'].sum()
        rows.append({
            **shared,
            'AOI': 'Total',
            'AOI_No': 'A0',
            'FxCount': total_fix_count,
            'FxRate': 'NA' if total_fix_count == 0 else 1,
            'TotalFixationTime': total_time,
            'AvgFixationDuration': 'NA' if total_fix_count == 0 else total_time / total_fix_count,
            'FirstFixation': 'NA',
        })
        for aoi_name, aoi_number, fixations in aoi_fixations:
            rows.append({
                **shared,
                'AOI': aoi_name,
                'AOI_No': aoi_number,
                **fixation_metrics(fixations, total_fix_count),
            })

    df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)

    # Save the DataFrame to a CSV file in the specified folder
    file_name = 'p' + user_id + '_fixations.csv'
    df.to_csv(os.path.join(folder_path, file_name), index=False)
    
    

/Users/sarayabesi/Documents/research-poly/eye-tracker/experiment_data
7
here: p101-1
here: p101-2
here: p101-3
here: p101-4
here: p101-5
here: p101-6
7
here: p102-1
here: p102-2
here: p102-3
here: p102-4
here: p102-5
here: p102-6
7
here: p103-1
here: p103-2
here: p103-3
here: p103-4
here: p103-5
here: p103-6
7
here: p104-1
here: p104-2
here: p104-3
here: p104-4
here: p104-5
here: p104-6
7
here: p105-1
here: p105-2
here: p105-3
here: p105-4
here: p105-5
here: p105-6
7
here: p106-1
here: p106-2
here: p106-3
here: p106-4
here: p106-5
here: p106-6
7
here: p107-1
here: p107-2
here: p107-3
here: p107-4
here: p107-5
here: p107-6
7
here: p108-1
here: p108-2
here: p108-3
here: p108-4
here: p108-5
here: p108-6
7
here: p109-1
here: p109-2
here: p109-3
here: p109-4
here: p109-5
here: p109-6
7
here: p110-1
here: p110-2
here: p110-3
here: p110-4
here: p110-5
here: p110-6
7
here: p111-1
here: p111-2
here: p111-3
here: p111-4
here: p111-5
here: p111-6
7
here: p112-1
here: p112-2
here: p112-3
here: p11

In [5]:
# Merge the per-participant fixation CSV files into one table and also save a
# copy sorted by participant and task.

import os
import pandas as pd

# Folder holding the per-participant CSVs; the merged files are written here too.
folder_path = main_dir + '/generated_fixations_surveys'

# The merged outputs live in the same folder as the inputs, so they must be
# excluded from the input list; otherwise re-running this cell reads its own
# previous output back in and duplicates every row.
merged_file_names = {'all_fixations.csv', 'all_fixations_sorted.csv'}

# os.listdir returns names in arbitrary order; sort them so the row order of
# the merged file is the same on every run.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith(".csv") and file not in merged_file_names
)

# Read every per-participant CSV
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
initial_df = pd.concat(dataframes, ignore_index=True)

# Save initial dataframe with all the users and data
initial_df.to_csv(os.path.join(folder_path, 'all_fixations.csv'), index=False, na_rep='NA')

# Sort the concatenated DataFrame
sorted_df = initial_df.sort_values(by=['PID', 'Task'])

# Save the sorted DataFrame
sorted_df.to_csv(os.path.join(folder_path, 'all_fixations_sorted.csv'), index=False, na_rep='NA')

print("Files saved successfully in the generated_fixations_surveys folder.")

Files saved successfully in the generated_fixations folder.
