In [1]:
import pandas as pd
import os 

# Base directory: the directory the notebook kernel is currently running in.
# Every data path below is built relative to it.
main_dir = os.getcwd()
print(main_dir)


# User IDs that the processing loop must skip
skip_user_ids = set((128, 134, 138))

# Function to get file names in a directory
def get_file_names(directory):
    """Return the sorted base names of every file found under `directory`.

    The directory tree is walked recursively with `os.walk`; only the file
    names (without their containing folder) are collected, so files with the
    same name in different sub-folders appear once per occurrence.

    Parameters
    ----------
    directory : str
        Root directory to walk.

    Returns
    -------
    list of str
        File base names in ascending (lexicographic) order.
    """
    return sorted(
        name
        for _root, _dirs, names in os.walk(directory)
        for name in names
    )


# Column order of every per-participant output CSV
output_columns = [
    'PID', 'Task', 'bug_number', 'AOI', 'AOI_No', 'Accuracy_0_1', 'Accept_Y_N',
    'FxCount', 'FxRate', 'TotalFixationTime', 'AvgFixationDuration', 'FirstFixation'
]


def _summarize_aoi(aoi_fixations, total_fix_count, task_info, aoi_name, aoi_no):
    """Build one output row describing the fixations inside a single AOI.

    Parameters
    ----------
    aoi_fixations : pandas.DataFrame
        The fixation rows that belong to the AOI. Must provide the
        'duration' and 'start' columns.
    total_fix_count : int
        Number of fixations in the whole task; denominator of 'FxRate'.
    task_info : dict
        Values shared by every row of the task ('PID', 'Task', 'bug_number',
        'Accuracy_0_1', 'Accept_Y_N').
    aoi_name, aoi_no : str
        Values written to the 'AOI' and 'AOI_No' columns.

    Returns
    -------
    dict
        One record keyed by output column name. A metric that is undefined
        (no fixation in the AOI, or no fixation in the task) is the string
        'NA'.
    """
    fix_count = len(aoi_fixations)
    total_time = aoi_fixations['duration'].sum()
    return {
        **task_info,
        'AOI': aoi_name,
        'AOI_No': aoi_no,
        'FxCount': fix_count,
        # Guard the denominator: an empty fixation file must not crash the run
        'FxRate': 'NA' if total_fix_count == 0 else fix_count / total_fix_count,
        'TotalFixationTime': total_time,
        'AvgFixationDuration': 'NA' if fix_count == 0 else total_time / fix_count,
        'FirstFixation': 'NA' if fix_count == 0 else aoi_fixations.iloc[0]['start'],
    }


# Read java info file once: it does not depend on the user being processed
java_data = pd.read_excel(os.path.join(main_dir, 'java-info.xlsx'))

# Output folder, anchored to main_dir; create it if it doesn't exist
folder_path = os.path.join(main_dir, 'generated_fixations')
os.makedirs(folder_path, exist_ok=True)

# Loop through user IDs from 101 to 140
for user_id in range(101, 141):
    if user_id in skip_user_ids:
        continue

    user_id_str = str(user_id)
    print('---------------')

    user_dir = os.path.join(main_dir, 'raw_data', f'p{user_id_str}')

    # NOTE(review): get_file_names walks sub-folders but returns base names
    # only, and they are joined with user_dir below - this assumes the data
    # files sit directly inside user_dir.
    file_names = get_file_names(user_dir)

    # Read user info data
    user_data = pd.read_csv(os.path.join(user_dir, 'info.csv'))

    # Load every fixation CSV of this user (all *.csv except the info file),
    # keyed by the file name without its extension.
    folder_data = {}
    for file_name in file_names:
        if file_name.endswith(".csv") and "info" not in file_name:
            task_key = file_name.replace(".csv", "")
            print('here:', task_key)
            fix_data = pd.read_csv(os.path.join(user_dir, file_name))
            # Start of each fixation: 1 for the first one, then 1 plus the
            # summed durations of all earlier fixations.
            fix_data['start'] = fix_data['duration'].shift(fill_value=1).cumsum()
            folder_data[task_key] = fix_data

    # One dict per output row; the DataFrame is built once per user instead
    # of being grown by repeated concatenation.
    records = []
    for i, (key, fixation_data) in enumerate(folder_data.items()):
        # The task number is the single character following the first '-'
        task_number = 'T' + key[key.index("-") + 1]
        # NOTE(review): row i of info.csv is paired with the i-th fixation
        # file in sorted-name order - confirm this matches the study design.
        bug_number = user_data['order'][i]
        bug_name = f'Bug-{bug_number}.txt'
        # First java-info row whose [min, max] range contains the bug number
        bug_general = java_data[(bug_number >= java_data['min']) & (bug_number <= java_data['max'])].iloc[0]
        is_accepted = user_data['trust'][i]
        is_correct = bug_general['quality']
        total_fix_count = len(fixation_data)

        task_info = {
            'PID': user_id, 'Task': task_number, 'bug_number': bug_number,
            'Accuracy_0_1': is_correct, 'Accept_Y_N': is_accepted
        }

        # Row masks for the three files that define the AOIs
        target = fixation_data['fixation_target']
        on_bug_report = target == str(bug_name)
        on_src_class = target == (bug_general['src_class'] + '.java')
        on_test_class = target == (bug_general['test_class'] + '.java')

        # AOI1: Bug report file
        fix_data_a1 = fixation_data[on_bug_report]

        # AOI2: Class in source code
        fix_data_a2 = fixation_data[on_src_class]

        # AOI3: Method in source code (inside class of source code)
        fix_data_a3 = fix_data_a2[(fix_data_a2['source_file_line'] >= bug_general['src_method_start']) &
                                  (fix_data_a2['source_file_line'] <= bug_general['src_method_end'])]

        # AOI4: Test class
        fix_data_a4 = fixation_data[on_test_class]

        # AOI5: Test method (inside test class)
        fix_data_a5 = fix_data_a4[(fix_data_a4['source_file_line'] >= bug_general['test_method_start']) &
                                  (fix_data_a4['source_file_line'] <= bug_general['test_method_end'])]

        # AOI6: Other files (anything that is not A1, A2 or A4)
        fix_data_a6 = fixation_data[~(on_bug_report | on_src_class | on_test_class)]

        # AOI0: All fixations in all files. FxRate is fixed to 1 and no
        # first-fixation time is reported for this row.
        total_fix_time = fixation_data['duration'].sum()
        records.append({
            **task_info,
            'AOI': 'Total', 'AOI_No': 'A0',
            'FxCount': total_fix_count,
            'FxRate': 1,
            'TotalFixationTime': total_fix_time,
            'AvgFixationDuration': 'NA' if total_fix_count == 0 else total_fix_time / total_fix_count,
            'FirstFixation': 'NA'
        })

        # Remaining AOIs, in output order A1..A6
        aoi_specs = [
            (fix_data_a1, 'BugReport', 'A1'),
            (fix_data_a2, 'RelevantCodeClass', 'A2'),
            (fix_data_a3, 'RelevantCodeMethod', 'A3'),
            (fix_data_a4, 'RelevantTestClass', 'A4'),
            (fix_data_a5, 'RelevantTestMethod', 'A5'),
            (fix_data_a6, 'Others', 'A6'),
        ]
        for aoi_fixations, aoi_name, aoi_no in aoi_specs:
            records.append(_summarize_aoi(aoi_fixations, total_fix_count, task_info, aoi_name, aoi_no))

    df = pd.DataFrame(records, columns=output_columns)

    # Save the DataFrame to a CSV file in the output folder
    output_name = f'p{user_id_str}_fixations.csv'
    df.to_csv(os.path.join(folder_path, output_name), index=False)
    
    

/Users/sarayabesi/Documents/research-poly/eye-tracker-project/data_extraction
---------------
here: p101-1
here: p101-2
here: p101-3
here: p101-4
here: p101-5
here: p101-6
---------------
here: p102-1
here: p102-2
here: p102-3
here: p102-4
here: p102-5
here: p102-6
---------------
here: p103-1
here: p103-2
here: p103-3
here: p103-4
here: p103-5
here: p103-6
---------------
here: p104-1
here: p104-2
here: p104-3
here: p104-4
here: p104-5
here: p104-6
---------------
here: p105-1
here: p105-2
here: p105-3
here: p105-4
here: p105-5
here: p105-6
---------------
here: p106-1
here: p106-2
here: p106-3
here: p106-4
here: p106-5
here: p106-6
---------------
here: p107-1
here: p107-2
here: p107-3
here: p107-4
here: p107-5
here: p107-6
---------------
here: p108-1
here: p108-2
here: p108-3
here: p108-4
here: p108-5
here: p108-6
---------------
here: p109-1
here: p109-2
here: p109-3
here: p109-4
here: p109-5
here: p109-6
---------------
here: p110-1
here: p110-2
here: p110-3
here: p110-4
here: p1

In [2]:
# Merge the generated per-participant fixation CSV files and sort them


import os
import pandas as pd

# Define the folder path where files should be saved
folder_path = os.path.join(main_dir, 'generated_fixations')

# Names of the merged files this cell writes into the same folder. They are
# excluded from the inputs; otherwise re-running the cell would merge the
# previous merged output back in and duplicate every row.
merged_file_names = {'all_fixations.csv', 'all_fixations_sorted.csv'}

# Get the per-participant CSV files in the folder. os.listdir returns names
# in arbitrary order, so sort them to make all_fixations.csv reproducible.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith(".csv") and file not in merged_file_names
)

# Fail with a clear message instead of pandas' generic concat error
if not csv_files:
    raise ValueError(f"No per-participant CSV files found in {folder_path}")

# Read each CSV file into its own DataFrame
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
initial_df = pd.concat(dataframes, ignore_index=True)

# Save initial dataframe with all the users and data
initial_df.to_csv(os.path.join(folder_path, 'all_fixations.csv'), index=False, na_rep='NA')

# Sort the concatenated DataFrame
sorted_df = initial_df.sort_values(by=['PID', 'Task'])

# Save the sorted DataFrame
sorted_df.to_csv(os.path.join(folder_path, 'all_fixations_sorted.csv'), index=False, na_rep='NA')

print("Files saved successfully in the generated_fixations folder.")

Files saved successfully in the generated_fixations folder.
