In [29]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import datetime
import pytz

# Participant IDs that the per-user processing loop must leave out
skip_user_ids = {128, 134, 138}

# All paths are resolved relative to the directory the notebook runs from
main_dir = os.getcwd()

# Output folder for the generated per-participant CSV files;
# exist_ok=True makes the call a no-op when the folder is already there
calculated_time_dir = os.path.join(main_dir, "calculated_time")
os.makedirs(calculated_time_dir, exist_ok=True)

# Echo the base directory so the run records where it read/wrote data
print(main_dir)

def process_xml_file(file_path, user_id_str):
    """Summarise the time span covered by the responses in one XML file.

    The first and last ``response`` elements (in document order) supply the
    start and end ``core_time`` values, which are read as integer
    milliseconds since the Unix epoch.

    Parameters
    ----------
    file_path : str
        Path to the XML file. The file name (without extension) must contain
        a ``-``; the part after the first ``-`` becomes the task label.
    user_id_str : str
        Participant identifier, copied verbatim into the ``PID`` field.

    Returns
    -------
    dict
        ``PID``           -- ``user_id_str``
        ``Task``          -- ``'T'`` + second ``-``-separated part of the file name
        ``first_time``    -- first core_time as ``HH:MM:SSAM/PM`` in GMT
        ``last_time``     -- last core_time as ``HH:MM:SSAM/PM`` in GMT
        ``total_time``    -- ``datetime.timedelta`` between first and last
        ``total_time_ms`` -- the same span in milliseconds (float)

    Raises
    ------
    ValueError
        If the file has no ``response`` elements, a boundary response lacks a
        ``core_time`` attribute (or it is not an integer), or the file name
        has no ``-`` to derive the task from.
    """
    # Get the file name without the extension and derive the task label
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    name_parts = file_name.split('-')
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot derive task from file name {file_name!r}: expected a '-' separator"
        )

    # Parse the XML file and find all 'response' elements
    root = ET.parse(file_path).getroot()
    responses = root.findall('.//response')
    if not responses:
        raise ValueError(f"No 'response' elements found in {file_path}")

    def read_core_time(response):
        # int(None) would raise an unhelpful TypeError, so check explicitly
        raw_value = response.get('core_time')
        if raw_value is None:
            raise ValueError(f"'response' without 'core_time' attribute in {file_path}")
        return int(raw_value)

    # Extract the first and last core_time values
    first_core_time = read_core_time(responses[0])
    last_core_time = read_core_time(responses[-1])

    # Build the GMT datetimes with exact integer arithmetic from the epoch
    # instead of dividing by 1000 and going through a float timestamp.
    epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    first_datetime_gmt = epoch + datetime.timedelta(milliseconds=first_core_time)
    last_datetime_gmt = epoch + datetime.timedelta(milliseconds=last_core_time)

    # Human-readable 12-hour clock times in GMT
    first_time_gmt = first_datetime_gmt.strftime('%I:%M:%S%p')
    last_time_gmt = last_datetime_gmt.strftime('%I:%M:%S%p')

    # Calculate the total_time
    total_time = last_datetime_gmt - first_datetime_gmt

    # Take the millisecond span straight from the integer inputs:
    # total_seconds() * 1000 introduces float noise (1.001 * 1000 is
    # 1000.9999999999999). Kept as float so the CSV column format is unchanged.
    total_time_ms = float(last_core_time - first_core_time)

    # Return the processed data as a dictionary
    return {
        'PID': user_id_str,
        'Task': 'T' + name_parts[1],
        'first_time': first_time_gmt,
        'last_time': last_time_gmt,
        'total_time': total_time,
        'total_time_ms': total_time_ms
    }

# Loop through user IDs from 101 to 140 and write one CSV per participant
for user_id in range(101, 141):
    if user_id in skip_user_ids:
        continue

    user_id_str = str(user_id)

    # Directory containing raw XML data for each user
    user_dir = os.path.join(main_dir, 'raw_data_time', f'p{user_id_str}')

    # A missing folder should not abort the run for every later participant
    if not os.path.isdir(user_dir):
        print(f"Skipping p{user_id_str}: directory not found ({user_dir})")
        continue

    # Process each XML file in the folder and store the results in a list
    results = []
    for file_name in sorted(os.listdir(user_dir)):
        if file_name.endswith('.xml'):
            file_path = os.path.join(user_dir, file_name)
            file_result = process_xml_file(file_path, user_id_str)
            results.append(file_result)

    # An empty results list gives a DataFrame without 'PID'/'Task' columns,
    # which makes sort_values raise KeyError; skip the participant and write
    # no CSV at all.
    if not results:
        print(f"Skipping p{user_id_str}: no XML files in {user_dir}")
        continue

    # Build the frame, sort by 'PID' and 'Task', and renumber the index
    df = (
        pd.DataFrame(results)
        .sort_values(['PID', 'Task'])
        .reset_index(drop=True)
    )

    # Save the data frame to a CSV file in the calculated_time directory
    csv_file_path = os.path.join(calculated_time_dir, f'p{user_id_str}_time_ms.csv')
    df.to_csv(csv_file_path, index=False)


/Users/sarayabesi/Documents/research-poly/eye-tracker-project/data_extraction


In [30]:
import os
import pandas as pd

# Directory containing the generated CSV files
folder_path = os.path.join(os.getcwd(), 'calculated_time')

# The aggregated files are written into the same folder they are read from.
# They must be excluded from the inputs, otherwise re-running this cell
# feeds its own previous output back in and duplicates every row.
aggregate_file_names = {'all_times.csv', 'all_times_sorted.csv'}

# Get a sorted list of the per-participant CSV files in the folder
# (os.listdir order is arbitrary; sorting keeps all_times.csv reproducible)
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith(".csv") and file not in aggregate_file_names
)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list
if not csv_files:
    raise ValueError(f"No per-participant CSV files found in {folder_path}")

# Create an empty list to store individual DataFrames
dataframes = []

# Read each CSV file and append its DataFrame to the list
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
initial_df = pd.concat(dataframes).reset_index(drop=True)

# Save initial DataFrame with all the users and data
initial_df.to_csv(os.path.join(folder_path, 'all_times.csv'), index=False, na_rep='NA')

# Sort the concatenated DataFrame by 'PID' and 'Task'
sorted_df = initial_df.sort_values(by=['PID', 'Task'])

# Save the sorted DataFrame
sorted_df.to_csv(os.path.join(folder_path, 'all_times_sorted.csv'), index=False, na_rep='NA')

print("Files saved successfully in the calculated_time folder.")

Files saved successfully in the calculated_time folder.
