In [None]:
import os
import json
import pandas as pd
import time

# Define the root directory
ROOT_DIR = "DARWIN-RAW_experiment" # ADJUST NAME

# Function to read metadata text file
def read_metadata(metadata_path, folder_id):
    metadata = {"folder_id": folder_id}  # Include folder_id for reference
    with open(metadata_path, "r", encoding="latin-1") as f:
        for line in f:
            key, value = line.strip().split(": ", 1)  # Splitting "Key: Value" construction of metadata text files
            metadata[key] = int(value) if value.isdigit() else value  # Convert numbers
    return metadata

# Function to read data_entries text file
def read_data_file(file_path):
    data = []
    with open(file_path, "r", encoding="latin-1") as f:
        for line in f:
            parts = line.strip().split(" ")  # Values are separated by one space are added in a list
            data.append([int(parts[0]), int(parts[1]), int(parts[2]), int(parts[3]), parts[4]])          #??????? We dont need to part[3] (either 0/1) given we got the part[2] (pressure) , reducing the size of the dataset
    return data

start_time = time.time()

# Lists to store metadata and data separately
metadata_list = []
data_entries = []

# Iterations through folders and files
for folder in sorted(os.listdir(ROOT_DIR)):  # create a list of folders which are participants
    folder_path = os.path.join(ROOT_DIR, folder)
    if os.path.isdir(folder_path):
        files = sorted(os.listdir(folder_path)) # create a list of files which are either writing tasks or participant metadata

        # Detect metadata file first (always starts with "anagrafica_")
        metadata_file = next((f for f in files if f.startswith("anagrafica_")), None) # since we expect only one file we take the first file we encounter. 
        if not metadata_file:
            continue  # Skip if no metadata file found (unlikely)

        metadata_path = os.path.join(folder_path, metadata_file)
        metadata = read_metadata(metadata_path, folder)
        metadata_list.append(metadata)

        # Read the remaining 25 data_entries files
        for data_file in files:
            if data_file.endswith(".txt") and not data_file.startswith("anagrafica_"):
                file_path = os.path.join(folder_path, data_file)
                
                task_name = os.path.splitext(data_file)[0]

                data_entries.append({
                    "participant": folder,
                    "task": data_file.replace(".txt", ""),
                    "data": read_data_file(file_path)
                })

end_time = time.time()
execution_time = end_time - start_time
print(f"Code execution time: {execution_time} sec")

print(len(data_entries))  # To see how many entries are in the dataset
print(data_entries[:1])  # Check first few entries


Code execution time: 33.41648507118225 sec
4350
[{'participant': 'H01', 'task': 'T1', 'data': [[6078, 10416, 0, 0, '10:45:37.089'], [6078, 10416, 0, 0, '10:45:37.094'], [6100, 10440, 0, 0, '10:45:37.099'], [6133, 10474, 0, 0, '10:45:37.104'], [6175, 10521, 0, 0, '10:45:37.109'], [6223, 10578, 0, 0, '10:45:37.114'], [6258, 10621, 0, 0, '10:45:37.119'], [6290, 10666, 0, 0, '10:45:37.124'], [6319, 10706, 0, 0, '10:45:37.129'], [6347, 10743, 0, 0, '10:45:37.134'], [6369, 10778, 0, 0, '10:45:37.139'], [6389, 10809, 0, 0, '10:45:37.144'], [6403, 10836, 0, 0, '10:45:37.149'], [6417, 10861, 0, 0, '10:45:37.154'], [6425, 10878, 0, 0, '10:45:37.159'], [6432, 10891, 0, 0, '10:45:37.164'], [6432, 10891, 0, 0, '10:45:37.169'], [6432, 10891, 0, 0, '10:45:37.174'], [6432, 10891, 0, 0, '10:45:37.179'], [6422, 10871, 0, 0, '10:45:37.184'], [6410, 10848, 0, 0, '10:50:21.174'], [6379, 10789, 0, 0, '10:50:21.179'], [6359, 10756, 0, 0, '10:50:21.184'], [6338, 10721, 0, 0, '10:50:21.189'], [6313, 10681, 0, 

In [6]:
start_time = time.time()

# Save metadata separately as a Pandas DataFrame
metadata_df = pd.DataFrame(metadata_list)
metadata_df.to_csv("metadata.csv", index=False, encoding="latin-1")  # Save as CSV

# Save data_entires in JSON
with open("data_entries.json", "w", encoding="UTF-8") as f:
    json.dump(data_entries, f, indent=None, ensure_ascii=True)

end_time = time.time()
execution_time = end_time - start_time
print(f"Code execution time: {execution_time} sec")

Code execution time: 70.52170395851135 sec
