Moving onto word prediction! Must restructure data 

In [2]:
import os
import pandas as pd
import glob
import re # Import the regular expressions library

# --- Configuration ---
# Set the path to your main data folder
data_folder = "quest_training_data/"

print("Starting Smart Data Concatenation...")
print(f"Root data folder: {data_folder}\n")

Starting Smart Data Concatenation...
Root data folder: quest_training_data/



In [16]:



# --- Data Structure to Hold Results ---
# We will store each participant's complete session in this dictionary.
# Key: participant_id (e.g., 'ptx_01')
# Value: A single DataFrame with all their key presses in order.
all_session_data = {}


# --- Discover Participant Folders ---
# Get a list of all items in the data_folder that are directories
try:
    participant_folders = [f.path for f in os.scandir(data_folder) if f.is_dir()]
except FileNotFoundError:
    print(f"ERROR: The directory '{data_folder}' was not found. Please check the path.")
    participant_folders = []

if not participant_folders:
    print("No participant folders found. Exiting.")
else:
    print(f"Found {len(participant_folders)} participant folders to process.\n")


# --- Main Processing Loop ---
for folder_path in participant_folders:
    participant_id = os.path.basename(folder_path)
    print(f"--- Processing Participant: {participant_id} ---")

    # Find all CSV files for this participant
    # The '/**/' part makes the search recursive if needed, but here we target the folder directly.
    search_pattern = os.path.join(folder_path, '*.csv')
    csv_files = glob.glob(search_pattern)

    if not csv_files:
        print("  -> No CSV files found, skipping.\n")
        continue

    # --- CRITICAL: Sort files chronologically ---
    # We extract the numerical part of the filename to sort correctly.
    # This assumes filenames like '0_Master_ptx_01_..._162.29.csv' where '162.29' is the timestamp.
    # We will sort based on the final number in the filename.
    def get_sort_key(filepath):
        # Extracts the last floating point number from the filename
        matches = re.findall(r'(\d+\.\d+)\.csv', os.path.basename(filepath))
        if matches:
            return float(matches[-1])
        return 0 # Default if no number is found

    csv_files.sort(key=get_sort_key)

    # List to hold the data from each file for this participant
    list_of_dfs_for_session = []

    print(f"  -> Found {len(csv_files)} files. Concatenating in order...")
    for file in csv_files:
        try:
            # Note the tab delimiter!
            df = pd.read_csv(file, delimiter='\t')
            list_of_dfs_for_session.append(df)
        except Exception as e:
            print(f"    -> ERROR processing {os.path.basename(file)}. Error: {e}")

    # --- Concatenate all DataFrames for this session ---
    if list_of_dfs_for_session:
        # ignore_index=True re-creates a clean index for the new continuous DataFrame
        session_df = pd.concat(list_of_dfs_for_session, ignore_index=True)

        # Store the complete session DataFrame
        all_session_data[participant_id] = session_df

        print(f"  -> Success! Created a session DataFrame with shape: {session_df.shape}")

        

        #print(session_df["TimeStamp"].head(5))  # Display first 5 timestamps for sanity check
        print(session_df.head())
        print(session_df.columns)
        print(session_df.dtypes)
        print(session_df['TimeStamp'])
        # Optional: check for time gaps
        #time_diff = session_df['TimeStamp'].diff().max()  # Get the maximum time difference between consecutive rows

        #print(time_diff)
        #print(f"  -> Sanity Check: Max time delta between rows is {time_diff:.4f}s")
    print("-" * 30 + "\n")


print("\n--- Processing Complete ---")
print(f"Successfully created session data for {len(all_session_data)} participants.")

# You can now access any participant's data like this:
# example_participant = 'ptx_01'
# if example_participant in all_session_data:
#     print(f"\nExample DataFrame for '{example_participant}':")
#     print(all_session_data[example_participant].head())

Found 1 participant folders to process.

--- Processing Participant: ptx_01_x2 ---
  -> Found 4 files. Concatenating in order...
  -> Success! Created a session DataFrame with shape: (437, 1)
  TrialNumber,Participant_ID,Phrase_ID,Phrase,TimeStamp,Word,LetterIndex,Current_Letter,Pressed_Letter,KeyPressFlag,Meta_L_Palm_LOCAL_X,Meta_L_Palm_LOCAL_Y,Meta_L_Palm_LOCAL_Z,Meta_L_Thumb_Meta_LOCAL_X,Meta_L_Thumb_Meta_LOCAL_Y,Meta_L_Thumb_Meta_LOCAL_Z,Meta_L_Thumb_Prox_LOCAL_X,Meta_L_Thumb_Prox_LOCAL_Y,Meta_L_Thumb_Prox_LOCAL_Z,Meta_L_Thumb_Inter_LOCAL_X,Meta_L_Thumb_Inter_LOCAL_Y,Meta_L_Thumb_Inter_LOCAL_Z,Meta_L_Thumb_Distal_LOCAL_X,Meta_L_Thumb_Distal_LOCAL_Y,Meta_L_Thumb_Distal_LOCAL_Z,Meta_L_Thumb_End_LOCAL_X,Meta_L_Thumb_End_LOCAL_Y,Meta_L_Thumb_End_LOCAL_Z,Meta_L_Index_Meta_LOCAL_X,Meta_L_Index_Meta_LOCAL_Y,Meta_L_Index_Meta_LOCAL_Z,Meta_L_Index_Prox_LOCAL_X,Meta_L_Index_Prox_LOCAL_Y,Meta_L_Index_Prox_LOCAL_Z,Meta_L_Index_Inter_LOCAL_X,Meta_L_Index_Inter_LOCAL_Y,Meta_L_Index_Inter_LOCAL_Z

KeyError: 'TimeStamp'