In [1]:
### INPUT: THE DIRECTORY CONTAINING ALL FILES
### OUTPUT: IF THE XML CONTAINS 3 LASER SACCADE, THEN OUTPUT = DF OF CLASSIFICATION OF EACH INDIVIDUAL SACCADE -
# % of all saccades in the test that are normal accuracy, hyper and hypo

In [2]:
#SETTING UP THE FILES TO BE PROCESSED
import os
import shutil

def _unique_destination(directory, filename):
    """Return a path inside ``directory`` for ``filename`` that does not exist yet.

    The plain ``directory/filename`` is used when it is free; otherwise
    ``"<stem> (<n>)<ext>"`` is tried for n = 1, 2, ... until a free name is found.
    """
    candidate = os.path.join(directory, filename)
    stem, ext = os.path.splitext(filename)
    counter = 1
    # lexists also catches dangling symlinks and directories with the same name.
    while os.path.lexists(candidate):
        candidate = os.path.join(directory, f"{stem} ({counter}){ext}")
        counter += 1
    return candidate


def move_files_to_main_directory(root_dir):
    """Move every file found in the subfolders of ``root_dir`` into ``root_dir``.

    Files that already sit directly in ``root_dir`` are left untouched, and the
    (now empty) subfolders are not removed. When two files share a name, the
    later one is stored as ``"<stem> (<n>)<ext>"`` instead of overwriting the
    file that is already in ``root_dir``.

    Args:
        root_dir: Path of the directory to flatten.

    Returns:
        None. The function only has the side effect of moving files on disk.
    """
    for dirpath, _, filenames in os.walk(root_dir):
        # The first entry yielded by os.walk is root_dir itself; nothing to move there.
        if dirpath == root_dir:
            continue
        for filename in filenames:
            source_path = os.path.join(dirpath, filename)
            destination_path = _unique_destination(root_dir, filename)
            shutil.move(source_path, destination_path)

# Flatten the 2023_July folder: pull every file out of its subfolders into the folder itself.
root_directory = "/Volumes/LTY-Photos/DatasetCollection/PCMdata/2023_July"
move_files_to_main_directory(root_dir=root_directory)
print("Files moved successfully.")


Files moved successfully.


In [10]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import zipfile
import io
from tqdm import tqdm

# Column order of the table returned by plot_saccade_waveform.
_RESULT_COLUMNS = [
    "Filename",
    "Percent normal",
    "Percent hyper",
    "Percent hyper_low",
    "Percent hypo",
]


def _load_single_xml(zip_path):
    """Return ``(xml_filename, root_element)`` for a ZIP holding exactly one XML file.

    Returns ``None`` (after printing why) when the archive holds zero or several
    ``.xml`` members, is not a readable ZIP, or its XML cannot be parsed.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            xml_filenames = [f for f in zip_ref.namelist() if f.endswith('.xml')]
            if len(xml_filenames) != 1:
                print("ignore zip file with", len(xml_filenames), "xml files!")
                return None
            xml_filename = xml_filenames[0]
            with zip_ref.open(xml_filename) as xml_file:
                return xml_filename, ET.parse(xml_file).getroot()
    except (zipfile.BadZipFile, ET.ParseError) as exc:
        # One damaged archive must not abort the whole batch.
        print("ignore unreadable zip:", os.path.basename(zip_path), "-", exc)
        return None


def _extract_saccade_rows(root):
    """Return ``(summary_rows, saccade_rows)`` as lists of dicts of raw XML text."""
    # ElementTree accepts "{uri}tag" directly, so the same paths work whether or
    # not the document declares a namespace (the prefix is simply empty then).
    prefix = root.tag.split('}', 1)[0] + '}' if '}' in root.tag else ''

    def text(node, tag, default):
        return node.findtext(f'{prefix}{tag}', default=default)

    summary_rows = [
        {
            'TestUID': text(test, 'TestUID', ''),
            'AvgAccuracyHRRightward': text(test, 'AvgAccuracyHRRightward', '0'),
            'AvgAccuracyHRLeftward': text(test, 'AvgAccuracyHRLeftward', '0'),
        }
        for test in root.findall(f'.//{prefix}VW_SaccadeTest')
    ]
    # NOTE(review): a <Saccade> without <AccuracyPercent> defaults to 0 and is
    # therefore counted as hypometric, as in the original code - confirm intended.
    saccade_rows = [
        {
            'TestUID': text(saccade, 'TestUID', ''),
            'TimeMs': text(saccade, 'TimeMs', '0'),
            'AccuracyPercent': text(saccade, 'AccuracyPercent', '0'),
        }
        for saccade in root.findall(f'.//{prefix}Saccade')
    ]
    return summary_rows, saccade_rows


def _rows_to_frame(rows, numeric_columns):
    """Build a DataFrame from ``rows``; ``TestUID`` stays a string merge key.

    Only ``numeric_columns`` are converted to numbers; text that is not a number
    becomes NaN instead of raising.
    """
    frame = pd.DataFrame(rows)
    frame['TestUID'] = frame['TestUID'].astype(str).str.strip()
    for column in numeric_columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')
    return frame


def _plot_accuracy(summary_rows, saccade_df, xml_filename):
    """Plot accuracy against time for saccades whose TestUID has a test summary."""
    summary_df = _rows_to_frame(
        summary_rows, ['AvgAccuracyHRRightward', 'AvgAccuracyHRLeftward']
    )
    merged_df = pd.merge(summary_df, saccade_df, on='TestUID', how='inner')

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(merged_df['TimeMs'], merged_df['AccuracyPercent'], 'o-', label='Accuracy')
    ax.set_title(f'Saccade Waveform - {xml_filename}')
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Accuracy (%)')
    ax.legend()
    plt.show()
    # Release the figure; one is created per file and they would otherwise pile up.
    plt.close(fig)


def plot_saccade_waveform(root_directory, plot=False, hyper_threshold=120,
                          hyper_low_threshold=110, hypo_threshold=70):
    """Classify the saccades of every zipped XML report under ``root_directory``.

    Each subfolder of ``root_directory`` is scanned for ``.zip`` files. A ZIP is
    used only if it contains exactly one ``.xml`` member and that XML holds at
    least one ``VW_SaccadeTest`` element and one ``Saccade`` element. For each
    such file, the share of saccades in each accuracy class is computed from the
    ``AccuracyPercent`` values.

    Args:
        root_directory: Directory whose subfolders contain the ZIP files.
        plot: If True, also show an accuracy-over-time figure per file.
        hyper_threshold: Saccades with accuracy strictly above this count as
            "hyper".
        hyper_low_threshold: Saccades with accuracy strictly above this count as
            "hyper_low" (this class includes the "hyper" ones when the
            threshold is lower than ``hyper_threshold``).
        hypo_threshold: Saccades with accuracy strictly below this count as
            "hypo".

    Returns:
        A DataFrame with one row per processed XML file and the columns
        ``Filename`` (XML member name without extension), ``Percent normal``
        (100 - hyper - hypo), ``Percent hyper``, ``Percent hyper_low`` and
        ``Percent hypo``. It is empty when no file qualified.
    """
    # Collect plain dicts and build the DataFrame once; concatenating inside the
    # loop copies the whole table for every file.
    records = []

    for folder in os.listdir(root_directory):
        folder_path = os.path.join(root_directory, folder)
        if os.path.isfile(folder_path):
            continue
        for zip_filename in tqdm(os.listdir(folder_path)):
            if not zip_filename.endswith(".zip") or zip_filename.startswith("."):
                print("ignore:", zip_filename)
                continue

            loaded = _load_single_xml(os.path.join(folder_path, zip_filename))
            if loaded is None:
                continue
            xml_filename, root = loaded

            summary_rows, saccade_rows = _extract_saccade_rows(root)
            if not summary_rows or not saccade_rows:
                continue  # Skip if saccade data is missing

            saccade_df = _rows_to_frame(saccade_rows, ['TimeMs', 'AccuracyPercent'])
            accuracy = saccade_df['AccuracyPercent'].dropna()
            if accuracy.empty:
                continue  # No usable accuracy value, nothing to classify
            num_sac = len(accuracy)

            num_hyper = 100 * (accuracy > hyper_threshold).sum() / num_sac
            num_hyper_low = 100 * (accuracy > hyper_low_threshold).sum() / num_sac
            num_hypo = 100 * (accuracy < hypo_threshold).sum() / num_sac
            num_nor = 100 - num_hyper - num_hypo

            records.append({
                "Filename": os.path.splitext(xml_filename)[0],
                "Percent normal": num_nor,
                "Percent hyper": num_hyper,
                "Percent hyper_low": num_hyper_low,
                "Percent hypo": num_hypo
            })

            if plot:
                _plot_accuracy(summary_rows, saccade_df, xml_filename)

    return pd.DataFrame(records, columns=_RESULT_COLUMNS)

# Summarise every subfolder of the dataset root and save the resulting table as CSV.
root_directory = "/Volumes/LTY-Photos/DatasetCollection/PCMdata"
data_summary = plot_saccade_waveform(root_directory=root_directory)
data_summary.to_csv(path_or_buf='2023_7_12_saccade_data_summary.csv')

 39%|███▉      | 53/135 [00:01<00:01, 47.39it/s]

ignore zip file with 2 xml files!


 52%|█████▏    | 70/135 [00:01<00:01, 45.42it/s]

ignore zip file with 3 xml files!


 69%|██████▉   | 93/135 [00:01<00:00, 53.08it/s]

ignore zip file with 2 xml files!


100%|██████████| 135/135 [00:02<00:00, 48.52it/s]


ignore: ._Able,Carmaletta_95236649.zip
ignore: ._Riley,Althea_78541309 (2).zip
ignore: ._Riley,Althea_78541309.zip


 18%|█▊        | 22/122 [00:00<00:01, 52.43it/s]

ignore zip file with 2 xml files!
ignore zip file with 2 xml files!


 79%|███████▊  | 96/122 [00:02<00:00, 40.24it/s]

ignore zip file with 2 xml files!


100%|██████████| 122/122 [00:02<00:00, 45.53it/s]
100%|██████████| 105/105 [00:01<00:00, 53.94it/s]


ignore: ._Altindirek,Korhan_35360702.zip


 38%|███▊      | 44/117 [00:00<00:01, 53.20it/s]

ignore zip file with 2 xml files!


 50%|████▉     | 58/117 [00:01<00:01, 53.47it/s]

ignore zip file with 2 xml files!


 62%|██████▏   | 72/117 [00:01<00:00, 51.10it/s]

ignore zip file with 2 xml files!


 73%|███████▎  | 85/117 [00:01<00:00, 52.67it/s]

ignore zip file with 2 xml files!


 85%|████████▍ | 99/117 [00:01<00:00, 51.96it/s]

ignore zip file with 2 xml files!


100%|██████████| 117/117 [00:02<00:00, 50.16it/s]
  7%|▋         | 10/140 [00:00<00:01, 66.19it/s]

ignore zip file with 2 xml files!
ignore zip file with 2 xml files!


 19%|█▊        | 26/140 [00:00<00:01, 61.15it/s]

ignore zip file with 2 xml files!


 94%|█████████▍| 132/140 [00:02<00:00, 55.45it/s]

ignore zip file with 2 xml files!


100%|██████████| 140/140 [00:02<00:00, 52.26it/s]
  8%|▊         | 12/156 [00:00<00:02, 49.73it/s]

ignore zip file with 4 xml files!


 21%|██        | 33/156 [00:00<00:02, 53.05it/s]

ignore zip file with 2 xml files!
ignore zip file with 2 xml files!


 46%|████▌     | 72/156 [00:01<00:01, 56.22it/s]

ignore zip file with 2 xml files!
ignore zip file with 2 xml files!


 54%|█████▍    | 84/156 [00:01<00:01, 50.25it/s]

ignore zip file with 3 xml files!


 87%|████████▋ | 136/156 [00:02<00:00, 49.63it/s]

ignore zip file with 2 xml files!


100%|██████████| 156/156 [00:03<00:00, 50.13it/s]
