In [None]:
##
# This jupyter notebook parses ERG session csv files

In [146]:
import csv
import glob
import os
import re
import string

import numpy as np
import pandas as pd


In [194]:
## Constants
# Edit these

# Directory containing your input CSV files
DATA_INPUT_DIR = '/users/antoniae/Downloads/ERG_CSVfiles'
DATA_OUTPUT_NAME = 'erg_table.csv'

# Helper method to generate CSV cell locations ('A4') for each section and step
def csv_locs_step(key_row, step_row, stim_key_row, stim_step_row):
    return [
        { "label": f'A{key_row}', "value": f'A{step_row}' },                     # Step #
        { "label": f'H{stim_key_row}', "value": f'H{stim_step_row}' },           # StimFreq
        *[                                                                       # Step Data (a [ms] ... Avgs)
            { "label": f'{key_col}{key_row}', "value": f'{key_col}{step_row}' }
            for key_col in ["B", "C", "D", "E", "F"]
        ]
        
    ]

# List of CSV locations to parse, as pairs (location of the data label, and the location of the data value)
CSV_LOCS = [
    { "label": "A3", "value": "A4" }, # Patient
    { "label": "E3", "value": "E4" }, # TestDate
    *csv_locs_step(key_row=18, step_row=19, stim_key_row=6, stim_step_row=7),
    *csv_locs_step(key_row=18, step_row=20, stim_key_row=6, stim_step_row=8),
    *csv_locs_step(key_row=37, step_row=38, stim_key_row=6, stim_step_row=22),
    *csv_locs_step(key_row=37, step_row=29, stim_key_row=6, stim_step_row=23),
]


In [211]:
def load_filepaths_in_directory(directory):
    '''
        Search for and return all files (filepaths) in a directory
    '''
    print(f'Loading filepaths from "{directory}"')
    # Check that the directory exists
    exists = os.path.exists(directory)
    if not exists:
        print(f'Directory "${directory}" does not exist')
        return []

    else:
        # Gather all filepaths in this directory
        paths = glob.glob(os.path.join(directory, "*"))
        filepaths = []

        for path in paths:
            # Check that the filepath is for a file, and skip if directory
            if not os.path.isfile(path):
                continue

            print(" - " + path)
            filepaths.append(path)

        return filepaths


filepaths = load_filepaths_in_directory(DATA_INPUT_DIR)

Loading filepaths from "/users/antoniae/Downloads/ERG_CSVfiles"
 - /users/antoniae/Downloads/ERG_CSVfiles/V2989__20221103_1122.csv
 - /users/antoniae/Downloads/ERG_CSVfiles/V2832__20221101_1115.csv
 - /users/antoniae/Downloads/ERG_CSVfiles/V3001__20221103_1047.csv
 - /users/antoniae/Downloads/ERG_CSVfiles/V2996__20221102_1417.csv
 - /users/antoniae/Downloads/ERG_CSVfiles/erg_table.csv
 - /users/antoniae/Downloads/ERG_CSVfiles/V2993__20221102_1456.csv
 - /users/antoniae/Downloads/ERG_CSVfiles/V2996__20221102_1446.csv


In [216]:
loc_regex = '^([A-Za-z]+)([0-9]+)$'
def csv_location_to_index(loc):
    '''
        Convert a CSV cell location (e.g. 'A4') to row, col indeces (e.g. (3, 0))
    '''
    match = re.search(loc_regex, loc)

    row = int(match.group(2)) - 1
    col = ord(match.group(1).lower()) - 97
    return (row, col)


def load_erg_csv(filepath):
    '''
        Load a csv file and convert to np array of cells
    '''
    print(f'- Reading data from "{filepath}"')
    max_n_cols = 0
    lines = []
    mat = []

    # Open the CSV file at 'filepath', parse it with encoding "latin-1"
    #  (to handle additional character such as 'mu', etc)
    with open(filepath, 'r', encoding="latin-1") as csvfile:
        reader = csv.reader(csvfile)
        
        # Read line by line
        for row in reader:
            if len(row) > max_n_cols:
                max_n_cols = len(row)
            lines.append(row)
        
    # Convert lines to 2d np array for easier parsing
    for line in lines:
        line = np.array(line)
        line.resize((max_n_cols,))
        mat.append(line)

    mat = np.array(mat)
    return mat

def parse_erg_csv(mat):
    '''
        Parse out all entries in 'CSV_LOCS' from a CSV given as a np 2D array
    '''
    entries = []
    for data_entry in CSV_LOCS:
        label_loc = data_entry["label"]
        label_i = csv_location_to_index(label_loc)

        if (label_i[0] > mat.shape[0]):
            print(f'  - Could not parse csv')
            return []

        label = mat[label_i]
        
        value_loc = data_entry["value"]
        value_i = csv_location_to_index(value_loc)
        value = mat[value_i]
        
        entries.append({ "label": label, "value": value })
    return entries


def parse_erg_csvs(filepaths):
    '''
        
    '''
    print(f'Parsing data from files...')
    
    # Read each CSV, extracting relevant cells to list of (label, value) pairs
    files_data = []
    for filepath in filepaths:
        file_mat = load_erg_csv(filepath)
        file_parsed = parse_erg_csv(file_mat)
        if (len(file_parsed) > 0):
            files_data.append(file_parsed)

    
    # Create a Pandas CSV
    # Assume the labels in the first CSV are the same as the other CSVs,
    #  and use them as column headers for the output table
    files_keys = [[entry["label"] for entry in file_data] for file_data in files_data]
    cols = files_keys[0]

    df = pd.DataFrame(columns=cols)
    for file_data in files_data:
        vals = [entry["value"] for entry in file_data]
        row = pd.DataFrame([vals], columns=cols)
        df = pd.concat([df, row ], ignore_index=True)
    
    df.set_index("Patient")
    
    # Do some parsing for certain fields
    #  Remove ':' from "Step" cells
    df.loc[:, "Step"] = df.loc[:, "Step"].replace(to_replace=r':', value='', regex=True)

    print(df)
    return df


df = parse_erg_csvs(filepaths)


Parsing data from files...
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/V2989__20221103_1122.csv"
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/V2832__20221101_1115.csv"
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/V3001__20221103_1047.csv"
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/V2996__20221102_1417.csv"
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/erg_table.csv"
  - Could not parse csv
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/V2993__20221102_1456.csv"
- Reading data from "/users/antoniae/Downloads/ERG_CSVfiles/V2996__20221102_1446.csv"
  Patient               TestDate Step StimFreq a [ms] b [ms] a-wave [µV]  \
0   V2989  11/3/2022 11:22:25 AM   1     1.299   13.2   29.8       11.16   
1   V2832  11/1/2022 11:15:52 AM   1     1.299   10.7   25.4        5.26   
2   V3001  11/3/2022 10:47:23 AM   1     1.299   12.7   29.8       12.35   
3   V2996   11/2/2022 2:17:24 PM   1     1.299        

In [217]:
def save_data(df):
    filepath = os.path.join(DATA_INPUT_DIR, DATA_OUTPUT_NAME)
    print(f'Saving data to "{filepath}"')
    df.to_csv(filepath)
    
save_data(df)

Saving data to "/users/antoniae/Downloads/ERG_CSVfiles/erg_table.csv"
