In [28]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from datetime import datetime, time
import matplotlib.pyplot as plt
import math
import os
import sys
import re
# Absolute data locations used by the S1 pipeline below.
# `path` is passed to S1_processing as `root` (VU-AMS files).
path = '/home/ubuntu/eSportData/jupyter-data/VU_AMS/s1'
# NOTE(review): `path_out` is not referenced by any cell visible in this notebook - confirm it is still needed.
path_out = '/home/ubuntu/eSportData/jupyter-data/VU_AMS/samples'
# Directory with the beatscope files.
path_tx = '/home/ubuntu/eSportData/jupyter-data/beatscope/Pliki/S1'
# Directory with the accelerometer files.
path_acc = '/home/ubuntu/eSportData/jupyter-data/akcelerometry/S1'

In [29]:

# List handed to S1_processing as its `rej` argument.
rejected = []
# NOTE(review): `output_filename`, `rejected_list` and `outputs` are not used by any cell visible here - confirm before removing.
output_filename = "output.txt"
rejected_list = 'rejected.txt'
outputs = []
# Text file that S1_processing writes its error messages to.
output_errors_path = '/home/ubuntu/eSportData/jupyter-data/VU_AMS/error_s1.txt'

In [24]:

def extract_time_from_file(filename):
    """
    function to read time from vuams files

    The first line of the file is skipped; the second line is split on "/"
    and its second field is returned.

    input:
        filename: path to the vuams text file
    output:
        the time field as a string, or None (after printing a message) when
        the file does not exist
    raises:
        ValueError: the file has fewer than two lines, or its second line
            contains no "/"-separated time field
    """
    try:
        with open(filename, 'r') as file:
            file.readline()                # first header line is not needed
            second_row = file.readline()   # '' only when the file ended early
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
        return None

    # A truncated file used to surface as a message-less StopIteration.
    if second_row == '':
        raise ValueError(f"File '{filename}' has fewer than two lines; cannot read the time.")

    parts = second_row.strip().split("/")
    # A header without "/" used to surface as a bare IndexError.
    if len(parts) < 2:
        raise ValueError(f"File '{filename}': no '/'-separated time field in line 2: {second_row.strip()!r}")
    return str(parts[1])


def merge_vs_tx(start_markers, vuams, beat, columns, da_df, tls, keys):
    """
    function to merge vuams df with beatscope files. All files have the same number of rows after adding the time column from vuams to each beatscope file.
    input :
        start_markers: information about indexes where vuams markers start for the whole procedure
        vuams: vuams df (as a list of rows) with the first measurement merged with beatscope (to be able to freely iterate over the whole set of needed columns); modified in place
        beat: list with all beatscope recordings, each as a list of rows (same order as tls)
        columns: vuams and beatscope columns needed to assemble the dataframe
        da_df: vuams df before merging with the first beatscope
        tls: list with all beatscope dfs
        keys: dictionary of markers, keys -> vuams markers, values -> beatscope markers. All as strings
    output:
        merged vuams df with all beatscope files
    """
    # marker and corresponding beatscope index from the list of all
    marker_vms_dict = {1: 0, 15: 1, 21: 2, 25: 3, 35: 4}
    # (destination column in a vuams row, source column in a beatscope row):
    # vuams columns 7..17 take beatscope columns 1..11, vuams column -2 takes beatscope column 12
    column_pairs = [(dst, dst - 6) for dst in range(7, 18)] + [(-2, 12)]

    # skip the first marker because the first beatscope file is already added to vuams using a simple merge
    for which_index, markers in enumerate(start_markers[1:], start=1):
        marker_value = int(da_df.loc[markers, 'marker'])  # marker value found at this start index
        da_sync_value = str(marker_value)
        print(da_sync_value, 'da sync')

        # find in the given beatscope the index where the marker starts (m1, m2, etc.)
        tx_df = tls[which_index]
        tx_sync_idx_m1 = tx_df.index[tx_df['marker_tx'] == keys[da_sync_value]][0]
        print(tx_sync_idx_m1)
        rows_diff = tx_sync_idx_m1 - markers  # row difference between beatscope and vuams
        print(rows_diff)

        # Pick the beatscope recording from the marker value itself. The previous
        # implementation scanned every column of the marker row for a marker code,
        # so an unrelated value equal to 1/15/21/25/35 (for example a copied
        # beatscope measurement) redirected the copy to the wrong recording.
        beat_idx = marker_vms_dict.get(marker_value)
        if beat_idx is None:
            continue  # unknown marker: nothing to copy, as before
        source_rows = beat[beat_idx]

        # iterate only from the index where the vuams marker starts
        for row in range(markers, len(vuams)):
            source_idx = row + rows_diff
            if source_idx >= len(source_rows):
                break  # beatscope recording is shorter; the remaining rows stay untouched
            source = source_rows[source_idx]
            target = vuams[row]
            try:
                for dst, src in column_pairs:
                    target[dst] = source[src]
            except IndexError:
                # a row with fewer columns than expected is skipped (best effort)
                continue

    merged_df = pd.DataFrame(vuams, columns=columns)  # dataframe with merged information from beatscope
    return merged_df
def execute_and_store_output(filename, func, *args, **kwargs):
    """
    Run func(*args, **kwargs) while everything it prints goes to a file.

    input:
        filename: path of the text file that receives the printed output (overwritten)
        func: callable to execute
        *args, **kwargs: forwarded to func
    output:
        None; the return value of func is discarded. Exceptions raised by func propagate.
    """
    # Remember whatever stdout was active. In a notebook this is the kernel's
    # stream, not sys.__stdout__, so restoring sys.__stdout__ would silence
    # all later cell output.
    previous_stdout = sys.stdout
    # Open the file before touching sys.stdout: if open() fails, the real
    # stdout is neither replaced nor closed.
    with open(filename, 'w') as log_file:
        sys.stdout = log_file
        try:
            func(*args, **kwargs)
        finally:
            sys.stdout = previous_stdout

S1_p123 - set -9999 on channels 2, 3, 4
S1_p236 - set -9999 on channel 1
S1_p245 - set -9999 on channels 2, 3, 4

In [30]:

def S1_processing(root, path_tx, path_acc, rej, output_errors, id,
                  output_dir='/home/ubuntu/eSportData/jupyter-data/VU_AMS/s1_output'):
    """
    main pipeline: build one csv per person from vuams, beatscope and accelerometer files
    input:
        root: Main path to vuams files
        path_tx: path to beatscope files
        path_acc: path to accelerometers
        rej: list that receives the id (as a string) of every person who could not be processed
        output_errors: path of the text file the error messages are appended to
        id: identifier of the person whose files are being processed
        output_dir: directory where S1_p<id>.csv is written (defaults to the previously hard-coded location)
    output:
        output_errors (the error-log path). The processed dataset is saved as csv;
        on failure the error is logged and the id is added to rej.
    """
    ids = str(id)  # identifier of the person whose files are being processed
    pad_rows = 180000  # number of all-None rows appended after every vuams signal
    # marker occurrence location - 4 minutes, because for some reason for baseline
    # the markers did not appear at the beginning but after 4 minutes
    baseline_shift = 240000

    def empty_rows(signal):
        # block of empty rows with the same columns as a vuams signal file
        return pd.DataFrame({name: [None] * pad_rows
                             for name in ['index', signal, 'marker1', 'marker2', 'dummy']})

    def read_vuams(file_name, signal):
        # read one vuams signal file located in root
        return pd.read_csv(os.path.join(root, file_name), sep=" ", header=None,
                           names=['index', signal, 'marker1', 'marker2', 'dummy'], skiprows=3)

    # Append mode: the function is called once per person with the same log
    # path, and 'w' erased the errors of every previously processed person.
    with open(output_errors, 'a') as error_log:
        try:
            bs_columns = ['timestamp', 'SBP', 'DBP', 'MBP', 'HR', 'SV', 'LVET', 'PI', 'MS', 'CO', 'TPR', 'TPRCGS', 'marker_tx', 'dummy']
            da_bs_columns = ['timestamp_x', 'ECG','DZ' ,'DZDT','Z0', 'marker', 'timestamp_y', 'SBP', 'DBP', 'MBP', 'HR', 'SV', 'LVET', 'PI', 'MS', 'CO', 'TPR', 'TPRCGS', 'marker_tx','dummy']
            col_names = ['timestamp','ECG', 'DZ', 'DZDT', 'Z0', 'marker']
            markers_keys = {'1':'m0', '15':'m1', '21':'m2', '25': 'm3', '35':'m4'}  # vuams|beatscope marker dictionary for the first procedure saved as strings
            print(ids, '-person number')

            file_names = []
            extended_txs = []

            # list of all files from paths (vuams first, then beatscope, then accelerometers)
            for directory in (root, path_tx, path_acc):
                for filename in os.listdir(directory):
                    if os.path.isfile(os.path.join(directory, filename)):
                        file_names.append(filename)

            # create a pattern based on the identifier to include only files with relevant numbers in the list
            pattern = fr'(?<![0-9m]){re.escape(ids)}(?![0-9])'
            subject_id = [file_name for file_name in file_names if re.search(pattern, file_name)]
            # patterns for vuams and acc files from files selected for a given person based on id containing appropriate keywords from file names
            pattern_DZ = r'DZ[^A-Za-z]'
            pattern_DZDT = r'DZDT'
            pattern_ECG = r'ECG'
            pattern_Z0 = r'Z0'
            pattern_acc = fr'_{re.escape(ids)}'

            # merging DA
            for file_ in subject_id:
                # Creating dataframes from all vuams files
                if re.search(pattern_DZ, file_):
                    df_dz = pd.concat([read_vuams(file_, 'DZ'), empty_rows('DZ')], ignore_index=True)

                elif re.search(pattern_DZDT, file_):
                    df_dzdt = pd.concat([read_vuams(file_, 'DZDT'), empty_rows('DZDT')], ignore_index=True)

                elif re.search(pattern_ECG, file_):
                    file_path_ecg = os.path.join(root, file_)
                    # reading time from the ECG file using a function
                    # (named start_time so the imported datetime.time is not shadowed)
                    start_time = extract_time_from_file(file_path_ecg)
                    print(start_time)
                    df_ecg = pd.concat([read_vuams(file_, 'ECG'), empty_rows('ECG')], ignore_index=True)

                elif re.search(pattern_Z0, file_):
                    # also extend Z0 to 1000ms from 250
                    df_z0 = read_vuams(file_, 'Z0')
                    df_z0 = pd.concat([pd.DataFrame({'index': [df_z0['index'].min() - 1], 'Z0': [df_z0['Z0'].iloc[0]], 'marker1': [df_z0['marker1'].iloc[0]]}), df_z0])
                    new_index = range(df_z0['index'].min(), df_z0['index'].max() + 1)
                    df_z0 = df_z0.set_index('index').reindex(new_index)
                    df_z0 = df_z0.ffill()
                    df_z0 = df_z0.reset_index()
                    df_z0 = pd.concat([df_z0, empty_rows('Z0')], ignore_index=True)

                elif re.search(pattern_acc, file_):
                    # the last matching file wins; accelerometer files are listed last
                    acc_name = file_

            ecg_col = df_ecg['ECG']
            dz_col = df_dz['DZ']
            dzdt_col = df_dzdt['DZDT']
            z0_col = df_z0['Z0']
            marker = df_ecg['marker2']
            min_length = len(df_z0)
            df_ecg = df_ecg.head(min_length)
            df_dz = df_dz.head(min_length)
            df_dzdt = df_dzdt.head(min_length)
            data_dict = {'ECG': ecg_col, 'DZ': dz_col, 'DZDT': dzdt_col, 'Z0': z0_col, 'marker': marker}
            da = pd.DataFrame(data_dict, index=df_dzdt.index, columns=col_names)
            da.iloc[0, 0] = start_time  # attached read time to an empty column in the first position
            da['timestamp'] = pd.to_datetime(da['timestamp'], format='%H:%M:%S.%f')

            first_valid_index = 0

            not_null_time = da.loc[first_valid_index, 'timestamp']
            da_list = da.values.tolist()
            # iterate through the entire length of the set attaching a time 1 ms greater than the previous one,
            # starting from the attached initial time; the operation is performed on the list instead of df to be faster
            for i in range(first_valid_index + 1, len(da_list)):
                not_null_time += timedelta(milliseconds=1)
                da_list[i][0] = not_null_time

            da_full = pd.DataFrame(da_list, columns=col_names)
            da_full['timestamp'] = da_full['timestamp'].dt.time

            txs = [tx for tx in subject_id if re.search(r'_m', tx)]  # beatscope files
            txs.sort()  # sort them in a list so that I can iterate freely through them later when I attach them to the vuams df
            for tx in txs:  # reading and extending to 1000ms immediately
                file_path_tx = os.path.join(path_tx, tx)
                tx_df = pd.read_csv(file_path_tx, sep=';', header=None, skiprows=10, encoding='cp1250', decimal=',', names=bs_columns)

                tx_df['timestamp'] = pd.to_datetime(tx_df['timestamp'], format='%H:%M:%S,%f')

                tx_df = tx_df.set_index('timestamp')
                tx_df = tx_df.resample('ms').ffill()
                tx_df = tx_df.reset_index()
                tx_df['timestamp'] = tx_df['timestamp'].dt.time
                extended_txs.append(tx_df)

                # add "end" label in the last row of each beatscope to know where to cut if necessary
                extended_txs[-1].iloc[extended_txs[-1].index.max(), -2] = 'End'

            da_sync_idx2 = da_full.index[da_full['marker'] != -9999][0]  # marker start location (incorrect)
            da_sync_idx = da_sync_idx2 - baseline_shift  # corrected start: 4 minutes earlier
            # fill the empty space where the markers should be; one slice assignment
            # instead of 240000 single-cell writes. The start is clamped at 0 so a
            # negative position cannot wrap around into the padded tail rows.
            da_full.iloc[max(da_sync_idx, 0):da_sync_idx2, 5] = 1

            garry = extended_txs[0]
            tx_sync_idx = garry.index[garry['marker_tx'].notna()][0]  # find the marker from the first beatscope

            assert garry.at[tx_sync_idx, 'marker_tx'].startswith('m')

            rows_diff = tx_sync_idx - da_sync_idx

            if rows_diff > 0:
                garry = garry.iloc[rows_diff:]
                garry = garry.reset_index(drop=True)
            else:
                da_full = da_full.iloc[-rows_diff:]  # trim the vuams df so that the vuams marker appears in the same place as the beatscope marker
                da_full = da_full.reset_index(drop=True)
            garry = garry.rename(columns={'timestamp': 'timestamp_y'})
            da_bc = da_full.merge(garry, how='left', left_index=True, right_index=True)  # merge vuams with beatscope
            time_col = da_full['timestamp']  # extract the time column from vuams

            trials = [pd.merge(time_col, tx, on='timestamp', how='left') for tx in extended_txs]  # new list with beatscopes, with attached times from vuams
            txs_list = [trial.values.tolist() for trial in trials]
            da_bc_list = da_bc.values.tolist()

            # first row of every marker; a missing marker raises IndexError and rejects the person
            start_markers = [da_bc.index[da_bc['marker'] == code][0] for code in (1, 15, 21, 25, 35)]

            # assign a new dataframe from the merged vuams with beatscope function call
            df_tx = merge_vs_tx(start_markers, da_bc_list, txs_list, da_bs_columns, da_bc, trials, markers_keys)

            try:
                # processing accelerometers
                file_path_acc = os.path.join(path_acc, acc_name)
                acc = pd.read_csv(file_path_acc, header=None, sep=';', encoding='cp1250', decimal=',', names=['time', 'wr', 'tl', 'tr', '1', '2', '3'], usecols=['time', 'wr', 'tl', 'tr'], skiprows=11)
                print(file_path_acc, 'acc')

                acc = acc.iloc[:-4]
                # extending acc from 1000ms
                acc['time'] = acc['time'].astype(str)
                acc['time'] = pd.to_datetime(acc['time'])

                acc['timestamp'] = acc['time'].apply(lambda x: x + timedelta(microseconds=1000))
                # NOTE(review): the last four rows are dropped a second time here, exactly as before - confirm this is intended
                acc = acc.iloc[:-4]
                acc['timestamp'] = pd.to_datetime(acc['timestamp'], format='%H:%M:%S.%f')
                acc = acc.set_index('timestamp')
                acc = acc.resample('ms').ffill()
                acc = acc.reset_index()
                acc['timestamp'] = acc['timestamp'].dt.time
                acc = acc.drop(columns=['time'])
                print(acc.iloc[-1, 0])
                print(df_tx.iloc[-1, 0])
                acc_sync_value = df_tx.iloc[0, 0]  # trim the acc df to the dimensions of the merged vuams and beatscope df
                acc_sync_idx = acc['timestamp'].index[acc['timestamp'] == acc_sync_value][0]
                print('main path')
                try:  # for situations where for some reason the acc lasted shorter than the vuams
                    acc_last_value = df_tx.iloc[-1, 0]
                    acc_last_idx = acc['timestamp'].index[acc['timestamp'] == acc_last_value][0]
                    acc = acc.iloc[acc_sync_idx: acc_last_idx]
                    acc = acc.reset_index(drop=True)
                except Exception:
                    pass  # keep the untrimmed accelerometer data (best effort)

                # Work on new names so that df_tx is still intact if anything
                # below fails and the fallback branch has to start from it.
                df_main = df_tx.drop(columns=['timestamp_y'])
                df_main = df_main.rename(columns={'timestamp_x': 'timestamp'})
                df_all = df_main.merge(acc, how='left', on='timestamp')  # merge acc with vuams and beatscope and have the whole dataset
                df_all['marker_y'] = df_all['marker']
                df_all = df_all.drop(columns=['dummy', 'marker', 'marker_tx'])  # drop all unnecessary columns and rename them
                df_all = df_all.rename(columns={'marker_y': 'marker'})
            except Exception:
                # no usable accelerometer data: fill the three acc columns with -9999
                print('alter path')
                df_tx['wr'] = -9999
                df_tx['tl'] = -9999
                df_tx['tr'] = -9999
                df_tx = df_tx.drop(columns=['timestamp_y', 'dummy', 'marker_tx'])
                df_tx = df_tx.rename(columns={'timestamp_x': 'timestamp', 'marker': 'marker_old'})
                df_tx['marker'] = df_tx['marker_old']  # moves the marker column to the end, as in the main path
                df_tx = df_tx.drop(columns=['marker_old'])
                df_all = df_tx

            df_all['marker'] = df_all['marker'].fillna(-9999)
            df_all = df_all.iloc[:-2]
            print(df_all.iloc[0, -2])
            print(df_all.iloc[-1, -2])
            print('file completed')

            df_all.to_csv(os.path.join(output_dir, f'S1_p{ids}.csv'), index=False)  # save using id

        # if a person cannot be processed, log the reason and add id to the rejected list
        except Exception as e:
            error_message = f"Error processing ID {ids}: {str(e)}"
            print(error_message, file=error_log)
            if rej is not None:
                rej.append(ids)

    return output_errors
# Ids of the people to run through the S1 pipeline in this cell.
repair = [232, 235] 
for i in repair:
    # S1_processing returns the error-log path it was given, so error_file
    # ends up equal to output_errors_path after every iteration.
    error_file = S1_processing(path, path_tx, path_acc, rejected, output_errors_path, i)



232 -numer osoby


  df_dz = pd.concat([df_dz, empty_rows_dz], ignore_index=True)
  df_z0 = pd.concat([df_z0, empty_rows_z0], ignore_index=True)
  df_dzdt = pd.concat([df_dzdt, empty_rows_dzdt], ignore_index=True)


08:55:50.500


  df_ecg = pd.concat([df_ecg, empty_rows_ecg], ignore_index=True)


check
1900-01-01 09:09:15.141000
1900-01-01 09:23:20.068000
1900-01-01 10:23:04.338000
1900-01-01 10:29:21.470000
1900-01-01 10:37:45.296000
check2
5
5 długość start markers
15 hjeolo
910740
51186
21 hjeolo
4486732
51734
25 hjeolo
4862846
52029
35 hjeolo
5368022
51662
check 3
/home/ubuntu/eSportData/jupyter-data/akcelerometry/S1/S1_232.csv acc


  acc['time'] = pd.to_datetime(acc['time'])


10:40:24.001000
10:41:29.285000
main path
0.0
nan
check4
done
235 -numer osoby
16:41:26.500


  df_ecg = pd.concat([df_ecg, empty_rows_ecg], ignore_index=True)
  df_z0 = pd.concat([df_z0, empty_rows_z0], ignore_index=True)
  df_dz = pd.concat([df_dz, empty_rows_dz], ignore_index=True)
  df_dzdt = pd.concat([df_dzdt, empty_rows_dzdt], ignore_index=True)


check
1900-01-01 16:50:00.835000
1900-01-01 17:03:59.344000
1900-01-01 17:41:26.590000
1900-01-01 17:48:23.778000
1900-01-01 17:57:07.721000
check2
5
5 długość start markers
15 hjeolo
942691
50859
21 hjeolo
3193638
51325
25 hjeolo
3592642
50872
35 hjeolo
4131545
50993
check 3
/home/ubuntu/eSportData/jupyter-data/akcelerometry/S1/S1_235.csv acc


  acc['time'] = pd.to_datetime(acc['time'])


18:13:52.001000
18:00:52.517000
main path
0.0
0.0
check4
done


In [None]:
# Run the shell command `free -m` through the IPython %system magic.
%system free -m