## Combining Data
Takes all 804 individual recording files and combines them into a single file.

In [None]:
import numpy as np
import pandas as pd
import os
import re

# Get directory paths.
ecg_path = os.path.join('Data', 'ECG_Data')
class_path = os.path.join('Data', 'Class')

# Get lists of files (regular files only, sub-directories are skipped).
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make slices such as ecg_files[3:6] and the combined row order reproducible.
ecg_files = sorted(f for f in os.listdir(ecg_path)
                   if os.path.isfile(os.path.join(ecg_path, f)))
control_files = sorted(f for f in os.listdir(class_path)
                       if os.path.isfile(os.path.join(class_path, f)))

In [None]:
# Take the first run of digits in each ECG filename as its file number.
# The pattern is a raw string so that "\d" is a regex digit class rather than
# an (invalid) string escape sequence.
file_nums = [int(re.findall(r"\d+", f)[0]) for f in ecg_files]

# Sanity check: the two counts are equal when every file number is unique.
print(len(file_nums))
print(len(np.unique(file_nums)))

In [None]:
cols = ['time', 'interval', 'beat_type', 'annotation']
df_list = []

# One record per line: time, R-R interval, beat type and annotation.
# Compiled once, outside the loop, and written as a raw string so "\d" and
# "\w" are regex classes rather than (invalid) string escape sequences.
line_pattern = re.compile(r'([\d:]+) (\d+) (\w+) (.+)')

# Trial run over three files only, keeping just (file_num, row_num) pairs.
for filename in ecg_files[3:6]:
    print(filename)
    # File number is the first run of digits in the filename.
    file_num = int(re.findall(r"\d+", filename)[0])

    # Read in each file, with each line read in as one whole string.
    # NOTE(review): with pandas' default header inference the first line of
    # each file becomes the column header rather than a data row -- confirm
    # the recordings really start with a header line.
    tmp = pd.read_table(os.path.join('Data', 'ECG_Data', filename))

    # Split every line into its captured groups (time, R-R interval, beat
    # type, annotation). Iterating the first column directly avoids the
    # deprecated positional lookup `tmp.iloc[i][0]`.
    fields = [line_pattern.findall(line)[0] for line in tmp.iloc[:, 0]]

    # Keep time and interval, drop the beat type, strip whitespace from the
    # annotation, and prepend the file and row numbers.
    new_lines = [
        (file_num, row_num, time, int(interval), annotation.strip())
        for row_num, (time, interval, _beat_type, annotation) in enumerate(fields)
    ]

    adj_new_lines = [(t[0], t[1]) for t in new_lines]

    df_list.extend(adj_new_lines)

In [None]:
# Show the collected entries whose second element (the row number) is below 30.
[entry for entry in df_list if entry[1] < 30]

In [None]:
cols = ['time', 'interval', 'beat_type', 'annotation']
df_list = []

# One record per line: time, R-R interval, beat type and annotation.
# Compiled once, outside the loop, and written as a raw string so "\d" and
# "\w" are regex classes rather than (invalid) string escape sequences.
line_pattern = re.compile(r'([\d:]+) (\d+) (\w+) (.+)')

for filename in ecg_files:
    print(filename)
    # File number is the first run of digits in the filename.
    file_num = int(re.findall(r"\d+", filename)[0])

    # Read in each file, with each line read in as one whole string.
    # NOTE(review): with pandas' default header inference the first line of
    # each file becomes the column header rather than a data row -- confirm
    # the recordings really start with a header line.
    tmp = pd.read_table(os.path.join('Data', 'ECG_Data', filename))

    # Split every line into its captured groups (time, R-R interval, beat
    # type, annotation). Iterating the first column directly avoids the
    # deprecated positional lookup `tmp.iloc[i][0]`.
    fields = [line_pattern.findall(line)[0] for line in tmp.iloc[:, 0]]

    # Keep time and interval, drop the beat type, strip whitespace from the
    # annotation, and prepend the file and row numbers.
    new_lines = [
        (file_num, row_num, time, int(interval), annotation.strip())
        for row_num, (time, interval, _beat_type, annotation) in enumerate(fields)
    ]

    df_list.extend(new_lines)

# One row per parsed line across all recordings.
df = pd.DataFrame(df_list, columns=['file_num', 'row_num', 'time', 'interval', 'annotation'])

In [None]:
# df.to_csv(dirpath+'combined_series.csv', index=False)

In [None]:
# Column names applied when reading the control files.
cols_controls = [
    'time',
    'control',
]
# df_controls = pd.read_csv(dirpath +'combined_controls.txt', delim_whitespace=True, names=cols)

In [None]:
outfilepath = os.path.join('Data', 'combined_controls.csv')

# Read every control file and tag its rows with the file number.
# NOTE(review): each iteration rebinds the module-level name `df`; while the
# append below stays commented out, only the last file's frame survives.
for file in control_files:
    df = pd.read_csv(os.path.join(class_path, file),
                     # Whitespace-delimited columns; sep=r'\s+' replaces the
                     # deprecated delim_whitespace=True with the same meaning.
                     sep=r'\s+', names=cols_controls,
                     dtype={'time': 'object', 'control': 'int8'})
    # File number is the first run of digits in the filename (raw-string
    # pattern so "\d" is not an invalid string escape sequence).
    df['file_num'] = int(re.findall(r"\d+", file)[0])

#     df.to_csv(outfilepath, mode='a', header=False, index=False)

In [None]:
# Load the combined ECG series from disk with an explicit dtype per column.
ecg_df = pd.read_csv(
    os.path.join('Data', 'combined_series.csv'),
    dtype={
        'file_num': 'int16',
        'row_num': 'uint32',
        'time': 'object',
        'interval': 'int16',
        'annotation': 'object',
    },
)

In [None]:
# Load the combined control data from disk with an explicit dtype per column.
control_df = pd.read_csv(
    os.path.join('Data', 'combined_controls.csv'),
    dtype={
        'file_num': 'int16',
        'time': 'object',
        'control': 'int8',
    },
)

In [None]:
# Display the (rows, columns) shape of the ecg_df DataFrame.
ecg_df.shape