# 데이터 로더

    데이터 로더에 dataset을 정의하려면 x y데이터가 있어야해서 Physionet 예제코드를 참고하여
    x y를 간단히 정의해주고 데이터 로더를 만든다.

### 대회 예제 코드 (helper code)

In [8]:
#!/usr/bin/env python

# Do *not* edit this script.
# These are helper functions that you can use with your code.

import os, numpy as np

# Check if a variable is a number or represents a number.
def is_number(x):
    try:
        float(x)
        return True
    except (ValueError, TypeError):
        return False

# Check if a variable is an integer or represents an integer.
def is_integer(x):
    if is_number(x):
        return float(x).is_integer()
    else:
        return False

# Check if a variable is a a finite number or represents a finite number.
def is_finite_number(x):
    if is_number(x):
        return np.isfinite(float(x))
    else:
        return False

# (Re)sort leads using the standard order of leads for the standard twelve-lead ECG.
def sort_leads(leads):
    x = ('I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6')
    leads = sorted(leads, key=lambda lead: (x.index(lead) if lead in x else len(x) + leads.index(lead)))
    return tuple(leads)

# Find header and recording files.
def find_challenge_files(data_directory):
    header_files = list()
    recording_files = list()
    for f in sorted(os.listdir(data_directory)):
        root, extension = os.path.splitext(f)
        if not root.startswith('.') and extension=='.hea':
            header_file = os.path.join(data_directory, root + '.hea')
            recording_file = os.path.join(data_directory, root + '.mat')
            if os.path.isfile(header_file) and os.path.isfile(recording_file):
                header_files.append(header_file)
                recording_files.append(recording_file)
    return header_files, recording_files

# Load header file as a string.
def load_header(header_file):
    with open(header_file, 'r') as f:
        header = f.read()
    return header

# Load recording file as an array.
def load_recording(recording_file, header=None, leads=None, key='val'):
    from scipy.io import loadmat
    recording = loadmat(recording_file)[key]
    if header and leads:
        recording = choose_leads(recording, header, leads)
    return recording

# Choose leads from the recording file.
def choose_leads(recording, header, leads):
    num_leads = len(leads)
    num_samples = np.shape(recording)[1]
    chosen_recording = np.zeros((num_leads, num_samples), recording.dtype)
    available_leads = get_leads(header)
    for i, lead in enumerate(leads):
        if lead in available_leads:
            j = available_leads.index(lead)
            chosen_recording[i, :] = recording[j, :]
    return chosen_recording

# Get recording ID.
def get_recording_id(header):
    recording_id = None
    for i, l in enumerate(header.split('\n')):
        if i==0:
            try:
                recording_id = l.split(' ')[0]
            except:
                pass
        else:
            break
    return recording_id

# Get leads from header.
def get_leads(header):
    leads = list()
    for i, l in enumerate(header.split('\n')):
        entries = l.split(' ')
        if i==0:
            num_leads = int(entries[1])
        elif i<=num_leads:
            leads.append(entries[-1])
        else:
            break
    return tuple(leads)

# Get age from header.
def get_age(header):
    age = None
    for l in header.split('\n'):
        if l.startswith('#Age'):
            try:
                age = float(l.split(': ')[1].strip())
            except:
                age = float('nan')
    return age

# Get sex from header.
def get_sex(header):
    sex = None
    for l in header.split('\n'):
        if l.startswith('#Sex'):
            try:
                sex = l.split(': ')[1].strip()
            except:
                pass
    return sex

# Get number of leads from header.
def get_num_leads(header):
    num_leads = None
    for i, l in enumerate(header.split('\n')):
        if i==0:
            try:
                num_leads = float(l.split(' ')[1])
            except:
                pass
        else:
            break
    return num_leads

# Get frequency from header.
def get_frequency(header):
    frequency = None
    for i, l in enumerate(header.split('\n')):
        if i==0:
            try:
                frequency = float(l.split(' ')[2])
            except:
                pass
        else:
            break
    return frequency

# Get number of samples from header.
def get_num_samples(header):
    num_samples = None
    for i, l in enumerate(header.split('\n')):
        if i==0:
            try:
                num_samples = float(l.split(' ')[3])
            except:
                pass
        else:
            break
    return num_samples

# Get analog-to-digital converter (ADC) gains from header.
def get_adc_gains(header, leads):
    adc_gains = np.zeros(len(leads))
    for i, l in enumerate(header.split('\n')):
        entries = l.split(' ')
        if i==0:
            num_leads = int(entries[1])
        elif i<=num_leads:
            current_lead = entries[-1]
            if current_lead in leads:
                j = leads.index(current_lead)
                try:
                    adc_gains[j] = float(entries[2].split('/')[0])
                except:
                    pass
        else:
            break
    return adc_gains

# Get baselines from header.
def get_baselines(header, leads):
    baselines = np.zeros(len(leads))
    for i, l in enumerate(header.split('\n')):
        entries = l.split(' ')
        if i==0:
            num_leads = int(entries[1])
        elif i<=num_leads:
            current_lead = entries[-1]
            if current_lead in leads:
                j = leads.index(current_lead)
                try:
                    baselines[j] = float(entries[4].split('/')[0])
                except:
                    pass
        else:
            break
    return baselines

# Get labels from header.
def get_labels(header):
    labels = list()
    for l in header.split('\n'):
        if l.startswith('#Dx'):
            try:
                entries = l.split(': ')[1].split(',')
                for entry in entries:
                    labels.append(entry.strip())
            except:
                pass
    return labels

# Save outputs from model.
def save_outputs(output_file, recording_id, classes, labels, probabilities):
    # Format the model outputs.
    recording_string = '#{}'.format(recording_id)
    class_string = ','.join(str(c) for c in classes)
    label_string = ','.join(str(l) for l in labels)
    probabilities_string = ','.join(str(p) for p in probabilities)
    output_string = recording_string + '\n' + class_string + '\n' + label_string + '\n' + probabilities_string + '\n'

    # Save the model outputs.
    with open(output_file, 'w') as f:
        f.write(output_string)

# Load outputs from model.
def load_outputs(output_file):
    with open(output_file, 'r') as f:
        for i, l in enumerate(f):
            if i==0:
                recording_id = l[1:] if len(l)>1 else None
            elif i==1:
                classes = tuple(entry.strip() for entry in l.split(','))
            elif i==2:
                labels = tuple(entry.strip() for entry in l.split(','))
            elif i==3:
                probabilities = tuple(float(entry) if is_finite_number(entry) else float('nan') for entry in l.split(','))
            else:
                break
    return recording_id, classes, labels, probabilities

#-------------------------------------------------------------------#
# team_code 발췌

def get_features(header, recording, leads):
    # Extract age.
    age = get_age(header)
    if age is None:
        age = float('nan')

    # Extract sex. Encode as 0 for female, 1 for male, and NaN for other.
    sex = get_sex(header)
    if sex in ('Female', 'female', 'F', 'f'):
        sex = 0
    elif sex in ('Male', 'male', 'M', 'm'):
        sex = 1
    else:
        sex = float('nan')

    # Reorder/reselect leads in recordings.
    recording = choose_leads(recording, header, leads)

    # Pre-process recordings.
    adc_gains = get_adc_gains(header, leads)
    baselines = get_baselines(header, leads)
    num_leads = len(leads)
    for i in range(num_leads):
        recording[i, :] = (recording[i, :] - baselines[i]) / adc_gains[i]

    # Compute the root mean square of each ECG lead signal.
    rms = np.zeros(num_leads)
    for i in range(num_leads):
        x = recording[i, :]
        rms[i] = np.sqrt(np.sum(x**2) / np.size(x))

    return age, sex, rms

### feature, label 뽑기 (team_code 참고)

In [10]:
# 대회의 평가 기준에서는 2리드, 4리드, 8리드 등, 리드 셋마다의 모델을 따로 평가 하지만 일단 데이터로더에서는 적용하지 않음.
twelve_leads = ('I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6')
data_directory = 'E:/ECG/Data/WFDB_CPSC2018_2'
header_files, recording_files = find_challenge_files(data_directory)
num_recordings = len(recording_files)

classes = set()
for header_file in header_files:
    header = load_header(header_file)
    classes |= set(get_labels(header))
if all(is_integer(x) for x in classes):
    classes = sorted(classes, key=lambda x: int(x)) # Sort classes numerically if numbers.
else:
    classes = sorted(classes) # Sort classes alphanumerically if not numbers.
num_classes = len(classes)

data = np.zeros((num_recordings, 14), dtype=np.float32) # 14 features: one feature for each lead, one feature for age, and one feature for sex
labels = np.zeros((num_recordings, num_classes), dtype=np.bool_) # One-hot encoding of classes

for i in range(num_recordings):
    print('    {}/{}...'.format(i+1, num_recordings))
    
    # Load header and recording.
    header = load_header(header_files[i])
    recording = load_recording(recording_files[i])

    # Get age, sex and root mean square of the leads.
    age, sex, rms = get_features(header, recording, twelve_leads)
    data[i, 0:12] = rms
    data[i, 12] = age
    data[i, 13] = sex

    current_labels = get_labels(header)
    for label in current_labels:
        if label in classes:
            j = classes.index(label)
            labels[i, j] = 1

    1/3453...
    2/3453...
    3/3453...
    4/3453...
    5/3453...
    6/3453...
    7/3453...
    8/3453...
    9/3453...
    10/3453...
    11/3453...
    12/3453...
    13/3453...
    14/3453...
    15/3453...
    16/3453...
    17/3453...
    18/3453...
    19/3453...
    20/3453...
    21/3453...
    22/3453...
    23/3453...
    24/3453...
    25/3453...
    26/3453...
    27/3453...
    28/3453...
    29/3453...
    30/3453...
    31/3453...
    32/3453...
    33/3453...
    34/3453...
    35/3453...
    36/3453...
    37/3453...
    38/3453...
    39/3453...
    40/3453...
    41/3453...
    42/3453...
    43/3453...
    44/3453...
    45/3453...
    46/3453...
    47/3453...
    48/3453...
    49/3453...
    50/3453...
    51/3453...
    52/3453...
    53/3453...
    54/3453...
    55/3453...
    56/3453...
    57/3453...
    58/3453...
    59/3453...
    60/3453...
    61/3453...
    62/3453...
    63/3453...
    64/3453...
    65/3453...
    66/3453...
    67/3453...
    

array([0.07874008, 0.01414214, 0.06      , 0.        , 0.        ,
       0.        , 0.08124038, 0.168523  , 0.16124515, 0.29051678,
       0.26720778, 0.15297059])

### 데이터 로더

In [22]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

class ECGDataset(Dataset):
    """ ECG dataset."""

    # Initialize your data, download, etc.
    def __init__(self):
        
        self.x_data = torch.from_numpy(data)
        self.y_data = torch.from_numpy(labels)

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return len(self.x_data)


dataset = ECGDataset()
train_loader = DataLoader(dataset=dataset,
                          batch_size=32,
                          shuffle=True,
                          num_workers=2)

In [29]:
data[0]

array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6370706e-01,
       2.5884357e-01, 1.9183326e-01, 3.7416574e-02, 0.0000000e+00,
       5.3000000e+01, 1.0000000e+00], dtype=float32)

## 모델 적용 전 고칠 것

    - 현재 data 인풋 ( 현재는 환자 1명의 각 리드의 rms(root mean square), 성별, 나이 => 14개 인풋 )
    - 평가 기준에 맞는 리드셋 모델 ( 대회의 평가 기준에서는 2리드, 4리드, 8리드 등, 리드 셋마다의 모델을 따로 평가 하지만 일단 데이터로더에서는 적용하지 않음.)
    - torch 사용자 정의 dataset() >> 데이터 로드 후 전처리?