# Extract Transform Load
This file downloads the D1NAMO dataset, extracts it and processes the dataset to extract the data we need and saves it a csv file.
The dataset is ~10GB download and is ~65 GB uncompressed. Please choose a disk with over 100 GB space to allow for other files if processing on the cloud

### Download the dataset
Uses Linux wget command to download a remote resource

In [14]:
#!wget https://zenodo.org/record/1421616/files/D1NAMO.tgz

### Extract the dataset

In [15]:
# Extract the dataset - WARNING takes a long time
# import tarfile
# tar = tarfile.open('D1NAMO.tgz', "r:gz")
# tar.extractall()
# tar.close()

In [16]:
#view size of the database 
# !du -sh D1NAMO

In [17]:
# Get ECG files from D1NAMO dataset
import os
ECG_files = []
for subdir, dirs, files in os.walk('D1NAMO'):
    for file in files:
        #print os.path.join(subdir, file)
        filepath = subdir + os.sep + file

        if filepath.endswith("ECG.csv"):
            ECG_files.append(filepath)
            
print('Number of ECG files:' , len(ECG_files))

Number of ECG files: 131


### Processing ECG files
Runs a script to loop through the database directory and subdirectories and finds files ending with `ECG.csv`. Then reads a minute of ECG data and processes it and creates a pandas dataframe. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks, argrelmin
from scipy.interpolate import interp1d
#import the function to conver ecg signals to the parameters we are inputing to the model
# from ipynb.fs.full.processing_function import signal_to_params

In [19]:
def signal_to_params(filename,sampling_freq=250,n=15, duration=1):
    filename = filename
    bvp_signal = pd.read_csv(filename);
    #remove first two rows
    bvp_array = bvp_signal.iloc[2:];
    #get row data of a minute starting from 3rd minute allow time to stabilize
    minute_length = int(sampling_freq * 60)
    bvp_numpy = bvp_array[(n)*minute_length:(n+duration)*minute_length].to_numpy()
    #convert to 1d array
    bvp_numpy_one_d_array = bvp_numpy[:,1]
    baseline = np.quantile(bvp_numpy_one_d_array, 0.75)
    peaks, _ = find_peaks(bvp_numpy_one_d_array, height = baseline, distance = 100)


    #distance of peaks
    peak_distance = np.diff(peaks)

    #Mean R-R interval in seconds
    average_peak_distance = sum(peak_distance)/len(peak_distance)/sampling_freq # Mean RR
    peak_distance_std = np.std(peak_distance); #SDNN
    peak_distance_rms =  np.sqrt(np.mean(peak_distance**2)) #RMSSD

    #creat array of hr and hrv
    heart_rate_array = [];
    heart_rate_variability_array = [];
    nn_50 = 0
    # #
    # print("peak")
    # print(len(peak_distance))
    # print(peak_distance)
    # #

    for x in range(1, len(peak_distance)):
        #heart rate
        heart_rate = 60 * sampling_freq / peak_distance[x-1];
        heart_rate_array.append(heart_rate);

        #heart rate variability N-N intervals
        heart_rate_variability = abs((peak_distance[x] - peak_distance[x-1]) * 1000 / sampling_freq);
        heart_rate_variability_array.append(int(heart_rate_variability));

        #no. of hrv intervals differ more than 50ms
        if (heart_rate_variability > 50):
            nn_50 += 1;

    #mean and std - heart rate - BPM
    heart_rate_mean = np.mean(heart_rate_array);
    heart_rate_std = np.std(heart_rate_array);



    #mean and std - heart rate variability per minute
    heart_rate_variability_mean = np.mean(heart_rate_variability_array)
    heart_rate_variability_std = np.std(heart_rate_variability_array)


    #no. and percentage of hrv intervals differ more than 50ms
    pNN_50 = nn_50 / len(peak_distance);

    #Triangular interpolation index
    peaks_min = argrelmin(bvp_numpy_one_d_array)

    min_row = [];
    dif_array = [];
    dif = 36;

    for x in range(1, len(peaks_min[0])):
        if (bvp_numpy_one_d_array[peaks_min[0][x-1]] > 0):
            dif = abs(peaks_min[0][x-1] - peaks_min[0][x-2]);
            if (dif > 50): #control threshold for TINN
                min_row.append(peaks_min[0][x-1]);
                dif = dif * 1000 / sampling_freq;
                dif_array.append(dif);


    TINN = np.mean(dif_array);

    sum_hrv_square= 0;
    #rmsHRV
    for x in range(1, len(heart_rate_variability_array)):
        sum_hrv_square = (heart_rate_variability_array[x-1] ** 2) + sum_hrv_square;

    average_hrv_square = sum_hrv_square / len(heart_rate_variability_array);
    rmsHRV = np.sqrt(average_hrv_square);

    intergral_rr = 0
    [freq_RR , interval] = np.histogram(peak_distance, bins=20) #heruristic to set

    for x in range (1,len(freq_RR)):
        intergral_rr = intergral_rr + freq_RR[x-1]*(interval[x]-interval[x-1])
    triangular_index = intergral_rr/max(freq_RR)
    
    parameters = {'nn_50':nn_50,'pNN_50': pNN_50, 'meanHR': heart_rate_mean, 'SDHR': heart_rate_std, 'meanRR':average_peak_distance,'SDNN': peak_distance_std,'RMSSD': peak_distance_rms, 'TINN':TINN, 'HRVTriIndex':triangular_index}
    return parameters

In [20]:
df_cols = ['filename','SDNN','RMSSD','nn_50','pNN_50', 'meanHR', 'SDHR', 'meanRR','TINN','HRVTriIndex']
df = pd.DataFrame(columns = df_cols)

In [21]:
for index, file in enumerate(ECG_files):
#     print (file, index)
    params = signal_to_params(file, 250, 15)
    params['filename']=file
    df = df.append([params], ignore_index = True)
df

Unnamed: 0,filename,SDNN,RMSSD,nn_50,pNN_50,meanHR,SDHR,meanRR,TINN,HRVTriIndex
0,D1NAMO/diabetes_subset/001/sensor_data/2014_10...,215.209703,322.444277,51,0.836066,86.661102,34.711019,0.960459,1315.211268,129.529412
1,D1NAMO/diabetes_subset/001/sensor_data/2014_10...,95.970977,215.556122,43,0.558442,89.377405,28.844218,0.772052,999.878788,71.082353


In [22]:
df.to_csv('processed_data.csv', index= False)

## Model Creation

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('processed_data.csv')
def labelDataset(row):
#     label the dataset 1- diabetic , 0- non-diabetic
    if row['filename'][7:15] =="diabetes":
        return  1
    else:
        return 0
dataset['label'] = dataset.apply (lambda row: labelDataset(row), axis=1)
dataset = dataset.dropna()
dataset.head()

Unnamed: 0,filename,SDNN,RMSSD,nn_50,pNN_50,meanHR,SDHR,meanRR,TINN,HRVTriIndex,label
0,D1NAMO/diabetes_subset/001/sensor_data/2014_10...,215.209703,322.444277,51.0,0.836066,86.661102,34.711019,0.960459,1315.211268,129.529412,1
1,D1NAMO/diabetes_subset/001/sensor_data/2014_10...,95.970977,215.556122,43.0,0.558442,89.377405,28.844218,0.772052,999.878788,71.082353,1
2,D1NAMO/diabetes_subset/001/sensor_data/2014_10...,39.235031,179.952804,41.0,0.482353,90.749579,24.051391,0.702494,1019.521739,33.2,1
3,D1NAMO/diabetes_subset/001/sensor_data/2014_10...,669.571593,781.183077,26.0,0.787879,105.488401,46.878402,1.609576,1098.29703,144.704,1
4,D1NAMO/diabetes_subset/009/sensor_data/2014_10...,403.168203,532.586707,30.0,0.75,85.175555,50.759611,1.392,1249.022222,187.454348,1


In [27]:
from scipy import stats
z_scores = stats.zscore(dataset.drop('filename', axis=1))
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3.5).all(axis=1)
dataset = dataset[filtered_entries]

In [28]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dataset, test_size=0.2)
train.head()

Unnamed: 0,filename,SDNN,RMSSD,nn_50,pNN_50,meanHR,SDHR,meanRR,TINN,HRVTriIndex,label
23,D1NAMO/diabetes_subset/002/sensor_data/2014_10...,16.402032,140.001502,12.0,0.11215,108.975068,9.261988,0.55615,1283.266667,10.047414,1
114,D1NAMO/healthy_subset/003/sensor_data/2014_10_...,133.853248,265.436216,55.0,0.846154,82.191084,33.908568,0.916862,1288.157895,116.97,0
124,D1NAMO/healthy_subset/016/sensor_data/2014_10_...,351.045266,509.158557,36.0,0.923077,67.391692,38.14999,1.475179,1362.666667,227.05,0
11,D1NAMO/diabetes_subset/007/sensor_data/2014_10...,22.181699,212.333229,15.0,0.214286,72.114855,10.446695,0.844686,1052.294737,28.558333,1
6,D1NAMO/diabetes_subset/009/sensor_data/2014_10...,670.544096,767.555685,25.0,0.675676,108.779267,46.989341,1.494054,1012.769231,194.721429,1


In [5]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.2.0


In [29]:
model = keras.Sequential([
    keras.layers.InputLayer(input_shape=(9)),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(1, activation='relu')
])

In [30]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 20)                200       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 21        
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


In [31]:
train_data = train.drop(['label','filename'], axis=1)
test_data = test.drop(['label','filename'], axis=1)
train_stats = train_data.describe()
train_stats = train_stats.transpose()
print(type(train_stats))
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_data)
normed_test_data = norm(test_data)

<class 'pandas.core.frame.DataFrame'>


In [32]:
model.fit(normed_train_data, train['label'], epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x20dec22ef10>

In [33]:
test_loss, test_acc = model.evaluate(normed_test_data,  test['label'], verbose=2)
print('\nTest accuracy:', test_acc)

1/1 - 0s - loss: 0.6713 - accuracy: 0.8077

Test accuracy: 0.807692289352417


In [35]:
model.save("D1NAMO.h5")
print("Saved model to disk")

Saved model to disk
