<a href="https://colab.research.google.com/github/sandeepaamcp/ECG_to_heart_rate_derivation/blob/dev/Extracting__heart_rate_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#pip install wfdb

In [0]:
import matplotlib.pyplot as plt
import wfdb
from wfdb import processing
import numpy as np
import linecache
import os, fnmatch
import time

# Mount Google Drive (Colab only) so the paths under /content/drive used by
# the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def write_data(output_x_file_path, output_y_file_path, patient_data_file_path, heart_rate_values):
  """Write one record's heart-rate values and its diagnosis label to text files.

  Args:
    output_x_file_path: File that receives the heart-rate values, one per
      line, formatted with three decimals.
    output_y_file_path: File that receives the single integer class label.
    patient_data_file_path: Record path without extension; ".hea" is appended
      to locate the header file that is scanned for the
      "Reason for admission:" line.
    heart_rate_values: Array-like of numeric heart-rate values.

  The label is the index of the first entry of `categories` that contains the
  admission reason as a substring. It is -1 when the header has no usable
  reason or when no category matches. The label is also printed.

  Raises:
    OSError: if the header file cannot be opened or an output file cannot be
      written.
  """
  with open(output_x_file_path, "w+") as output_x_file:
    for _, value in np.ndenumerate(heart_rate_values):
      output_x_file.write("%.3f\n" % value)

  patient_summary_file_path = patient_data_file_path + ".hea"

  lookup = 'Reason for admission:'
  reason_line = ""
  with open(patient_summary_file_path) as header_file:
    for line in header_file:
      if lookup in line:
        # No break: if several lines match, the last one is used.
        reason_line = line.rstrip()

  # Text after the first ": ". This is an empty string when the line is
  # missing or carries no value, instead of raising IndexError.
  reason_for_admission = reason_line.partition(": ")[2]

  categories = ["Myocardial infarction", "Cardiomyopathy", "Bundle branch block", "Dysrhythmia",
                "Myocardial hypertrophy", "Valvular heart disease", "Myocarditis", "Miscellaneous",
                "Healthy controls"]

  label = -1
  # An empty string is a substring of every category, so it must not be matched.
  if reason_for_admission:
    for idx, disease in enumerate(categories):
      if reason_for_admission in disease:
        label = idx
        break

  print(label)
  # Write the same value that is printed; unmatched reasons are stored as -1
  # rather than as the index of the last category.
  with open(output_y_file_path, "w+") as output_y_file:
    output_y_file.write("%d\n" % label)

In [0]:
def derive_heart_rate_from_ECG(ECG_input_data_file_path, sample_step=1000):
  """Derive a down-sampled heart-rate series from the first channel of a record.

  Args:
    ECG_input_data_file_path: Record path (without extension) passed to
      `wfdb.rdrecord`.
    sample_step: Keep every `sample_step`-th valid heart-rate value. The
      default of 1000 matches the stride this notebook has always used.

  Returns:
    A 1-D float numpy array holding elements 0, sample_step, 2*sample_step, ...
    of the heart-rate series after NaN entries have been removed. The array is
    empty when there are no valid values.
  """
  record = wfdb.rdrecord(ECG_input_data_file_path, sampfrom=0, channels=[0])
  signal = record.p_signal[:, 0]

  # Use the gqrs algorithm to detect qrs locations in the first channel
  qrs_inds = processing.gqrs_detect(sig=signal, fs=record.fs)

  # Correct the peaks shifting them to local maxima.
  # Use the maximum possible bpm as the search radius.
  max_bpm = 230
  search_radius = int(record.fs * 60 / max_bpm)
  corrected_peak_inds = processing.correct_peaks(signal, peak_inds=qrs_inds,
                                                search_radius=search_radius, smooth_window_size=150)

  hrs = processing.compute_hr(sig_len=record.p_signal.shape[0], qrs_inds=sorted(corrected_peak_inds), fs=record.fs)

  # Drop the NaN entries, then keep every sample_step-th remaining value.
  hrs = np.asarray(hrs, dtype=float)
  heart_rate_raw = hrs[~np.isnan(hrs)]

  # Slicing yields exactly ceil(n / sample_step) values. The earlier
  # pre-allocated buffer had n // 1000 + 1 slots, which left a spurious
  # trailing 0.0 whenever n was a multiple of 1000 (including n == 0).
  return heart_rate_raw[::sample_step]


In [0]:
#SERIAL LOOP
# Process patient directories one after another and time the whole run.
start_time = time.time()

# Patient directories carry a zero-padded three-digit id: patient010, patient123, ...
patient_dir_template = r'/content/drive/My Drive/datasets/HR_data/patient{:03d}'
output_x_dir = r"/content/drive/My Drive/datasets/test_results/x/"
output_y_dir = r"/content/drive/My Drive/datasets/test_results/y/"

counter = 0  # sequential id used as the output file name
for k in range(10, 12):
  print(k)
  patient_dir = patient_dir_template.format(k)
  try:
    hea_files = fnmatch.filter(os.listdir(patient_dir), '*re.hea')
  except OSError as e:
    # Some patient ids have no directory; report and move on, as the
    # parallel version of this loop does.
    print(e)
    continue

  for hea_name in hea_files:
    # Record path = header file path without its extension
    data_file_path = patient_dir + "/" + hea_name.split(".")[0]
    heart_rate_values = derive_heart_rate_from_ECG(data_file_path)
    output_x_file_path = output_x_dir + str(counter) + ".txt"
    output_y_file_path = output_y_dir + str(counter) + ".txt"
    write_data(output_x_file_path, output_y_file_path, data_file_path, heart_rate_values)
    counter += 1

elapsed_time_serial = time.time() - start_time
print("Elapsed Time:",elapsed_time_serial)

Atomic Counter for Python

References:

https://gist.github.com/benhoyt/8c8a8d62debe8e5aa5340373f9c509c7

https://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing

This Code:

https://stackoverflow.com/questions/2080660/python-multiprocessing-and-a-shared-counter

In [0]:
#PARALLEL IMPLEMENTATION START
import threading

import multiprocessing
from multiprocessing import Pool

# Show how many CPU cores are available; the worker pool created further down
# is sized with the same call.
print(multiprocessing.cpu_count())

class AtomicCounter:
    """Integer counter backed by a multiprocessing RawValue and guarded by a Lock.

    Args:
        initval: Starting value of the counter (default 0).
    """

    def __init__(self, initval=0):
        self.val = multiprocessing.RawValue('i', initval)
        self.lock = multiprocessing.Lock()

    def increment(self):
        """Add one to the counter and return the new value.

        The update and the read happen under the same lock acquisition, so
        the returned value is the one produced by this call. Callers that
        ignore the return value are unaffected.
        """
        with self.lock:
            self.val.value += 1
            return self.val.value

    @property
    def value(self):
        """Current counter value (read without taking the lock)."""
        return self.val.value

2


In [0]:
# Counter used by the worker function to number the output files. It has to be
# an atomic counter because it is incremented from the pool workers. Starts at 0.
file_id_value = AtomicCounter(0)

In [0]:
def main_loop(data_file_path):
  """Worker: derive the heart rate for one record and write its x/y files.

  Args:
    data_file_path: Record path without extension. It is passed to
      `derive_heart_rate_from_ECG` and to `write_data`, and is printed.

  The output files are named after the value of the module-level
  `file_id_value` counter once this call has incremented it.
  """
  heart_rate_values = derive_heart_rate_from_ECG(data_file_path)
  print(data_file_path)

  # Increment and read the id inside one critical section. Incrementing first
  # and reading `.value` afterwards (twice) lets another worker bump the
  # counter in between, which yields duplicate ids or x/y files with
  # different ids for the same record.
  with file_id_value.lock:
    file_id_value.val.value += 1
    file_id = file_id_value.val.value

  output_x_file_path = r"/content/drive/My Drive/datasets/test_results/x/"+str(file_id)+".txt"
  output_y_file_path = r"/content/drive/My Drive/datasets/test_results/y/"+str(file_id)+".txt"
  write_data(output_x_file_path, output_y_file_path, data_file_path, heart_rate_values)
  

In [20]:
# Collect every record path for the selected patient ids, then process them
# with a pool of worker processes and time the whole run.
start_time = time.time()
counter = 0  # records found plus patient directories that could not be listed
end = 200
file_paths = []

# Patient directories carry a zero-padded three-digit id: patient010, patient123, ...
patient_dir_template = r'/content/drive/My Drive/datasets/HR_data/patient{:03d}'

for k in range(100, end):
  patient_dir = patient_dir_template.format(k)
  try:
    hea_files = fnmatch.filter(os.listdir(patient_dir), '*re.hea')
  except OSError as e:
    # Some patient ids have no directory; report and continue.
    print(e)
    counter += 1
    continue

  for hea_name in hea_files:
    # Record path = header file path without its extension
    file_paths.append(patient_dir + "/" + hea_name.split(".")[0])
    counter += 1

# The context manager releases the worker processes once map() has returned;
# map() blocks until every record has been processed.
with Pool(multiprocessing.cpu_count()) as pool:
  pool.map(main_loop, file_paths)

elapsed_time_parallel = time.time() - start_time
print("Elapsed Time: ",elapsed_time_parallel)
# print("speedup: ", str(elapsed_time_serial/elapsed_time_parallel))
# print("speedup: ", str(elapsed_time_serial/elapsed_time_parallel))

[Errno 2] No such file or directory: '/content/drive/My Drive/datasets/HR_data/patient124'
[Errno 2] No such file or directory: '/content/drive/My Drive/datasets/HR_data/patient132'
[Errno 2] No such file or directory: '/content/drive/My Drive/datasets/HR_data/patient134'
[Errno 2] No such file or directory: '/content/drive/My Drive/datasets/HR_data/patient161'
/content/drive/My Drive/datasets/HR_data/patient113/s0018lre
3
/content/drive/My Drive/datasets/HR_data/patient100/s0407lre
0
/content/drive/My Drive/datasets/HR_data/patient117/s0292lre
8
/content/drive/My Drive/datasets/HR_data/patient100/s0401lre
0
/content/drive/My Drive/datasets/HR_data/patient127/s0383lre
1
/content/drive/My Drive/datasets/HR_data/patient101/s0410lre
0
/content/drive/My Drive/datasets/HR_data/patient148/s0335lre
0
/content/drive/My Drive/datasets/HR_data/patient149/s0202bre
0
/content/drive/My Drive/datasets/HR_data/patient101/s0414lre
0
/content/drive/My Drive/datasets/HR_data/patient168/s0033_re
3
/conte

The parallel run took almost the same time as the serial run (there was almost no speedup), which indicates that the workload is I/O bound.

REFERENCES:

https://en.wikipedia.org/wiki/I/O_bound 

https://stackoverflow.com/questions/42620323/why-is-reading-multiple-files-at-the-same-time-slower-than-reading-sequentially


In [0]:
#ATTEMPT 02:
#FILE READ BY ONE THREAD, DATA EXTRACTION BY ANOTHER THREAD


In [0]:
# import tempfile, shutil, os
# def create_temporary_copy(path,new_name):
#     temp_dir = tempfile.gettempdir()
#     temp_path = os.path.join(temp_dir, new_name)
#     shutil.copy2(path, temp_path)
#     return temp_path

In [0]:
# contents_all = []
# i = 0
# for file_path in file_paths:
#   # contents_all.append(create_temporary_copy(file_path+".dat",str(i)))
#   contents_all.append(create_temporary_copy("/content/drive/My Drive/datasets/HR_data/patient011",str(i)))
#   i+=1

In [0]:
# contents_all

In [0]:
# with open('/tmp/0', 'rb') as f:
#     file_contents=f.read()

In [0]:
# start_time = time.time()
# counter = 0
# k = 10
# end = 15
# file_paths = []
# folder_paths = []
# while k < end:
#   # Load the wfdb record and the physical samples
#   if k < 10:
#     input_data_file_path = r'/content/drive/My Drive/datasets/HR_data/patient00{}'
#   elif k < 100:
#     input_data_file_path = r'/content/drive/My Drive/datasets/HR_data/patient0{}'
#   else:
#     input_data_file_path = r'/content/drive/My Drive/datasets/HR_data/patient{}'
#   folder_paths.append(input_data_file_path)
#   hea_files = fnmatch.filter(os.listdir(input_data_file_path.format(k)), '*re.hea')
#   for i in hea_files:
#     data_file_path = input_data_file_path.format(k) + "/" + i.split(".")[0]
#     file_paths.append(data_file_path)
#     counter = counter + 1
#   k+=1

# pool = Pool(multiprocessing.cpu_count()) # Create a multiprocessing Pool
# pool.map(main_loop,file_paths)  # process data_inputs iterable with pool

# elapsed_time_parallel = time.time() - start_time
# print("Elapsed Time: ",elapsed_time_parallel)
# print("speedup: ", str(elapsed_time_serial/elapsed_time_parallel))

After extracting the data, it is better to reorder it into CSV format so that it can be fed to the model more easily. The CSV file contains n+1 columns: the first n columns hold the heart rate values (x), and the last, (n+1)-th column holds the category of the disease (y).

In [0]:
#REORDER DATA IN CSV FORMAT
#CHECK THE NO OF ROWS IN DATA

import csv
import os, os.path

In [35]:
# Directories holding the per-record heart-rate (x) and label (y) text files
DIR_X = '/content/drive/My Drive/datasets/test_results/x'
DIR_Y = '/content/drive/My Drive/datasets/test_results/y'
# Count the regular files in DIR_X. The listing previously used an undefined
# name, `DIR`, which raises NameError on a fresh kernel.
files_count = len([name for name in os.listdir(DIR_X) if os.path.isfile(os.path.join(DIR_X, name))])
files_count

370

In [0]:
# to view the lengths of the heartrate data available
for i in range(1, files_count + 1):
  # Open with a context manager so each file handle is closed right away
  with open(DIR_X+'/'+str(i)+'.txt') as x_file:
    num_lines = sum(1 for line in x_file)
  # print(num_lines)

# for the csv file, a length of 100 is selected and the rest is excluded. Files
# with data points less than 100 are also excluded.

In [42]:
# Build one CSV row per record: the leading heart-rate values followed by the
# class label read from the matching y file.
csv_output_file = '/content/drive/My Drive/datasets/test_results/csv/values.csv'
all_values = []

for i in range(1, files_count + 1):
  x_file_name = DIR_X+'/'+str(i)+'.txt'
  # Read the file once; the context manager closes the handle.
  with open(x_file_name) as x_file:
    x_lines = [line.rstrip() for line in x_file]

  # NOTE(review): this keeps the first 101 values and skips files that have
  # 100 lines or fewer, exactly as the cell has always done. The notebook's
  # notes mention a length of 100 - confirm which is intended before changing it.
  if len(x_lines) > 100:
    values_array = x_lines[:101]

    y_file_name = DIR_Y+'/'+str(i)+'.txt'
    with open(y_file_name) as y_file:
      # The label is the first line of the y file
      values_array.append(y_file.readline().rstrip())

    print(values_array)
    all_values.append(values_array)

# write to the csv file (the with-block closes it; no explicit close needed)
with open(csv_output_file, 'w', newline='') as file:
  writer = csv.writer(file)
  writer.writerows(all_values)
  

['105.820', '106.762', '106.007', '106.007', '105.079', '105.820', '105.820', '104.895', '105.634', '106.007', '104.895', '105.263', '104.530', '105.448', '105.634', '104.895', '105.448', '105.634', '105.263', '105.448', '104.712', '105.448', '106.007', '105.448', '105.634', '106.007', '105.448', '105.634', '104.895', '105.634', '105.634', '105.634', '105.820', '106.195', '105.079', '105.820', '105.263', '106.007', '106.195', '105.634', '106.007', '106.383', '105.634', '106.195', '106.195', '106.007', '106.007', '106.762', '105.820', '106.007', '105.634', '106.007', '106.572', '105.820', '105.820', '106.572', '105.634', '105.820', '106.007', '105.820', '105.820', '105.448', '106.007', '106.195', '106.007', '105.448', '106.007', '105.079', '105.634', '105.448', '106.383', '105.634', '105.079', '106.383', '105.263', '105.263', '106.007', '105.079', '105.263', '105.634', '104.712', '105.079', '105.448', '103.986', '105.263', '105.448', '104.712', '105.079', '105.634', '105.079', '105.634'