To solve the ECG classification task, we use the **PTB-XL ECG dataset**, which contains 21799 clinical 12-lead ECG records, each 10 seconds long, from 18869 patients.

This is a large public dataset that can be used for supervised learning.

More information about this dataset can be found here: https://physionet.org/content/ptb-xl/1.0.3/

In [1]:
import functools
import pathlib
import shutil
import requests
from tqdm.auto import tqdm

def download_file(url, dest_filename, timeout=60):
    """
    Download the file by its url saving it to the destination path (displaying the progress bar)

    The data is streamed into a temporary ``<name>.part`` file next to the
    destination and renamed to the final name only after the transfer has
    finished, so an interrupted download never leaves a truncated file at
    ``dest_filename``.

    Args:
        url: address of the file to download (redirects are followed).
        dest_filename: where to store the file; ``~`` is expanded and missing
            parent directories are created.
        timeout: seconds to wait for the server to connect / send data before
            giving up (forwarded to ``requests.get``); ``None`` waits forever.

    Returns:
        pathlib.Path: absolute path of the downloaded file.

    Raises:
        requests.HTTPError: the server answered with a 4xx or 5xx status code.
        RuntimeError: the server answered with any other non-200 status code.
    """
    # The response is used as a context manager so the connection is released
    # even when the status check or the copy below raises.
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as response:
        if response.status_code != 200:
            response.raise_for_status()  # raises HTTPError for 4xx and 5xx codes
            raise RuntimeError(f"Request to {url} returned status code {response.status_code}")

        file_size = int(response.headers.get('Content-Length', 0))

        dest_path = pathlib.Path(dest_filename).expanduser().resolve()  # expand ~ and ~user constructs
        dest_path.parent.mkdir(parents=True, exist_ok=True)  # create missing subdirectories
        part_path = dest_path.with_name(dest_path.name + ".part")
        print(f"The file will be saved here: {dest_path}")

        desc = "(Unknown total file size)" if file_size == 0 else ""

        # Decompress transparently if the server used a content encoding
        response.raw.read = functools.partial(response.raw.read, decode_content=True)

        try:
            with tqdm.wrapattr(response.raw, "read", total=file_size, desc=desc) as r_raw:
                with part_path.open("wb") as f:
                    shutil.copyfileobj(r_raw, f)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the partial file, then re-raise
            if part_path.exists():
                part_path.unlink()
            raise

    # Publish the file under its final name only once it is complete
    part_path.replace(dest_path)
    return dest_path

Download the PTB-XL dataset over HTTP using the link below. Note that the archive size is about 1.7 GB.

In [2]:
from os import path as Path

# The URL below comes from the official resource page; if the download fails,
# check there that the link is still valid:
# https://physionet.org/content/ptb-xl/1.0.3/
ptb_xl_data_link = "https://physionet.org/static/published-projects/ptb-xl/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3.zip"
data_dir = "../data"
dest_archive = Path.join(data_dir, "ptbxl.zip")

# Fetch the archive only when it is not on disk yet.
if not Path.exists(dest_archive):
    download_file(ptb_xl_data_link, dest_archive)

For further processing we need only:
- the files from the records500 folder
- ptbxl_database.csv

In [3]:
import os
import zipfile
import shutil
from zipfile import Path as ZipPath

def extract_from_zip(zip_file, paths, dest_folder, zip_folder=''):
    """
    Copy files and folders from zip file (or the specified folder in zip file) to the destination folder

    The whole archive is unpacked into ``dest_folder`` and every extracted file
    whose archive name does not start with one of the requested paths is
    deleted afterwards. Directories are left in place.

    Args:
        zip_file: path to the zip archive.
        paths: names (relative to ``zip_folder``) of the files / folders to keep;
            every archive file whose name starts with one of them is kept.
        dest_folder: folder to unpack into; created if it does not exist.
        zip_folder: optional folder inside the archive that ``paths`` are
            relative to.

    Returns:
        None. Nothing is done (and nothing is reported) when ``zip_file`` does
        not exist, is not a zip archive, or ``paths`` is empty.
    """
    if not Path.exists(zip_file) or not zipfile.is_zipfile(zip_file) or not paths:
        return
    # `Path` is an alias of os.path here and has no mkdir(), so use os directly
    os.makedirs(dest_folder, exist_ok=True)

    selected_paths = [fix_slash(Path.join(zip_file, zip_folder, x)) for x in paths]
    # get_zip_files reports paths as "<zip_file>/<name inside the archive>"
    zip_prefix = fix_slash(zip_file) + "/"
    zip_files = []

    with zipfile.ZipFile(zip_file) as z:
        for path in selected_paths:
            # pass a fresh list explicitly so results never leak between calls
            zip_files.extend(get_zip_files(ZipPath(z), path, dest_folder, []))
        all_zip_files = z.namelist()

    # Cut the archive prefix off by position instead of a global str.replace
    inner_zip_files = {x[len(zip_prefix):] for x in zip_files}
    # Names ending with "/" are directory entries; os.remove cannot delete them
    files_to_remove = [
        f"{dest_folder}/{x}"
        for x in all_zip_files
        if x not in inner_zip_files and not x.endswith("/")
    ]

    # it's much faster to extract the whole archive than copy files iteratively;
    # the format is given explicitly because the file was already verified to be
    # a zip, whatever its extension is
    shutil.unpack_archive(zip_file, dest_folder, "zip")

    for zipf in files_to_remove:  # remove unneeded files
        try:
            os.remove(zipf)
        except FileNotFoundError:
            # the member was never written to disk (e.g. skipped by the unpacker)
            pass


def get_zip_files(root, pattern, dest, flist=None):
    """
    Get the list of all the file paths that match the pattern

    Walks ``root`` recursively and collects every file whose slash-normalized
    path starts with ``pattern``.

    Args:
        root: directory-like object providing ``iterdir()``, ``is_file()`` and
            ``is_dir()`` (a ``zipfile.Path`` in this file).
        pattern: path prefix to match, written with forward slashes.
        dest: unused; kept so existing callers keep working.
        flist: list to append the matches to. A new list is created for every
            call when omitted, so separate calls never share results.

    Returns:
        list: ``flist`` extended with the matching paths as strings.
    """
    if flist is None:
        flist = []

    for child in root.iterdir():
        str_child = fix_slash(child)
        if child.is_file():
            if str_child.startswith(pattern):
                flist.append(str_child)
        if child.is_dir():
            # once a folder matches, everything below it is matched via its own path
            next_pattern = str_child if str_child.startswith(pattern) else pattern
            get_zip_files(child, next_pattern, dest, flist)

    return flist

def fix_slash(path):
    """Return ``str(path)`` with every backslash turned into a forward slash."""
    return "/".join(str(path).split("\\"))

Let's copy the target files from the archive using the functions declared above. Note that there are thousands of binary files, so copying them may take some time.

In [4]:
root_name = "ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3"
raw_ecg_folder = "records500/"
annotations_file = "ptbxl_database.csv"
extracted_folder = fix_slash(Path.join(data_dir, root_name))

# The records500 folder is used as the marker that extraction was already done.
if Path.exists(Path.join(extracted_folder, raw_ecg_folder)):
    print(f"The data is already extracted here {extracted_folder}")
else:
    print("Start data extracting...")
    extract_from_zip(dest_archive, [raw_ecg_folder, annotations_file], data_dir, root_name)
    print(f"Data is extracted here {extracted_folder}")


Start data extracting...
Data is extracted here ../data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3
