In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2

import logging

# Configure the root logger so that messages of level INFO and above are emitted.
logging.basicConfig(level=logging.INFO)
import tsdm

In [None]:
# Instantiate the MIMIC-IV dataset object (displayed as the cell's last expression).
# NOTE(review): the version is passed as a float here, whereas MIMIC_III further down
# receives a string ("1.2") — confirm which type the dataset classes expect.
tsdm.datasets.MIMIC_IV(version=2.0)

In [None]:
import logging
import os
import subprocess
from collections.abc import Mapping
from getpass import getpass, getuser
from io import IOBase
from pathlib import Path
from typing import Any, Iterator, Optional
from urllib.parse import urlparse
from zipfile import ZIP_DEFLATED, ZipFile

import requests
from requests import Session
from tqdm.autonotebook import tqdm

from tsdm.constants import EMPTY_MAP
from tsdm.types.aliases import PathLike
from tsdm.utils.hash import validate_file_hash


def download_io(
    file: IOBase,
    url: str,
    *,
    session,
    chunk_size: int = 1024,
) -> None:
    """Download a file from a URL to an IO stream.

    Args:
        file: Writable binary stream that receives the downloaded bytes.
        url: Address of the resource to fetch.
        session: Object offering a requests-style ``get(url, stream=True)`` method.
        chunk_size: Number of bytes requested per chunk from the response.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    # stream=True lets iter_content pull the body chunk by chunk instead of
    # having the whole payload loaded into memory before the loop starts.
    response = session.get(url, stream=True)  # type: ignore[arg-type]

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {url} with status code {response.status_code}."
        )

    with tqdm(
        desc=f"Downloading {url}",
        total=int(response.headers.get("content-length", 0)),
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
    ) as progress_bar:
        for data in response.iter_content(chunk_size=chunk_size):
            if data:  # filter out keep-alive new chunks
                file.write(data)
                # advance by the real chunk length (the last chunk is usually shorter)
                progress_bar.update(len(data))

In [None]:
import os
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

# Define the base URL and authentication credentials
base_url = "https://physionet.org/files/mimiciv/2.0/"
# NOTE(review): getuser() returns the local login name — confirm that it matches
# the account name expected by the server.
username = getuser()
password = getpass()  # prompted interactively so the secret is never stored in the notebook
headers = {"User-Agent": "Wget/1.21.2"}

# Fetch the base directory listing once; a timeout prevents the cell from hanging
# indefinitely when the server does not answer.
response = requests.get(
    base_url, auth=HTTPBasicAuth(username, password), headers=headers, timeout=10
)
assert response.status_code == 200, f"Response is {response.status_code}"
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Function to download all files in a directory recursively and store in a single zip file
def download_directory_to_zip(url, zip_filename, session=None):
    """Recursively download a remote directory listing into one zip file.

    Args:
        url: URL of the directory listing; sub-links are appended to it verbatim.
        zip_filename: Path of the archive to create (an existing file is overwritten).
        session: Optional requests session. When omitted, a new session is built from
            the module-level ``username``, ``password`` and ``headers``.

    Raises:
        AssertionError: If a directory listing does not answer with status code 200.
    """
    if session is None:
        session = requests.Session()
        session.auth = (username, password)
        session.headers.update(headers)

    # Open the archive exactly once. Re-opening the same path in "w" mode for every
    # sub-directory would truncate whatever had been written so far.
    with ZipFile(zip_filename, "w", compression=ZIP_DEFLATED) as archive:
        _download_directory_into_archive(url, archive, session)


def _download_directory_into_archive(url, archive, session, prefix=""):
    """Store every file reachable below ``url`` in the open ``archive`` under ``prefix``."""
    response = session.get(url)
    assert response.status_code == 200, f"Response is {response.status_code}"
    soup = BeautifulSoup(response.text, "html.parser")

    for link in (pbar := tqdm(soup.find_all("a"), desc=f"Download from {url}")):
        href = link.get("href")
        pbar.set_postfix(href=href)
        if not href or href == "../":
            # anchors without a target and the parent-directory entry
            continue
        if href.endswith("/"):
            # Recursively download sub-directories into the same archive
            _download_directory_into_archive(
                url + href, archive, session, prefix=prefix + href
            )
        else:
            # Keep the relative directory in the member name so that equally named
            # files from different sub-directories do not collide.
            with archive.open(prefix + href, "w") as file:
                # keywords: independent of the positional order of download_io
                download_io(file=file, url=url + href, session=session)


# Name of the archive to create, then download everything below base_url into it.
zip_filename = "mimiciv_2.0.zip"
download_directory_to_zip(base_url, zip_filename)

In [None]:
def download_io(
    url: str,
    file: IOBase,
    *,
    session: Optional[Session] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    headers: Mapping[str, str] = EMPTY_MAP,
    request_options: Mapping[str, Any] = EMPTY_MAP,
    chunk_size: int = 1024,
) -> None:
    """Download a file from a URL to an IO stream.

    Args:
        url: Address of the resource to fetch.
        file: Writable binary stream that receives the downloaded bytes.
        session: Optional session used for the request. When given, ``username``,
            ``password`` and ``headers`` are not applied (the session carries its own).
        username: User name for basic authentication (only without ``session``).
        password: Password for basic authentication (only without ``session``).
        headers: Request headers (only without ``session``).
        request_options: Extra keyword arguments for the ``get`` call; they take
            precedence over the defaults built here.
        chunk_size: Number of bytes requested per chunk from the response.

    Raises:
        RuntimeError: If the response status code is not 200.

    Note:
        The parameter order (``url``, ``file``) differs from the earlier definition
        of the same name in this notebook; prefer keyword arguments when calling.
    """
    if session is None:
        # construct the request
        request_options = {
            "headers": headers,
            "auth": None if username is None else (username, password),
            "stream": True,
            "timeout": 10,
        } | request_options
        response = requests.get(url, **request_options)  # type: ignore[arg-type]
    else:
        # forward explicit options instead of silently dropping them
        response = session.get(url, **request_options)

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {url} with status code {response.status_code}."
        )

    with tqdm(
        desc=f"Downloading {url}",
        total=int(response.headers.get("content-length", 0)),
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
        leave=False,
    ) as progress_bar:
        for data in response.iter_content(chunk_size=chunk_size):
            if data:  # filter out keep-alive new chunks
                # advance by the real chunk length; chunk_size over-counts the last chunk
                progress_bar.update(len(data))
                file.write(data)


def stream_download(
    url: str,
    *,
    session: Optional[Session] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    headers: Mapping[str, str] = EMPTY_MAP,
    request_options: Mapping[str, Any] = EMPTY_MAP,
    chunk_size: int = 1024,
) -> Iterator[bytes]:
    """Download a file as a bytes-stream.

    Args:
        url: Address of the resource to fetch.
        session: Optional session used for the request. When given, ``username``,
            ``password`` and ``headers`` are not applied (the session carries its own).
        username: User name for basic authentication (only without ``session``).
        password: Password for basic authentication (only without ``session``).
        headers: Request headers (only without ``session``).
        request_options: Extra keyword arguments for the ``get`` call; they take
            precedence over the defaults built here.
        chunk_size: Number of bytes requested per chunk from the response.

    Yields:
        The non-empty chunks of the response body, in order.

    Raises:
        RuntimeError: If the response status code is not 200. Because this is a
            generator, the request is only sent once iteration starts.
    """
    if session is None:
        # construct the request
        request_options = {
            "headers": headers,
            "auth": None if username is None else (username, password),
            "stream": True,
            "timeout": 10,
        } | request_options
        response = requests.get(url, **request_options)  # type: ignore[arg-type]
    else:
        # forward explicit options instead of silently dropping them
        response = session.get(url, **request_options)

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {url} with status code {response.status_code}."
        )
    with tqdm(
        desc=f"Downloading {url}",
        total=int(response.headers.get("content-length", 0)),
        unit="iB",
        unit_scale=True,
        unit_divisor=1024,
        leave=False,
    ) as progress_bar:
        for data in response.iter_content(chunk_size=chunk_size):
            if data:  # filter out keep-alive new chunks
                # advance by the real chunk length; chunk_size over-counts the last chunk
                progress_bar.update(len(data))
                yield data


from typing import Iterator
from html.parser import HTMLParser
from requests import Session


class LinkParser(HTMLParser):
    """HTML parser that collects the ``href`` targets of all ``<a>`` tags.

    After ``feed`` has been called, ``links`` holds the targets in document order.
    ``url`` and ``session`` are stored on the instance but not used by the parser.
    """

    def __init__(self, url, session):
        super().__init__()
        self.url = url
        self.session = session
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` value of every anchor start tag."""
        if tag != "a":
            return
        # HTMLParser reports value-less attributes (``<a href>``) as None; such
        # entries and empty targets cannot be followed, so they are skipped.
        self.links.extend(value for name, value in attrs if name == "href" and value)


def iter_content(url: str, /, *, session: Session) -> Iterator[str]:
    """Iterate over the contents of a directory.

    Fetches the HTML listing at ``url``, follows every link ending in ``/``
    recursively and yields the full URL of every other link.

    Args:
        url: URL of the directory listing; links are appended to it verbatim, so it
            is expected to end with ``/``.
        session: Session used for all requests.

    Yields:
        The URL of each non-directory entry found below ``url``.

    Raises:
        RuntimeError: If a listing does not answer with status code 200.
    """
    response = session.get(url)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to download {url} with status code {response.status_code}."
        )

    parser = LinkParser(url, session)
    parser.feed(response.text)

    for link in parser.links:
        if not link or link == "../":
            # anchors without a target (None / empty) and the parent-directory entry
            continue
        if link.endswith("/"):
            # Recursion
            yield from iter_content(url + link, session=session)
        else:
            # Non-directory entry: report its full URL
            yield url + link


# NOTE: Session options as of requests 2.26.0
# __attrs__ = [
#     "headers",
#     "cookies",
#     "auth",
#     "proxies",
#     "hooks",
#     "params",
#     "verify",
#     "cert",
#     "adapters",
#     "stream",
#     "trust_env",
#     "max_redirects",
# ]
def download_directory_to_zip(
    url: str,
    zip_filename: PathLike,
    *,
    # session options
    username: Optional[str] = None,
    password: Optional[str] = None,
    headers: Mapping[str, str] = EMPTY_MAP,
    stream: bool = True,
) -> None:
    """Download a directory from a URL to a zip file.

    Args:
        url: URL of the directory listing to mirror.
        zip_filename: Path of the archive to create (an existing file is overwritten).
        username: User name for basic authentication; no authentication if None.
        password: Password belonging to ``username``.
        headers: Headers added to the session's default headers.
        stream: Value assigned to the session's ``stream`` option.

    Raises:
        RuntimeError: If ``url`` or one of the files below it does not answer with
            status code 200.
    """
    with Session() as session:
        session.auth = (username, password) if username is not None else None
        # extend the defaults; assigning the mapping would discard the headers that
        # a fresh Session sets up on its own
        session.headers.update(headers)
        session.stream = stream

        response = session.get(url)
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to create session for {url} with status code"
                f" {response.status_code}."
            )

        # Get the contents of the directory
        content = sorted(iter_content(url, session=session))
        # Download the directory
        with ZipFile(zip_filename, "w", compression=ZIP_DEFLATED) as archive:
            for href in (pbar := tqdm(content)):
                # get relative path w.r.t. the base url; plain string handling, since
                # os.path functions apply file-system rules that do not fit URLs
                file_name = href.removeprefix(url).lstrip("/")
                pbar.set_description(f"Downloading {file_name}")
                with archive.open(file_name, "w") as file:
                    download_io(url=href, file=file, session=session)

In [None]:
# Inputs for the keyword-based download_directory_to_zip call further down.
base_url = "https://physionet.org/files/mimiciv/2.0/"
# NOTE(review): presumably getpass.getuser, which returns the local login name —
# confirm that this is the account name the server expects.
username = getuser()
password = getpass()  # read interactively rather than hard-coding the secret
headers = {"User-Agent": "Wget/1.21.2"}
zip_filename = "mimic_iv_2.0.zip"

In [None]:
# Scratch check: os.path.basename applied to a file URL gives its last path component.
href = "https://physionet.org/files/mimiciv/2.0/hosp/admissions.csv.gz"
os.path.basename(href)

In [None]:
# Mirror base_url into zip_filename, passing credentials and headers explicitly.
download_directory_to_zip(
    base_url, zip_filename, username=username, password=password, headers=headers
)

In [None]:
# Display the body of the module-level `response` object.
# NOTE(review): this shows the complete text; consider slicing it if the body is large.
response.text

In [None]:
from getpass import getpass

import tsdm
from tsdm.utils.remote import download

In [None]:
# Create the MIMIC-III dataset object for version "1.2" with initialize=False.
ds = tsdm.datasets.MIMIC_III(initialize=False, version="1.2")

In [None]:
# Invoke the dataset's download method.
ds.download()

In [None]:
# Show the dataset's HOME_URL attribute.
ds.HOME_URL

In [None]:
# Scratch check: compare a dotted version string as a tuple of ints; (1, 3) <= (1, 4) is True.
tuple(map(int, "1.3".split("."))) <= (1, 4)

In [None]:
from typing import Literal

In [None]:
# A Literal type whose allowed values are "a" and "b".
x = Literal["a", "b"]

In [None]:
from typing import get_args

In [None]:
# get_args returns the type arguments of x; for Literal["a", "b"] that is ("a", "b").
get_args(x)

In [None]:
# Fill the version placeholder of the dataset's BASE_URL template and pick the
# local file name for the download.
url = ds.BASE_URL.format(version="1.4")
fname = "mimic-iii-clinical-database-1.4.zip"

In [None]:
# Ask for the credentials interactively; getpass does not echo the password.
username = input("MIMIC-III username: ")
password = getpass(prompt="MIMIC-III password: ", stream=None)

In [None]:
# Request headers for the download; the User-Agent is set to a Wget identifier.
headers = {
    "User-Agent": "Wget/1.21.2",
}

In [None]:
# Download `url` to `fname` with tsdm's download helper, using the credentials entered above.
download(url, fname, headers=headers, username=username, password=password)

In [None]:
# Bind a dataset instance to the name `self` so that the method-style snippets in
# the following cells can be run at top level.
# NOTE(review): MIMIC_III_DeBrouwer2019 is not imported in any visible cell — confirm
# where it comes from (presumably tsdm.datasets).
self = MIMIC_III_DeBrouwer2019()

In [None]:
# Scratch check: annotated assignment whose annotation is a list literal; rebinds x to 2.
x: [1, 2] = 2

In [None]:
# Scratch check: a string starting with a digit is not a valid identifier, so this is False.
"0".isidentifier()

In [None]:
import matplotlib.pyplot as plt

# Grid of 16 x 6 axes with a shared y-axis, one axis per plotted column.
fig, axes = plt.subplots(16, 6, figsize=(20, 32), constrained_layout=True, sharey=True)

# Pair each entry obtained by iterating self.timeseries with one axis; zip stops at
# the shorter of the two, so at most 96 histograms are drawn.
# NOTE(review): assumes self.timeseries is a DataFrame, so iteration yields column labels.
for col, ax in zip(self.timeseries, axes.flatten()):
    # normalized histogram on a logarithmic y-scale
    self.timeseries[col].hist(ax=ax, density=True, log=True, bins=20)
    ax.set_ylim(10**-6, 1)

In [None]:
%matplotlib inline

# Mean of the boolean mask, i.e. the fraction of "min" entries in self.metadata equal to 0.
(self.metadata["min"] == 0).mean()

In [None]:
import pandas as pd
from pandas import DataFrame

In [None]:
self.LOGGER.info("Loading main file.")
# The first CSV column is used as the index.
ts = pd.read_csv(self.rawdata_paths["complete_tensor.csv"], index_col=0)

# Check shape.
if ts.shape != self.rawdata_shapes["complete_tensor.csv"]:
    # adjacent string literals are concatenated verbatim, hence the leading spaces
    raise ValueError(
        f"The {ts.shape=} is not correct."
        " Please apply the modified preprocessing using bin_k=2, as outlined in"
        " the appendix. The resulting tensor should have 3082224 rows and 7 columns."
    )

# Cast to the declared schema and order the rows by series id, then by time stamp.
ts = ts.astype(self.rawdata_schemas["complete_tensor.csv"]).sort_values(
    by=["UNIQUE_ID", "TIME_STAMP"]
)

In [None]:
# Per-LABEL_CODE mean and standard deviation of VALUENUM. The column is selected
# before aggregating, so the statistics are computed for VALUENUM only instead of
# for every column of the frame (which also breaks on non-numeric columns in
# recent pandas versions).
valuenum_by_label = ts.groupby("LABEL_CODE")["VALUENUM"]
means = valuenum_by_label.mean().rename("MEANS")
stdvs = valuenum_by_label.std().rename("STDVS")
# One row per label code with compact dtypes.
stats = (
    DataFrame([means, stdvs])
    .T.reset_index()
    .astype(
        {
            "LABEL_CODE": "int16",
            "MEANS": "float32",
            "STDVS": "float32",
        }
    )
)

In [None]:
# Reshape from long to wide format: rows are indexed by (UNIQUE_ID, TIME_STAMP),
# there is one column per LABEL_CODE, and the cells hold VALUENUM.
# NOTE(review): this rebinds `ts` to the pivoted frame, so the cell presumably cannot
# be re-run without first re-executing the loading cell.
ts = (
    ts[["UNIQUE_ID", "TIME_STAMP", "LABEL_CODE", "VALUENUM"]]
    .reset_index(drop=True)
    .set_index(["UNIQUE_ID", "TIME_STAMP"])
    .pivot(columns="LABEL_CODE", values="VALUENUM")
    .sort_index()
    .sort_index(axis=1)
)

In [None]:
# Summary statistics, transposed so that each column of ts becomes one row.
ts.describe().T

In [None]:
# Mean of every column of ts.
ts.mean()