In [None]:
!pip install vitaldb boto3 pandas numpy tqdm pyarrow
import os
import pandas as pd
import numpy as np
import boto3
from botocore import UNSIGNED
from botocore.client import Config
from tqdm.notebook import tqdm
from google.colab import drive
import vitaldb
import glob

Collecting vitaldb
  Downloading vitaldb-1.5.6-py3-none-any.whl.metadata (314 bytes)
Collecting boto3
  Downloading boto3-1.39.17-py3-none-any.whl.metadata (6.7 kB)
Collecting wfdb (from vitaldb)
  Downloading wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.40.0,>=1.39.17 (from boto3)
  Downloading botocore-1.39.17-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading vitaldb-1.5.6-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m3.8 MB/s[0m eta [36m0

In [None]:
# Mount Google Drive at /content/drive before any Drive path is touched.
drive.mount('/content/drive')

# Directory layout used by the rest of the notebook:
#   DRIVE_PROJECT_ROOT     - project folder on the mounted Drive
#   DRIVE_BATCH_OUTPUT_DIR - sub-folder of the project root for batch output
#   VITAL_FILES_LOCAL_DIR  - folder on the Colab VM's local disk for .vital files
DRIVE_PROJECT_ROOT = '/content/drive/MyDrive/VitalDB_Preprocesseddd'
DRIVE_BATCH_OUTPUT_DIR = os.path.join(DRIVE_PROJECT_ROOT, 'preprocessed_batch')
VITAL_FILES_LOCAL_DIR = '/content/all_vital_files'

# Create every directory up front; exist_ok=True makes re-running this cell harmless.
for _required_dir in (DRIVE_PROJECT_ROOT, DRIVE_BATCH_OUTPUT_DIR, VITAL_FILES_LOCAL_DIR):
    os.makedirs(_required_dir, exist_ok=True)

Mounted at /content/drive


In [None]:
# Highest patient ID to process; the batching loop walks IDs 1..TOTAL_PATIENTS inclusive.
TOTAL_PATIENTS = 6_388
# Number of consecutive patient IDs grouped into a single batch (one output file per batch).
BATCH_SIZE = 100

In [None]:
import warnings

# Batched preprocessing driver.
#
# For each batch of consecutive patient IDs this cell:
#   1. downloads the patients' .vital files from S3 into VITAL_FILES_LOCAL_DIR,
#   2. converts each file to a DataFrame and tags it with its patient_id,
#   3. concatenates the batch and writes one parquet file to DRIVE_BATCH_OUTPUT_DIR,
#   4. deletes the local .vital files of the batch.
# Batches whose IDs are all covered by existing parquet files are skipped, so the
# cell can be re-run to resume after an interruption.

print("--- Starting Batched Preprocessing ---")

# Unsigned requests: the bucket is accessed without AWS credentials.
s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
S3_BUCKET_NAME, S3_BASE_KEY = 'physionet-open', 'vitaldb/1.0.0/vital_files/'

# Rebuild the set of already-processed patient IDs from the names of the batch
# files on disk (batch_<start>-<end>.parquet, both ends inclusive).
existing_batch_files = glob.glob(os.path.join(DRIVE_BATCH_OUTPUT_DIR, 'batch_*.parquet'))
processed_ids = set()
for f in existing_batch_files:
    try:
        parts = os.path.basename(f).replace('.parquet', '').split('_')[1].split('-')
        start, end = int(parts[0]), int(parts[1])
    except (IndexError, ValueError):
        # A stray file matching the glob (e.g. a renamed or duplicated copy) must
        # not abort the whole run; it simply does not count towards coverage.
        print(f"WARNING: cannot parse a patient range from {os.path.basename(f)!r}; ignoring it.")
        continue
    processed_ids.update(range(start, end + 1))

print(f"Found {len(existing_batch_files)} existing batch files, covering {len(processed_ids)} patients.")

with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)

    # Batch boundaries are derived from TOTAL_PATIENTS and BATCH_SIZE; adjust
    # those constants (or this range) to process a different slice.
    for batch_start_id in range(1, TOTAL_PATIENTS + 1, BATCH_SIZE):
        batch_end_id = min(batch_start_id + BATCH_SIZE - 1, TOTAL_PATIENTS)
        ids_to_process_this_batch = list(range(batch_start_id, batch_end_id + 1))

        # Skip only when EVERY id of the batch is covered. Checking just the two
        # endpoints wrongly skips a batch when BATCH_SIZE changed between runs and
        # older, smaller files happen to cover only the first and last id.
        if processed_ids.issuperset(ids_to_process_this_batch):
            print(f"--- Batch {batch_start_id}-{batch_end_id} already processed. Skipping. ---")
            continue

        print(f"\n--- Starting new batch: Patients {batch_start_id} to {batch_end_id} ---")

        # Step 1: download. Failures are reported and the patient is left out of the batch.
        for patient_id in tqdm(ids_to_process_this_batch, desc=f"Downloading files for batch {batch_start_id}-{batch_end_id}"):
            file_name = f'{patient_id:04d}.vital'
            local_file_path = os.path.join(VITAL_FILES_LOCAL_DIR, file_name)
            if not os.path.exists(local_file_path):
                try:
                    # S3 keys always use '/', so concatenate instead of os.path.join.
                    s3_client.download_file(S3_BUCKET_NAME, S3_BASE_KEY + file_name, local_file_path)
                except Exception as e:
                    print(f"\nERROR downloading {file_name}: {e}")

        # Step 2: convert every downloaded file into a DataFrame.
        batch_dfs = []
        included_ids = set()
        for patient_id in tqdm(ids_to_process_this_batch, desc=f"Processing files for batch {batch_start_id}-{batch_end_id}"):
            file_path = os.path.join(VITAL_FILES_LOCAL_DIR, f'{patient_id:04d}.vital')
            if not os.path.exists(file_path):
                continue
            try:
                vf = vitaldb.VitalFile(file_path)
                track_names = vf.get_track_names()
                # interval=1: presumably one row per second - confirm against the vitaldb docs.
                df = vf.to_pandas(track_names, interval=1)

                # Forward-fill gaps, then drop rows that are still entirely empty.
                df = df.ffill().infer_objects(copy=False).dropna(how='all')

                if not df.empty:
                    df['patient_id'] = patient_id
                    # Move the row index into a column; an unnamed index comes out
                    # as 'index', which is renamed to 'Time'.
                    df = df.reset_index().rename(columns={'index': 'Time'})
                    batch_dfs.append(df)
                    included_ids.add(patient_id)
            except Exception as e:
                print(f"\nERROR processing file {patient_id:04d}: {e}")

        # The output file name marks the whole id range as processed, so make any
        # omitted patient visible instead of losing it silently.
        skipped_ids = [pid for pid in ids_to_process_this_batch if pid not in included_ids]
        if skipped_ids:
            print(f"WARNING: {len(skipped_ids)} patient(s) of batch {batch_start_id}-{batch_end_id} contributed no data: {skipped_ids}")

        # Step 3: persist the batch.
        if batch_dfs:
            batch_df = pd.concat(batch_dfs, ignore_index=True)
            output_filename = f"batch_{batch_start_id:04d}-{batch_end_id:04d}.parquet"
            output_path = os.path.join(DRIVE_BATCH_OUTPUT_DIR, output_filename)

            # Write under a temporary name and rename afterwards: an interrupted
            # write then never leaves a truncated 'batch_*.parquet' that the next
            # run would treat as a finished batch. The '.tmp' suffix keeps the
            # partial file out of the 'batch_*.parquet' glob above.
            tmp_output_path = output_path + '.tmp'
            batch_df.to_parquet(tmp_output_path, engine='pyarrow')
            os.replace(tmp_output_path, output_path)
            print(f"--- Successfully saved batch to: {output_path} ---")
        else:
            print(f"--- No data for batch {batch_start_id}-{batch_end_id}; nothing saved (it will be retried on the next run). ---")

        # Step 4: best-effort removal of the batch's local files; a file that is
        # already absent (e.g. its download failed) is not an error.
        for patient_id in ids_to_process_this_batch:
            try:
                os.remove(os.path.join(VITAL_FILES_LOCAL_DIR, f'{patient_id:04d}.vital'))
            except OSError:
                pass
        print("--- Cleaned up local files for this batch. ---")

print("\n--- All Batches Processed ---")

--- Starting Batched Preprocessing ---
Found 4 existing batch files, covering 40 patients.
--- Batch 1-10 already processed. Skipping. ---
--- Batch 11-20 already processed. Skipping. ---
--- Batch 21-30 already processed. Skipping. ---
--- Batch 31-40 already processed. Skipping. ---

--- Starting new batch: Patients 41 to 50 ---


Downloading files for batch 41-50:   0%|          | 0/10 [00:00<?, ?it/s]

Processing files for batch 41-50:   0%|          | 0/10 [00:00<?, ?it/s]

--- Successfully saved batch to: /content/drive/MyDrive/VitalDB_Preprocesseddd/preprocessed_batch/batch_0041-0050.parquet ---
--- Cleaned up local files for this batch. ---

--- All Batches Processed ---
