In [6]:

import polars as pl
from pathlib import Path

In [8]:
# Folder holding the demo parquet files; the path is relative to the
# notebook's working directory.
demo_folder = Path("../icu_demo_data/aki/mimic_demo/")

# Load the "dyn" table (one row per stay_id/time pair, see the schema below).
dyn = pl.read_parquet(demo_folder.joinpath("dyn.parquet"))

In [9]:
# Show the column names and dtypes of the loaded frame.
dyn.schema

Schema([('stay_id', Int32),
        ('time', Duration(time_unit='ms')),
        ('alb', Float64),
        ('alp', Float64),
        ('alt', Float64),
        ('ast', Float64),
        ('be', Float64),
        ('bicar', Float64),
        ('bili', Float64),
        ('bili_dir', Float64),
        ('bnd', Float64),
        ('bun', Float64),
        ('ca', Float64),
        ('cai', Float64),
        ('ck', Float64),
        ('ckmb', Float64),
        ('cl', Float64),
        ('crea', Float64),
        ('crp', Float64),
        ('dbp', Float64),
        ('fgn', Float64),
        ('fio2', Float64),
        ('glu', Float64),
        ('hgb', Float64),
        ('hr', Float64),
        ('inr_pt', Float64),
        ('k', Float64),
        ('lact', Float64),
        ('lymph', Float64),
        ('map', Float64),
        ('mch', Float64),
        ('mchc', Float64),
        ('mcv', Float64),
        ('methb', Float64),
        ('mg', Float64),
        ('na', Float64),
        ('neut', Float64),
      

In [None]:
# Reference copy of the schema of `dyn` (appears to be pasted from the
# `dyn.schema` output). Every dtype is qualified with `pl.` so the cell
# evaluates on a fresh kernel instead of raising NameError on the bare
# `Schema` / `Int32` / `Duration` / `Float64` names.
pl.Schema([('stay_id', pl.Int32),
           ('time', pl.Duration(time_unit='ms')),
           ('alb', pl.Float64),
           ('alp', pl.Float64),
           ('alt', pl.Float64),
           ('ast', pl.Float64),
           ('be', pl.Float64),
           ('bicar', pl.Float64),
           ('bili', pl.Float64),
           ('bili_dir', pl.Float64),
           ('bnd', pl.Float64),
           ('bun', pl.Float64),
           ('ca', pl.Float64),
           ('cai', pl.Float64),
           ('ck', pl.Float64),
           ('ckmb', pl.Float64),
           ('cl', pl.Float64),
           ('crea', pl.Float64),
           ('crp', pl.Float64),
           ('dbp', pl.Float64),
           ('fgn', pl.Float64),
           ('fio2', pl.Float64),
           ('glu', pl.Float64),
           ('hgb', pl.Float64),
           ('hr', pl.Float64),
           ('inr_pt', pl.Float64),
           ('k', pl.Float64),
           ('lact', pl.Float64),
           ('lymph', pl.Float64),
           ('map', pl.Float64),
           ('mch', pl.Float64),
           ('mchc', pl.Float64),
           ('mcv', pl.Float64),
           ('methb', pl.Float64),
           ('mg', pl.Float64),
           ('na', pl.Float64),
           ('neut', pl.Float64),
           ('o2sat', pl.Float64),
           ('pco2', pl.Float64),
           ('ph', pl.Float64),
           ('phos', pl.Float64),
           ('plt', pl.Float64),
           ('po2', pl.Float64),
           ('ptt', pl.Float64),
           ('resp', pl.Float64),
           ('sbp', pl.Float64),
           ('temp', pl.Float64),
           ('tnt', pl.Float64),
           ('urine', pl.Float64),
           ('wbc', pl.Float64)])

In [39]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta
import random

def generate_data(stay_ids: list[int], max_rows: int, start_time: datetime, timestep: timedelta, columns: list[str], missingness: dict[str, float], seed: "int | None" = None) -> pl.DataFrame:
    """
    Generate a synthetic Polars DataFrame of regularly spaced rows for multiple stay IDs.

    Exactly ``max_rows`` rows are split at random (uniform multinomial draw)
    across ``stay_ids``. The rows of each stay start at ``start_time`` and
    advance by ``timestep``. Every entry of ``columns`` is filled with uniform
    random floats in [0, 1), after which a fraction of each stay's rows is
    overwritten with NaN according to ``missingness``.

    Parameters:
    - stay_ids: The list of stay IDs to use. Must not be empty.
    - max_rows: The total number of rows to generate across all stay IDs.
    - start_time: The starting time for the time column of every stay.
    - timestep: The time interval between consecutive rows of a stay.
    - columns: A list of column names to generate data for.
    - missingness: A dictionary where keys are column names and values are the
      fraction (0.0 to 1.0) of each stay's rows to blank out. Columns that are
      not listed get no missing values.
    - seed: Optional seed for a dedicated random generator. When None (the
      default) the global ``np.random`` state is used, so ``np.random.seed``
      still controls the result.

    Returns:
    - A Polars DataFrame with the columns "stay_id" and "time" followed by one
      float column per entry of ``columns``. Missing entries are stored as NaN
      (use ``fill_nan(None)`` on the result if Polars nulls are required).

    Raises:
    - ValueError: If ``stay_ids`` is empty or a missingness fraction used for
      one of ``columns`` lies outside [0, 1].
    """
    if not stay_ids:
        # Previously surfaced as an opaque ZeroDivisionError further down.
        raise ValueError("stay_ids must contain at least one stay ID")

    for column in columns:
        fraction = missingness.get(column, 0.0)
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(
                f"missingness[{column!r}] must be a fraction between 0 and 1, got {fraction}"
            )

    # RandomState exposes the same multinomial/rand/choice API as the
    # np.random module, so both code paths below share one implementation.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Distribute rows randomly among stay IDs
    rows_per_stay_id = rng.multinomial(max_rows, [1/len(stay_ids)]*len(stay_ids))

    all_data = []

    for stay_id, num_rows in zip(stay_ids, rows_per_stay_id):
        num_rows = int(num_rows)
        if num_rows == 0:
            # A zero-row frame built from empty lists has untyped
            # stay_id/time columns and would break the vertical concat.
            continue

        # Generate the time column
        time_column = [start_time + i * timestep for i in range(num_rows)]

        # Initialize the data dictionary with the time and stay_id columns
        data = {
            "stay_id": [stay_id] * num_rows,
            "time": time_column
        }

        # Generate random data for other columns
        for column in columns:
            col_data = rng.rand(num_rows)
            # Introduce missingness
            if column in missingness:
                missing_count = int(num_rows * missingness[column])
                missing_indices = rng.choice(num_rows, missing_count, replace=False)
                col_data[missing_indices] = np.nan
            data[column] = col_data

        # Create the DataFrame for the current stay_id
        all_data.append(pl.DataFrame(data))

    if not all_data:
        # No stay received any rows (max_rows == 0): return an empty frame
        # that still carries the expected columns and dtypes.
        empty_schema = {"stay_id": pl.Int64, "time": pl.Datetime("us")}
        empty_schema.update({column: pl.Float64 for column in columns})
        return pl.DataFrame(schema=empty_schema)

    # Concatenate all DataFrames
    return pl.concat(all_data)

# Example usage
# Seed both global generators so the sampled stay IDs, the missingness
# fractions and the generated values are identical on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

max_rows = 100000
# With an hourly timestep this gives each stay about one day of rows on average.
avg_rows_per_stay = 24
stay_ids = random.sample(range(1, 10000), max_rows // avg_rows_per_stay)
start_time = datetime(2024, 1, 1)
timestep = timedelta(hours=1)

# Define the list of column names
columns = ["alb", "alp", "alt", "ast", "be", "bicar", "bili", "bili_dir", "bnd", "bun", "ca", "cai", "ck", "ckmb", "cl", "crea", "crp", "dbp", "fgn", "fio2", "glu", "hgb", "hr", "inr_pt", "k", "lact", "lymph", "map", "mch", "mchc", "mcv", "methb", "mg", "na", "neut", "o2sat", "pco2", "ph", "phos", "plt", "po2", "ptt", "resp", "sbp", "temp", "tnt", "urine", "wbc"]

# Define the missingness dictionary: one random fraction in [0, 0.99] per column
missingness = {item: random.uniform(0, 0.99) for item in columns}

generated_df = generate_data(stay_ids, max_rows, start_time, timestep, columns, missingness)

In [40]:
# Display the synthetic frame returned by generate_data.
generated_df

stay_id,time,alb,alp,alt,ast,be,bicar,bili,bili_dir,bnd,bun,ca,cai,ck,ckmb,cl,crea,crp,dbp,fgn,fio2,glu,hgb,hr,inr_pt,k,lact,lymph,map,mch,mchc,mcv,methb,mg,na,neut,o2sat,pco2,ph,phos,plt,po2,ptt,resp,sbp,temp,tnt,urine,wbc
i64,datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
7201,2024-01-01 00:00:00,,0.253989,,,0.343446,0.062044,,0.642601,,,0.238478,0.818324,0.745067,0.362134,,,0.810509,,0.750668,0.772734,0.333509,,0.213528,,0.254461,0.698646,0.332606,0.286207,,0.480025,0.503376,0.09808,,0.691521,0.380763,0.541645,,0.333954,0.393055,,,0.482675,,,,,0.108756,
7201,2024-01-01 01:00:00,0.768926,0.213239,0.152267,0.042311,0.309759,,0.926417,0.106144,0.498812,,0.713818,0.672558,0.277455,0.457279,,0.899289,,,,0.82462,,,,,0.155954,0.024444,0.419664,0.009758,,0.305064,,0.247919,,0.23277,0.477596,,0.100539,0.303139,0.439882,,,,0.619773,,0.452434,,0.991794,
7201,2024-01-01 02:00:00,0.276351,0.349534,,0.702336,0.398705,,,0.813082,0.422173,,0.169122,0.715041,0.526684,0.038635,,0.75354,,,,0.610522,,,,,0.031457,0.212417,,,0.508623,0.809326,,,,0.34529,0.668088,,,,0.477014,,,0.152352,,,,,0.084913,
7201,2024-01-01 03:00:00,0.733373,0.670427,,0.375782,0.368085,,0.269464,,,,,0.35851,0.152265,0.928841,,0.290217,,,,0.499142,,,,,0.343685,0.59537,0.682881,0.75407,0.776421,0.492791,,,,0.163325,0.029648,,0.57339,,0.971492,,,0.346508,,0.071894,,,0.568799,
7201,2024-01-01 04:00:00,0.948863,0.479664,,,0.475121,,,0.259101,0.889947,0.57615,0.539198,,0.849404,0.265578,,,0.745603,,,0.762885,,,,,0.316656,0.091905,0.096026,0.851654,0.125785,0.310958,,,,0.521121,0.045947,,0.324674,0.388788,0.147576,,0.940269,0.853576,0.690004,,,0.840452,0.958739,0.874881
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
5752,2024-01-01 12:00:00,0.230622,0.604726,,0.099939,0.509189,,,0.870416,0.040802,0.263998,0.077954,0.127428,0.511141,0.047803,,,,,,0.050114,,,,,0.823463,0.180632,0.553115,0.330465,,0.851108,,0.363141,,0.839141,0.418625,0.242803,0.506647,,0.366353,,,,0.98547,,,,0.545084,
5752,2024-01-01 13:00:00,0.561388,0.75278,,,0.156763,0.480414,,0.490241,0.137495,,0.092682,,0.784596,0.883921,,,0.038173,,,0.709661,,,,,0.205482,0.880263,0.782223,0.21414,0.359732,0.119511,,0.982561,,,0.233343,0.133841,0.238903,0.832177,0.290973,,0.558533,0.487455,0.552856,,,,,
5752,2024-01-01 14:00:00,0.338405,0.083771,,,0.903486,0.668105,,0.853364,0.255978,,0.675339,0.899852,0.439914,0.199979,,0.507306,,0.119532,,0.970958,,0.995487,,,0.792831,0.471849,0.594931,0.476527,,0.899054,,,,0.346871,0.733596,,,0.806138,0.733992,,,,0.877479,,,0.978479,0.129108,
5752,2024-01-01 15:00:00,0.852653,0.605916,0.789,,0.917541,0.991582,,0.670737,0.574423,,0.044239,0.417545,0.698224,0.503783,,0.432786,0.149825,,,0.085591,,,0.790744,,0.84166,0.713072,0.157752,,,0.980502,,,,0.911827,0.559503,,0.623413,,0.811895,,,0.847213,,,0.890129,,0.61309,


In [33]:
# Inspect the per-column missing-value fractions passed to generate_data.
missingness

{'alb': 0.25010127125696996,
 'alp': 0.5102602076878041,
 'alt': 0.30452326742214636,
 'ast': 0.14924902976492482,
 'be': 0.4027434607113478,
 'bicar': 0.2868813414946926,
 'bili': 0.5902132411773542,
 'bili_dir': 0.5763916533931818,
 'bnd': 0.48467366314581356,
 'bun': 0.9029655555280026,
 'ca': 0.2796052527184514,
 'cai': 0.1724840290557209,
 'ck': 0.8526586116890228,
 'ckmb': 0.6037706247321472,
 'cl': 0.8187203129372831,
 'crea': 0.7388299100875225,
 'crp': 0.004227931164244445,
 'dbp': 0.951370629023908,
 'fgn': 0.736975358218586,
 'fio2': 0.8068815555077843,
 'glu': 0.4156104615593092,
 'hgb': 0.7524756871376157,
 'hr': 0.06430518681778655,
 'inr_pt': 0.847477453257622,
 'k': 0.3568237015640199,
 'lact': 0.9351164496564076,
 'lymph': 0.39060104567205056,
 'map': 0.27839869524719696,
 'mch': 0.9490364996264921,
 'mchc': 0.5583716772936833,
 'mcv': 0.573392986050853,
 'methb': 0.6214581944706018,
 'mg': 0.5121962922322404,
 'na': 0.9381947173827705,
 'neut': 0.7645619101954946,
 'o