In [14]:
from google.colab import drive
drive.mount("/content/drive")

import os
import sys
import tensorflow as tf

PROJECT_DIR = "/content/drive/MyDrive/sepsis-timeline-project"
SRC_DIR = f"{PROJECT_DIR}/src"

os.chdir(PROJECT_DIR)

if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# üî¥ DENNA RAD √ÑR VIKTIG
FIG_DIR = os.path.join(PROJECT_DIR, "outputs", "figures")
os.makedirs(FIG_DIR, exist_ok=True)

print("‚úÖ Current working directory:", os.getcwd())
print("‚úÖ FIG_DIR:", FIG_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Current working directory: /content/drive/MyDrive/sepsis-timeline-project
‚úÖ FIG_DIR: /content/drive/MyDrive/sepsis-timeline-project/outputs/figures


# Preprocessing pipeline for PhysioNet Sepsis time series

This notebook implements the full **data preprocessing pipeline** used in the project.
The goal is to transform raw ICU time series data into clean, normalized, and
model-ready tensors suitable for training sequence models such as LSTM, GRU, and Transformers.

The preprocessing is implemented as a **separate stage from EDA and modeling**
to ensure reproducibility, avoid data leakage, and follow best practices in
machine learning engineering.

---

## Overview of the preprocessing steps

The notebook is structured into clearly defined sections, each corresponding to
a key transformation step in the pipeline:

### 1. Configuration and setup
- Define global preprocessing parameters:
  - number of patients used in the development subset
  - sequence length \(T\) (window size)
  - prediction horizon \(H\)
- Define input/output directories
- Load file lists for reproducible processing

---

### 2. Label construction (future sepsis prediction)
- Construct a **future-oriented label**:
  - At each time step \(t\), the label is 1 if sepsis occurs within the next \(H\) hours
- This formulation reflects the clinical goal of **early sepsis prediction**
  rather than detection at the time of onset

---

### 3. Handling missing data (imputation + masks)
Raw ICU data contains substantial missingness, especially for laboratory variables.
We therefore use a multi-step strategy:

- **Forward-fill imputation** within each patient trajectory
- **Median imputation** using statistics computed on the training set only
- **Explicit mask features** indicating whether each value was observed or missing

This allows the model to distinguish between
‚Äúmeasured low values‚Äù and ‚Äúunmeasured values‚Äù.

---

### 4. Train / validation / test split (patient-level)
- Split is performed at the **patient level**, not at the window level
- This prevents information leakage across splits
- A fixed random seed is used for reproducibility

---

### 5. Normalization using training statistics
- Compute median, mean, and standard deviation using **training data only**
- Apply z-score normalization to all splits
- Defensive checks ensure no NaN or infinite values remain after normalization

---

### 6. Windowing of time series
- Convert variable-length patient trajectories into fixed-length windows:
  - Input: last \(T\) hours of data
  - Output: sepsis label at the final time step
- This produces tensors with shape:
  \[
  (\text{num\_windows}, T, \text{num\_features})
  \]

---

### 7. Dataset export and audit
- Save processed datasets to disk:
  - `train_X.npy`, `train_y.npy`
  - `val_X.npy`, `val_y.npy`
  - `test_X.npy`, `test_y.npy`
- Save metadata (parameters, feature list, shapes) to `meta.json`
- Run a final audit to verify:
  - correct tensor shapes
  - absence of NaN values
  - reasonable class balance

---

## Output of this notebook

The output of this notebook is a **fully processed dataset** stored in
`data/processed/`, ready to be used directly by Keras/TensorFlow models
in subsequent notebooks.

This modular separation of preprocessing, modeling, and evaluation
improves clarity, debuggability, and reproducibility of the project.


In [15]:
# ============================================
# Cell 2 ‚Äî Konfiguration f√∂r preprocessing (dev5000)
# ============================================

import os
import glob
import numpy as np
import pandas as pd

# Var finns r√•data?
RAW_DIR = os.path.join(PROJECT_DIR, "data", "raw", "physionet2019", "training_setA")

# H√§r sparar vi processad data
PROCESSED_DIR = os.path.join(PROJECT_DIR, "data", "processed", "dev5000")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# H√§r sparar vi ev. figurer/loggar
FIG_DIR = os.path.join(PROJECT_DIR, "outputs", "figures")
os.makedirs(FIG_DIR, exist_ok=True)

# Hur m√•nga patienter vill vi anv√§nda?
N_PATIENTS = 5000

# "F√∂nster" och "horisont" (samma som innan om du vill)
T = 48   # hur m√•nga timmar i input-f√∂nstret
H = 6    # hur l√•ngt fram vi f√∂rs√∂ker prediktera "framtida sepsis"

# Reproducerbar sampling (om vi vill slumpa)
SEED = 42

print("‚úÖ RAW_DIR:", RAW_DIR)
print("‚úÖ PROCESSED_DIR:", PROCESSED_DIR)
print("‚úÖ N_PATIENTS:", N_PATIENTS, "| T:", T, "| H:", H)


‚úÖ RAW_DIR: /content/drive/MyDrive/sepsis-timeline-project/data/raw/physionet2019/training_setA
‚úÖ PROCESSED_DIR: /content/drive/MyDrive/sepsis-timeline-project/data/processed/dev5000
‚úÖ N_PATIENTS: 5000 | T: 48 | H: 6


In [16]:
# ============================================
# Cell 3 ‚Äî V√§lj filer (slumpa N_PATIENTS, reproducerbart)
# ============================================

all_files = sorted(glob.glob(os.path.join(RAW_DIR, "p*.psv")))
print("Totalt antal filer i training_setA:", len(all_files))

rng = np.random.default_rng(SEED)
files = rng.choice(all_files, size=N_PATIENTS, replace=False)
files = sorted(files)  # valfritt, g√∂r det snyggt/konsekvent

print("‚úÖ Valda filer:", len(files))
print("Exempel:", os.path.basename(files[0]), " ... ", os.path.basename(files[-1]))


Totalt antal filer i training_setA: 5000
‚úÖ Valda filer: 5000
Exempel: p000001.psv  ...  p005000.psv


In [17]:
# ============================================
# Cell 4 ‚Äî Hj√§lpfunktion: l√§s .psv
# ============================================

def read_psv(path: str) -> pd.DataFrame:
    return pd.read_csv(path, sep="|")


In [18]:
# ============================================
# Cell 5 ‚Äî Feature columns (robust)
# ============================================

df0 = read_psv(files[0])
label_col = "SepsisLabel"

# Ta bort label, och alla icke-feature om du vill (ICULOS/HospAdmTime kan vara feature om du haft dem innan)
feature_cols = [c for c in df0.columns if c != label_col]

print("‚úÖ Antal features:", len(feature_cols))
print("F√∂rsta 10:", feature_cols[:10])


‚úÖ Antal features: 40
F√∂rsta 10: ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3']


In [19]:
# ============================================
# Cell 6 ‚Äî Patient-split (train/val/test)
# ============================================

from sklearn.model_selection import train_test_split

train_files, tmp_files = train_test_split(files, test_size=0.3, random_state=SEED)
val_files, test_files  = train_test_split(tmp_files, test_size=0.5, random_state=SEED)

print("‚úÖ train:", len(train_files))
print("‚úÖ val  :", len(val_files))
print("‚úÖ test :", len(test_files))


‚úÖ train: 3500
‚úÖ val  : 750
‚úÖ test : 750


In [20]:
# =====================================================
# Cell 7 ‚Äî Imputering och mask per patient
# =====================================================
# Strategi:
# 1) Forward-fill inom varje patient
# 2) Median-imputering (fr√•n TRAIN)
# 3) Skapa mask-kanaler (1 = observerat, 0 = saknat)
#
# Output:
#   X_norm  -> normaliserade features
#   X_mask  -> mask-kanaler (samma form)
#   X_full  -> concat(X_norm, X_mask)

def transform_patient(df, feature_cols, train_median, train_mean, train_std, H):
    # Extrahera feature-matris
    X = df[feature_cols].copy()

    # Mask: 1 om v√§rdet finns, 0 om NaN
    X_mask = (~X.isna()).astype(np.float32).values

    # Forward-fill l√§ngs tidsaxeln
    X_ffill = X.ffill().values

    # Median-imputering (fallback om aldrig observerat)
    X_filled = np.where(
        np.isnan(X_ffill),
        train_median[None, :],
        X_ffill
    )

    # Normalisering (z-score)
    X_norm = (X_filled - train_mean[None, :]) / train_std[None, :]

    # S√§kerhetssteg: inga NaN / inf f√•r finnas
    X_norm = np.nan_to_num(X_norm, nan=0.0, posinf=0.0, neginf=0.0)

    # Slutlig feature-tensor
    X_full = np.concatenate([X_norm, X_mask], axis=1)

    # Framtidslabel: sepsis inom H timmar
    if "SepsisLabel" in df.columns:
        y = df["SepsisLabel"].fillna(0).values
        y_future = np.zeros_like(y)
        for t in range(len(y)):
            y_future[t] = int(np.any(y[t+1:t+1+H]))
    else:
        y_future = np.zeros(len(df), dtype=np.int64)

    return X_full.astype(np.float32), y_future.astype(np.int64)


In [21]:
# =====================================================
# Cell 8 ‚Äî F√∂nsterindelning (windowing)
# =====================================================
# Vi skapar sekvenser med l√§ngd T.
# Label tas fr√•n sista tidpunkten i varje f√∂nster.

def make_windows(X, y, T):
    Xw, yw = [], []
    for t in range(T - 1, len(X)):
        Xw.append(X[t - T + 1 : t + 1])
        yw.append(y[t])
    return np.array(Xw), np.array(yw)


In [22]:
# =====================================================
# Cell 9 ‚Äî Ber√§kna normaliseringsstatistik p√• TRAIN
# =====================================================
# Viktigt: statistik ber√§knas ENDAST p√• train_files
# f√∂r att undvika datal√§ckage.

X_collect = []

for i, fp in enumerate(train_files):
    df = read_psv(fp)
    X = df[feature_cols].copy().ffill().values
    X_collect.append(X)

    if (i + 1) % 500 == 0:
        print(f"Progress train stats: {i+1}/{len(train_files)}")

X_all = np.vstack(X_collect)

train_median = np.nanmedian(X_all, axis=0)
train_mean   = np.nanmean(X_all, axis=0)
train_std    = np.nanstd(X_all, axis=0)

# S√§kerhetsfix
train_median = np.where(np.isnan(train_median), 0.0, train_median)
train_mean   = np.where(np.isnan(train_mean), 0.0, train_mean)
train_std    = np.where(train_std < 1e-6, 1.0, train_std)

print("‚úÖ Train-statistik klar")
print("Features:", len(feature_cols))


Progress train stats: 500/3500
Progress train stats: 1000/3500
Progress train stats: 1500/3500
Progress train stats: 2000/3500
Progress train stats: 2500/3500
Progress train stats: 3000/3500
Progress train stats: 3500/3500


  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  train_mean   = np.nanmean(X_all, axis=0)


‚úÖ Train-statistik klar
Features: 40


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [23]:
# =====================================================
# Cell 10 ‚Äî Bygg dataset f√∂r varje split
# =====================================================

def build_split_dataset(file_list, split_name):
    X_all, y_all = [], []

    for i, fp in enumerate(file_list):
        df = read_psv(fp)
        X_full, y_future = transform_patient(
            df, feature_cols,
            train_median, train_mean, train_std,
            H
        )
        Xw, yw = make_windows(X_full, y_future, T)

        if len(Xw) > 0:
            X_all.append(Xw)
            y_all.append(yw)

        if (i + 1) % 500 == 0:
            print(f"{split_name}: {i+1}/{len(file_list)} patienter")

    if len(X_all) == 0:
        return np.zeros((0, T, 2*len(feature_cols))), np.zeros((0,))

    X_out = np.concatenate(X_all, axis=0)
    y_out = np.concatenate(y_all, axis=0)

    print(f"{split_name}: X={X_out.shape}, y={y_out.shape}, pos_rate={y_out.mean():.4f}")
    return X_out, y_out


train_X, train_y = build_split_dataset(train_files, "TRAIN")
val_X,   val_y   = build_split_dataset(val_files,   "VAL")
test_X,  test_y  = build_split_dataset(test_files,  "TEST")


TRAIN: 500/3500 patienter
TRAIN: 1000/3500 patienter
TRAIN: 1500/3500 patienter
TRAIN: 2000/3500 patienter
TRAIN: 2500/3500 patienter
TRAIN: 3000/3500 patienter
TRAIN: 3500/3500 patienter
TRAIN: X=(12825, 48, 80), y=(12825,), pos_rate=0.1352
VAL: 500/750 patienter
VAL: X=(2893, 48, 80), y=(2893,), pos_rate=0.1196
TEST: 500/750 patienter
TEST: X=(2878, 48, 80), y=(2878,), pos_rate=0.1296


In [24]:
# =====================================================
# Cell 11 ‚Äî Spara dataset till disk (dev5000)
# =====================================================

np.save(os.path.join(PROCESSED_DIR, "train_X.npy"), train_X)
np.save(os.path.join(PROCESSED_DIR, "train_y.npy"), train_y)

np.save(os.path.join(PROCESSED_DIR, "val_X.npy"), val_X)
np.save(os.path.join(PROCESSED_DIR, "val_y.npy"), val_y)

np.save(os.path.join(PROCESSED_DIR, "test_X.npy"), test_X)
np.save(os.path.join(PROCESSED_DIR, "test_y.npy"), test_y)

meta = {
    "tag": "dev5000",
    "seed": SEED,
    "N_PATIENTS": N_PATIENTS,
    "T": T,
    "H": H,
    "n_features_raw": len(feature_cols),
    "n_features_model": int(2 * len(feature_cols)),
    "feature_cols": feature_cols,
    "train_shape": list(train_X.shape),
    "val_shape": list(val_X.shape),
    "test_shape": list(test_X.shape),
    "train_pos_rate": float(train_y.mean()),
    "val_pos_rate": float(val_y.mean()),
    "test_pos_rate": float(test_y.mean())
}

with open(os.path.join(PROCESSED_DIR, "meta.json"), "w") as f:
    json.dump(meta, f, indent=2)

print("‚úÖ Sparat dataset till:", PROCESSED_DIR)


‚úÖ Sparat dataset till: /content/drive/MyDrive/sepsis-timeline-project/data/processed/dev5000


In [25]:
# =====================================================
# Cell 12 ‚Äî Mini-audit (sanity check)
# =====================================================

def audit_split(X, y, name):
    print(f"\n--- {name} ---")
    print("Shape X:", X.shape, "| y:", y.shape)
    print("NaN i X:", np.isnan(X).any())
    print("Positives:", int(y.sum()), "| Pos rate:", y.mean())

audit_split(train_X, train_y, "TRAIN")
audit_split(val_X,   val_y,   "VAL")
audit_split(test_X,  test_y,  "TEST")



--- TRAIN ---
Shape X: (12825, 48, 80) | y: (12825,)
NaN i X: False
Positives: 1734 | Pos rate: 0.1352046783625731

--- VAL ---
Shape X: (2893, 48, 80) | y: (2893,)
NaN i X: False
Positives: 346 | Pos rate: 0.11959903214656066

--- TEST ---
Shape X: (2878, 48, 80) | y: (2878,)
NaN i X: False
Positives: 373 | Pos rate: 0.1296038915913829
