In [1]:
import sys
print(sys.executable)


/Users/punarnava/los-europe/.venv/bin/python


In [2]:
from pathlib import Path
import pandas as pd
import requests

PROJECT_ROOT = Path("..").resolve()  # notebook lives in /notebooks
RAW_DIR = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

pd.set_option("display.max_columns", 200)


In [3]:
def eurostat_get(dataset: str, params=None, timeout: int = 60) -> dict:
    url = f"https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/{dataset}"
    r = requests.get(url, params=params or {}, timeout=timeout)
    r.raise_for_status()
    return r.json()


In [4]:
def eurostat_json_to_df(js: dict) -> pd.DataFrame:
    dim_order = js["id"]
    dims = js["dimension"]

    dim_codes = []
    for d in dim_order:
        idx_map = dims[d]["category"]["index"]  # code -> position
        codes_sorted = sorted(idx_map.items(), key=lambda x: x[1])
        dim_codes.append([c for c, _ in codes_sorted])

    values = js.get("value", {})
    if not values:
        return pd.DataFrame()

    shape = [len(codes) for codes in dim_codes]

    rows = []
    for lin_idx_str, val in values.items():
        lin_idx = int(lin_idx_str)
        coords = []
        rem = lin_idx
        for size in reversed(shape):
            coords.append(rem % size)
            rem //= size
        coords = list(reversed(coords))

        row = {dim_order[i]: dim_codes[i][coords[i]] for i in range(len(dim_order))}
        row["value"] = val
        rows.append(row)

    return pd.DataFrame(rows)


In [5]:
DATASET = "hlth_co_dischls"  # Eurostat health family that includes LOS measures

js = eurostat_get(DATASET)
df_raw = eurostat_json_to_df(js)

print(df_raw.shape)
df_raw.head()


(5202, 7)


Unnamed: 0,freq,icha_hc,indic_he,unit,geo,time,value
0,A,HC1,ALOS,NR,AT,1989,11.01
1,A,HC1,ALOS,NR,AT,1990,11.12
2,A,HC1,ALOS,NR,AT,1991,10.5
3,A,HC1,ALOS,NR,AT,1992,10.15
4,A,HC1,ALOS,NR,AT,1993,9.96


In [6]:
raw_path = RAW_DIR / f"{DATASET}_raw.parquet"
df_raw.to_parquet(raw_path, index=False)
raw_path


PosixPath('/Users/punarnava/los-europe/data/raw/hlth_co_dischls_raw.parquet')

In [7]:
df = df_raw.copy()
df.columns = [c.lower() for c in df.columns]

# Basic fields
if "time" not in df.columns:
    raise ValueError(f"No 'time' column found. Columns: {df.columns.tolist()}")

df["year"] = pd.to_numeric(df["time"], errors="coerce")
df["los_days"] = pd.to_numeric(df["value"], errors="coerce")

# Country column usually geo
geo_col = "geo" if "geo" in df.columns else next((c for c in df.columns if "geo" in c), None)
if geo_col is None:
    raise ValueError(f"No geography column found. Columns: {df.columns.tolist()}")

df.rename(columns={geo_col: "country_code"}, inplace=True)

# Drop missings + choose year window
df = df.dropna(subset=["year", "los_days", "country_code"])
df = df[df["year"].between(2010, 2023)]

# If dataset includes multiple stratifiers, keep them for now
keep = ["country_code", "year", "los_days"]
extra = [c for c in ["unit", "sex", "age", "icd10", "diagnosis", "care", "hosp"] if c in df.columns]

df_clean = df[keep + extra].sort_values(["year", "country_code"]).reset_index(drop=True)

df_clean.head(), df_clean.shape


(  country_code  year     los_days unit
 0           AL  2010         5.80   NR
 1           AL  2010    255024.00   NR
 2           AT  2010         6.61   NR
 3           AT  2010  14474030.00   NR
 4           AT  2010   2189708.00   NR,
 (1647, 4))

In [8]:
df = df_raw.copy()
df.columns = [c.lower() for c in df.columns]

# Explicit slice = reproducible methods
df = df[
    (df["freq"] == "A") &
    (df["icha_hc"] == "HC1") &
    (df["indic_he"] == "ALOS")
].copy()

# Convert
df["year"] = pd.to_numeric(df["time"], errors="coerce")
df["los_days"] = pd.to_numeric(df["value"], errors="coerce")
df.rename(columns={"geo": "country_code"}, inplace=True)

# Drop missing + choose window
df = df.dropna(subset=["year", "los_days", "country_code"])
df = df[df["year"].between(2010, 2023)]

# Final study dataset columns (keep codes for methods transparency)
df_clean = df[["country_code", "year", "los_days", "icha_hc", "indic_he", "unit"]].sort_values(
    ["year", "country_code"]
).reset_index(drop=True)

df_clean.head(), df_clean.shape


(  country_code  year  los_days icha_hc indic_he unit
 0           AT  2010      6.61     HC1     ALOS   NR
 1           BE  2010      7.20     HC1     ALOS   NR
 2           CH  2010      7.85     HC1     ALOS   NR
 3           CY  2010      5.40     HC1     ALOS   NR
 4           CZ  2010      6.57     HC1     ALOS   NR,
 (333, 6))