# EUROSTAT CHDD Data Preprocessing for Italy

This notebook reshapes the EUROSTAT monthly Cooling and Heating Degree Days (CHDD) TSV
into small, analysis-friendly JSON files focused on Italy (`geo == "IT"`).

What you will get:
1. Cooling degree days (CDD): yearly totals and average seasonality by month.
2. Heating degree days (HDD): yearly totals, average seasonality by month, and
   per-(year, month) values across the full time range.

High-level flow:
1. Load and lightly normalize the EUROSTAT TSV.
2. Extract Italy-only series for CDD and HDD.
3. Aggregate into the shapes needed by downstream charts.
4. Save JSON outputs and verify them.


In [1]:
# --- Imports and configuration -------------------------------------------------
import json
from pathlib import Path

import pandas as pd

# Input data (downloaded EUROSTAT TSV)
DATA_PATH = Path("estat_nrg_chdd_m.tsv")

# We will export a small set of JSON files in the working directory.
OUTPUT_FILES = {
    "cdd_year": Path("cdd_italy_by_year.json"),
    "cdd_month": Path("cdd_italy_by_month.json"),
    "hdd_year": Path("hdd_italy_by_year.json"),
    "hdd_month": Path("hdd_italy_by_month.json"),
    "hdd_avg_month_by_year": Path("energy_avg_per_month_through_years.json"),
    "hdd_monthly_through_years": Path("hdd_monthly_through_years.json"),
}

TARGET_GEO = "IT"
INDICATORS = ("CDD", "HDD")

pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 12)


In [2]:
# --- Load TSV and define helpers ----------------------------------------------
print(f"Loading data from {DATA_PATH} ...")

df_raw = pd.read_csv(DATA_PATH, sep="\t", encoding="utf-8")
print(f"Loaded {len(df_raw):,} rows and {len(df_raw.columns)} columns")

# EUROSTAT packs multiple dimensions into the first column, e.g.:
#   "M,DEG_C,CDD,IT"
# We split it into separate columns for easier filtering.
dims_col = df_raw.columns[0]
df_raw[["freq", "unit", "indic_nrg", "geo"]] = df_raw[dims_col].str.split(",", expand=True
)

# Time columns are everything except the dimension columns.
DIMENSION_COLS = {dims_col, "freq", "unit", "indic_nrg", "geo"}
time_cols = [c for c in df_raw.columns if c not in DIMENSION_COLS]
print(f"Detected {len(time_cols)} monthly time columns")


def _coerce_value(value):
    """Convert EUROSTAT cell values to float when possible.

    EUROSTAT values sometimes include flags or extra whitespace. We keep the
    parsing logic in one place to make behavior explicit and easy to adjust.
    """
    if pd.isna(value):
        return None

    text = str(value).strip()
    if text == "":
        return None

    try:
        return float(text)
    except ValueError:
        # If a flag/annotation appears (e.g. "12.3 e"), we could add a more
        # advanced parser here. For now we skip non-numeric values.
        return None


def load_indic(df_source: pd.DataFrame, indic: str, geo: str = TARGET_GEO) -> pd.DataFrame:
    """Return a tidy monthly series for a given indicator and geography.

    Output columns:
    - period: original EUROSTAT period label (YYYY-MM)
    - value: numeric CHDD value
    - date: parsed datetime for sorting and feature extraction
    - year, month: numeric helpers for grouping
    """
    filtered = df_source[(df_source["geo"] == geo) & (df_source["indic_nrg"] == indic)]

    if filtered.empty:
        return pd.DataFrame(columns=["period", "value", "date", "year", "month"])

    records = []
    for _, row in filtered.iterrows():
        for period in time_cols:
            value = _coerce_value(row[period])
            if value is not None:
                records.append({"period": period.strip(), "value": value})

    out = pd.DataFrame(records)
    out["date"] = pd.to_datetime(out["period"], format="%Y-%m", errors="coerce")
    out = out.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
    out["year"] = out["date"].dt.year.astype(int)
    out["month"] = out["date"].dt.month.astype(int)
    return out


# Load Italy-specific CDD and HDD series.
df_cdd = load_indic(df_raw, "CDD")
df_hdd = load_indic(df_raw, "HDD")

if not df_cdd.empty:
    print(f"CDD: {len(df_cdd):,} records from {df_cdd['date'].min().date()} to {df_cdd['date'].max().date()}")
if not df_hdd.empty:
    print(f"HDD: {len(df_hdd):,} records from {df_hdd['date'].min().date()} to {df_hdd['date'].max().date()}")
if df_cdd.empty and df_hdd.empty:
    print(f"No data found for geo={TARGET_GEO} and indic_nrg in {INDICATORS}")

# Quick peek (useful while iterating; safe to keep here).
df_cdd.head()


Loading data from estat_nrg_chdd_m.tsv ...
Loaded 58 rows and 553 columns
Detected 552 monthly time columns
CDD: 552 records from 1979-01-01 to 2024-12-01
HDD: 552 records from 1979-01-01 to 2024-12-01


Unnamed: 0,period,value,date,year,month
0,1979-01,0.0,1979-01-01,1979,1
1,1979-02,0.0,1979-02-01,1979,2
2,1979-03,0.0,1979-03-01,1979,3
3,1979-04,0.0,1979-04-01,1979,4
4,1979-05,0.33,1979-05-01,1979,5


In [3]:
# --- Basic validation and summaries -------------------------------------------

def describe_series(name: str, df: pd.DataFrame) -> None:
    """Print compact diagnostics for a tidy CHDD series."""
    if df.empty:
        print(f"✗ {name}: no records")
        return

    print(f"✓ {name} loaded")
    print(
        "  - Records: {records:,} | Years: {ymin}–{ymax} | Values: {vmin:.2f}–{vmax:.2f}".format(
            records=len(df),
            ymin=df["year"].min(),
            ymax=df["year"].max(),
            vmin=df["value"].min(),
            vmax=df["value"].max(),
        )
    )


describe_series("CDD", df_cdd)
describe_series("HDD", df_hdd)


✓ CDD loaded
  - Records: 552 | Years: 1979–2024 | Values: 0.00–170.81
✓ HDD loaded
  - Records: 552 | Years: 1979–2024 | Values: 1.54–507.76


In [4]:
# --- Aggregations tailored to downstream charts -------------------------------

def aggregate_cdd(df_cdd: pd.DataFrame):
    """Compute yearly totals and average monthly seasonality for CDD."""
    if df_cdd.empty:
        return None, None

    yearly = (
        df_cdd.groupby("year", as_index=False)["value"].sum().rename(columns={"value": "consumption"})
    )

    monthly = (
        df_cdd.groupby("month", as_index=False)["value"].mean().rename(columns={"value": "consumption"})
    )
    monthly["month"] = monthly["month"].astype(int)

    return yearly, monthly


def aggregate_hdd(df_hdd: pd.DataFrame):
    """Compute the HDD aggregates used by the app/dashboard."""
    if df_hdd.empty:
        return None, None, None, None

    # 1) Yearly totals
    yearly = (
        df_hdd.groupby("year", as_index=False)["value"].sum().rename(columns={"value": "consumption"})
    )
    yearly["consumption"] = yearly["consumption"].round(2)

    # 2) Average seasonality by month (across all years)
    monthly = (
        df_hdd.groupby("month", as_index=False)["value"].mean().rename(columns={"value": "consumption"})
    )
    monthly["month"] = monthly["month"].astype(int)
    monthly["consumption"] = monthly["consumption"].round(4)

    # 3) "Average per month through years" (yearly total / 12)
    avg_monthly_by_year = yearly.copy()
    avg_monthly_by_year["avg_monthly_consumption"] = (avg_monthly_by_year["consumption"] / 12).round(4)

    # 4) Per-(year, month) values across the full range
    monthly_through_years = (
        df_hdd[["year", "month", "value"]]
        .rename(columns={"value": "consumption"})
        .sort_values(["year", "month"], ignore_index=True)
    )
    monthly_through_years["consumption"] = monthly_through_years["consumption"].round(2)

    return yearly, monthly, avg_monthly_by_year, monthly_through_years


yearly_data_cdd, monthly_data_cdd = aggregate_cdd(df_cdd)
(
    yearly_data_hdd,
    monthly_data_hdd,
    avg_monthly_by_year,
    monthly_through_years,
) = aggregate_hdd(df_hdd)

if yearly_data_cdd is not None:
    print(f"✓ CDD aggregated: {len(yearly_data_cdd)} yearly rows, {len(monthly_data_cdd)} monthly rows")
if yearly_data_hdd is not None:
    print(f"✓ HDD aggregated: {len(yearly_data_hdd)} yearly rows, {len(monthly_data_hdd)} monthly rows")
    print(f"  Avg monthly by year: {len(avg_monthly_by_year)} rows")
    print(f"  Monthly through years: {len(monthly_through_years)} rows")

# Optional: quick look at one of the outputs
yearly_data_hdd.head() if yearly_data_hdd is not None else None


✓ CDD aggregated: 46 yearly rows, 12 monthly rows
✓ HDD aggregated: 46 yearly rows, 12 monthly rows
  Avg monthly by year: 46 rows
  Monthly through years: 552 rows


Unnamed: 0,year,consumption
0,1979,2234.84
1,1980,2414.4
2,1981,2268.62
3,1982,2181.12
4,1983,2208.03


In [5]:
# --- Persistence (JSON exports) -----------------------------------------------

def save_json(records, path: Path) -> None:
    """Write a list of dictionaries to JSON with consistent formatting."""
    with path.open("w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    print(f"✓ Saved: {path}")


# CDD outputs
if yearly_data_cdd is not None and monthly_data_cdd is not None:
    save_json(yearly_data_cdd.to_dict("records"), OUTPUT_FILES["cdd_year"])
    save_json(monthly_data_cdd.to_dict("records"), OUTPUT_FILES["cdd_month"])

# HDD outputs
if yearly_data_hdd is not None and monthly_data_hdd is not None:
    save_json(yearly_data_hdd.to_dict("records"), OUTPUT_FILES["hdd_year"])
    save_json(monthly_data_hdd.to_dict("records"), OUTPUT_FILES["hdd_month"])

if avg_monthly_by_year is not None:
    save_json(
        avg_monthly_by_year[["year", "avg_monthly_consumption"]].to_dict("records"),
        OUTPUT_FILES["hdd_avg_month_by_year"],
    )

if monthly_through_years is not None:
    save_json(monthly_through_years.to_dict("records"), OUTPUT_FILES["hdd_monthly_through_years"])


# Compact run summary
print("\nSummary:")
if yearly_data_hdd is not None:
    print(f"  HDD yearly rows: {len(yearly_data_hdd)} | total: {yearly_data_hdd['consumption'].sum():.2f}")
if monthly_data_hdd is not None:
    print(f"  HDD monthly mean across months: {monthly_data_hdd['consumption'].mean():.2f}")
if avg_monthly_by_year is not None:
    print(
        "  HDD average monthly (overall mean): "
        f"{avg_monthly_by_year['avg_monthly_consumption'].mean():.2f}"
    )


✓ Saved: cdd_italy_by_year.json
✓ Saved: cdd_italy_by_month.json
✓ Saved: hdd_italy_by_year.json
✓ Saved: hdd_italy_by_month.json
✓ Saved: energy_avg_per_month_through_years.json
✓ Saved: hdd_monthly_through_years.json

Summary:
  HDD yearly rows: 46 | total: 91589.15
  HDD monthly mean across months: 165.92
  HDD average monthly (overall mean): 165.92


In [6]:
# --- Output verification -------------------------------------------------------

def preview_json(path: Path, n: int = 2) -> None:
    """Show a tiny preview to confirm shape and content."""
    if not path.exists():
        print(f"✗ {path} not found")
        return

    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"\n✓ {path} ({len(data)} records)")
    for record in data[:n]:
        print(f"  {record}")
    if len(data) > n:
        print("  ...")


for key in OUTPUT_FILES:
    preview_json(OUTPUT_FILES[key])



✓ cdd_italy_by_year.json (46 records)
  {'year': 1979, 'consumption': 75.6}
  {'year': 1980, 'consumption': 62.52}
  ...

✓ cdd_italy_by_month.json (12 records)
  {'month': 1, 'consumption': 0.0}
  {'month': 2, 'consumption': 0.0002173913043478261}
  ...

✓ hdd_italy_by_year.json (46 records)
  {'year': 1979, 'consumption': 2234.84}
  {'year': 1980, 'consumption': 2414.4}
  ...

✓ hdd_italy_by_month.json (12 records)
  {'month': 1, 'consumption': 395.2848}
  {'month': 2, 'consumption': 334.9759}
  ...

✓ energy_avg_per_month_through_years.json (46 records)
  {'year': 1979, 'avg_monthly_consumption': 186.2367}
  {'year': 1980, 'avg_monthly_consumption': 201.2}
  ...

✓ hdd_monthly_through_years.json (552 records)
  {'year': 1979, 'month': 1, 'consumption': 458.58}
  {'year': 1979, 'month': 2, 'consumption': 328.8}
  ...
