TFT with information on (future) popualtion in each cohort and bundesland

In [1]:
import torch

# Check if CUDA is available (for NVIDIA GPUs)
cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if cuda_available:
    # Get the name of the current GPU
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU Name: {gpu_name}")

    # Get the number of available GPUs
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs: {gpu_count}")

    # Check CUDA version
    cuda_version = torch.version.cuda
    print(f"CUDA Version: {cuda_version}")

import copy
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
from pyreadr import read_r

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch
np.Inf = np.inf

import pickle

from pytorch_forecasting.models.temporal_fusion_transformer.tuning import (
    optimize_hyperparameters,
)

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import EncoderNormalizer
from pytorch_forecasting.metrics import QuantileLoss


CUDA Available: True
Using device: cuda
GPU Name: NVIDIA GeForce RTX 3080
Number of GPUs: 1
CUDA Version: 12.1


In [75]:
data_path = r"/data/simon/"
#data_path = r"C:\Users\simon.sarcletti\OneDrive - FH JOANNEUM\FH Joanneum - DAT\XX_Masterarbeit\05_Empirical_work\01_data\02_work"

all_munip_pop = read_r(data_path + r"all_municipalities_population.RData")[
    "all_munip_pop"
]

all_munip_pop["municipality_code"] = all_munip_pop["municipality_code"].astype("int64")

all_munip_pop["bundesland"] = (
    all_munip_pop["municipality_code"]
    .astype(str)     # in case it’s numeric
    .str[0]          # first character
    .astype(int)     # back to integer, if you want 1,2,… rather than "1","2",…
)

static_data_path = r"/data/simon/"
#static_data_path = r"C:\Users\simon.sarcletti\OneDrive - FH JOANNEUM\FH Joanneum - DAT\XX_Masterarbeit\05_Empirical_work\01_data\01_original"

static_metadata = pd.read_csv(
    static_data_path + r"static_variables.csv",
    encoding="latin-1",
    sep=";",
    decimal=",",
)


variable_metadata = read_r(data_path + r"aut_forecast_bl_sex_age_group.RData")["aut_forecast"]
variable_metadata = variable_metadata.rename(columns={"age_group": "coarse_age_group"})
variable_metadata_training = variable_metadata[variable_metadata['year'] <= 2024]
variable_metadata_prediction = variable_metadata[variable_metadata['year'] > 2024]

group_cols = ["year", "coarse_age_group", "sex"]
unique_age_sex_data = variable_metadata_training.drop_duplicates(subset=group_cols, keep='first')
unique_age_sex_data_prediction = variable_metadata_prediction.drop_duplicates(subset=group_cols, keep='first')
group_cols2 = ["year", "bundesland"]
unique_bl_data = variable_metadata_training.drop_duplicates(subset=group_cols2, keep='first')
unique_bl_data_prediction = variable_metadata_prediction.drop_duplicates(subset=group_cols2, keep='first')
merged_data = pd.merge(
    all_munip_pop,
    static_metadata,
    how="left",
    left_on="municipality_code",
    right_on="ID",
)
merged_data = pd.merge(
    merged_data,
    unique_age_sex_data[["year", "coarse_age_group", "sex", "smoothed_pop_per_age_group_sex"]],
    how="left",
    on= ["coarse_age_group", "year", "sex"]
)

merged_data = pd.merge(
    merged_data,
    unique_bl_data[["year", "bundesland", "smoothed_pop_per_bl"]],
    how="left",
    on=["year", "bundesland"]
)
# create an index col
merged_data["index"] = (
    merged_data["municipality_code"].astype(str)
    + "_"
    + merged_data["sex"].round(0).astype(str)
    + "_"
    + merged_data["coarse_age_group"]
)
# remove unnecessary columns
merged_data = merged_data.drop(
    columns=[
        "Name",
        "ID",
        "municipality_code",
        "reg_code",
        "municipality",
        "sex",
        "population",
    ]
) # maybe remove "Jahresbruttobezug_2023" as well

merged_data = merged_data[merged_data["year"] >= 2004].copy()

# create a new column with first three digits of index
merged_data["reg_code"] = merged_data["index"].str[:3]

merged_data = merged_data.rename(
    columns={"smoothed_population": "population", "coarse_age_group": "age_group"}
)
merged_data["year"] = pd.to_numeric(merged_data["year"], downcast="integer")

# drop col 'klassifikation_palme95
merged_data = merged_data.drop(columns=["klassifikation_palme95"])

static_categoricals = ['bundesland','Urban-Rural-Typologie','OeV-Güteklassen', 'Bezirkshauptstadt',
       'schulen_ue250', 'umkreis_schulen', 'haltestelle_IbIII',
       'haltestelle_umkreis', 'autobahnauffahrt', 'autobahnauffahrt_umkreis','umkreis_einpendler', 'reg_code',]

static_reals = ['Index_Pendlersaldos_2022','anteil_ue75_2014',
       'anteil_ue75_2024', 'durchschnittsalter', 'Jahresbruttobezug_2023',
       'anteil_frauen_1534_gesamtbevölkerung',
       'verkehrsleistung_personenkilometer_energiemosaik',
       'handelsgebaeude_1000ew_gwr', 'kulturgebaeude_1000ew_gwr',]

for col in static_categoricals:
    merged_data[col] = merged_data[col].astype(str)

# Prediction

In [None]:
best_model_path = "/data/lightning_logs/lightning_logs/version_3/checkpoints/epoch=49-step=2500.ckpt"

best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [136]:
max_encoder_length = 25
max_prediction_length = 11

# 1) grab the last 24 months of history
encoder_data = merged_data[lambda df: df.year > df.year.max() - max_encoder_length]

last_data = merged_data[lambda x: x.year == x.year.max()]

decoder_data = pd.concat(
    [
        last_data.assign(year=lambda x, i=i: x['year'] + i)
        for i in range(1, max_prediction_length + 1)
    ],
    ignore_index=True,
)

In [138]:
decoder_data = decoder_data.drop(columns=["smoothed_pop_per_age_group_sex", "smoothed_pop_per_bl"])
unique_bl_data_prediction['bundesland'] = unique_bl_data_prediction['bundesland'].astype(str)
decoder_data['year'] = decoder_data['year'].astype(int)
decoder_data = pd.merge(
    decoder_data,
    unique_bl_data_prediction[['year', 'bundesland', 'smoothed_pop_per_bl']],
    on=['year', 'bundesland'],
    how='left'
)

In [140]:
unique_age_sex_data_prediction['bundesland'] = unique_age_sex_data_prediction['bundesland'].astype(str)
decoder_data['year'] = decoder_data['year'].astype(int)
decoder_data = pd.merge(
    decoder_data,
    unique_bl_data_prediction[['year', 'bundesland', 'smoothed_pop_per_age_group_sex']],
    on=['year', 'bundesland'],
    how='left'
)

In [None]:
decoder_data.head()

Unnamed: 0,age_group,year,population,bundesland,Urban-Rural-Typologie,OeV-Güteklassen,Bezirkshauptstadt,schulen_ue250,umkreis_schulen,haltestelle_IbIII,...,durchschnittsalter,Jahresbruttobezug_2023,anteil_frauen_1534_gesamtbevölkerung,verkehrsleistung_personenkilometer_energiemosaik,handelsgebaeude_1000ew_gwr,kulturgebaeude_1000ew_gwr,index,reg_code,smoothed_pop_per_bl,smoothed_pop_per_age_group_sex
0,0 - 9,2025,739.666667,1,103,C,1,1,1,1,...,43.79,55672,10.881087,448559000,7.732119,0.436491,10101_1.0_0 - 9,101,302536.0,447005.0
1,10 - 19,2025,789.333333,1,103,C,1,1,1,1,...,43.79,55672,10.881087,448559000,7.732119,0.436491,10101_1.0_10 - 19,101,302536.0,447005.0
2,20 - 29,2025,933.0,1,103,C,1,1,1,1,...,43.79,55672,10.881087,448559000,7.732119,0.436491,10101_1.0_20 - 29,101,302536.0,447005.0
3,30 - 44,2025,1654.333333,1,103,C,1,1,1,1,...,43.79,55672,10.881087,448559000,7.732119,0.436491,10101_1.0_30 - 44,101,302536.0,447005.0
4,45 - 54,2025,1028.333333,1,103,C,1,1,1,1,...,43.79,55672,10.881087,448559000,7.732119,0.436491,10101_1.0_45 - 54,101,302536.0,447005.0


In [None]:


# combine encoder and decoder data
new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)

static_categoricals = ['Urban-Rural-Typologie', 'OeV-Güteklassen', 'Bezirkshauptstadt',
       'schulen_ue250', 'umkreis_schulen', 'haltestelle_IbIII',
       'haltestelle_umkreis', 'autobahnauffahrt', 'autobahnauffahrt_umkreis','umkreis_einpendler', 'reg_code',]

static_reals = ['Index_Pendlersaldos_2022','anteil_ue75_2014',
       'anteil_ue75_2024', 'durchschnittsalter', 'Jahresbruttobezug_2023',
       'anteil_frauen_1534_gesamtbevölkerung',
       'verkehrsleistung_personenkilometer_energiemosaik',
       'handelsgebaeude_1000ew_gwr', 'kulturgebaeude_1000ew_gwr',]

for col in static_categoricals:
    new_prediction_data[col] = new_prediction_data[col].astype(str)


In [None]:
# actual prediction
new_raw_predictions = best_tft.predict(
    new_prediction_data,
    mode="raw",
    return_x=True,
    return_index=True,
    trainer_kwargs=dict(accelerator="gpu"),
)

arr = new_raw_predictions.output.prediction.detach().cpu().numpy()



n_samples, n_steps, n_quantiles = arr.shape  # (33840, 11, 7) if 11 years, 7 quantiles
print(f"Shape of predictions: {arr.shape}")
# Repeat each sample index
original_index = np.repeat(new_raw_predictions.index["index"], n_quantiles * n_steps)

# Repeat quantiles and years
quantiles = ["0.01", "0.1", "0.25", "0.5", "0.75", "0.9", "0.99"]
quantile_column = np.tile(np.repeat(quantiles, n_steps), n_samples)
year_column = np.tile(list(range(2025, 2025 + n_steps)), n_samples * n_quantiles)

# Flatten prediction
prediction_column = arr.flatten()

# Build DataFrame
df_long = pd.DataFrame({
    "original_index": original_index,
    "quantile": quantile_column,
    "year": year_column,
    "prediction": prediction_column
})

df_long.to_csv("/home/v18y97/mt_pop_forecast/tft_prediction_2025-2035.csv", index=False)