In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.cm as cm
import matplotlib.dates as mdates
from scipy.signal import find_peaks
import openmeteo_requests
import requests_cache
from datetime import date, timedelta
import seaborn as sns
import numpy as np
import os
import sys
import pygam
import sklearn
sys.path.append("..")
from pygam import LinearGAM, f, s, te, l 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error

citing pyGAM: Servén D., Brummitt C. (2018). pyGAM: Generalized Additive Models in Python. Zenodo. DOI: 10.5281/zenodo.1208723

Prepare MSTL data for the GAM

In [2]:
# Read the cleaned MSTL decomposition results (one row per counter site and hour).
mstl_data = pd.read_csv("../data/mstl_results_clean.csv", low_memory=False)

# Quick overview of the available columns.
print(list(mstl_data.columns))

['timestamp', 'city', 'counter_site', 'longitude', 'latitude', 'count', 'count_capped', 'log_count', 'trend', 'seasonal_24', 'seasonal_168', 'seasonal_8766', 'residual']


In [3]:
# Keep only the columns the GAMs need.
gam_input_cols = ['timestamp', 'city', 'counter_site', 'seasonal_8766', 'residual']
residual_data = mstl_data.loc[:, gam_input_cols].copy()

# Drop rows that contain the column names instead of values
# (repeated header lines inside the CSV).
is_header_row = residual_data['timestamp'] == 'timestamp'
residual_data = residual_data[~is_header_row]

# Numeric id per (city, counter_site) combination.
residual_data['id'] = residual_data.groupby(['city', 'counter_site']).ngroup()

# Target: yearly seasonal component plus residual of the decomposition.
residual_part = residual_data['residual'].astype(float)
yearly_part = residual_data['seasonal_8766'].astype(float)
residual_data['year_plus_res'] = residual_part.values + yearly_part.values
residual_data.head(2)

Unnamed: 0,timestamp,city,counter_site,seasonal_8766,residual,id,year_plus_res
0,2012-12-31 23:00:00+00:00,Landeshauptstadt Stuttgart,König-Karls-Brücke Barometer,-1.8545503347710663,0.3062034432133602,5,-1.548347
1,2013-01-01 00:00:00+00:00,Landeshauptstadt Stuttgart,König-Karls-Brücke Barometer,-0.9312113949920524,0.7172102469645241,5,-0.214001


In [4]:
# Reference instant (UTC): the smallest timestamp string in the data. The
# comparison is lexicographic, which matches chronological order as long as
# all timestamps share the same ISO format and offset.
ref = pd.Timestamp(min(residual_data['timestamp']))
print(ref)

# 1) Parse the column once (vectorised) and express it in local Berlin time.
t_0 = pd.to_datetime(residual_data['timestamp'], errors='coerce').dt.tz_convert('Europe/Berlin')

# 2) Hours elapsed since the reference instant (vectorised).
elapsed = t_0 - ref
residual_data['timestamp_number'] = elapsed.dt.total_seconds() / 3600.0

# Calendar features, taken from the local (Berlin) time.
residual_data['weekday'] = t_0.dt.dayofweek.astype(float).values            # 0–6
residual_data['hour'] = t_0.dt.hour.astype(float).values                    # 0–23

# Sanity checks: every weekday (0..6) and every hour (0..23) occurs, and nothing else.
for column, n_levels in (('weekday', 7), ('hour', 24)):
    assert set(residual_data[column]) == set(range(n_levels))

2012-12-31 23:00:00+00:00


Prepare weather data for GAM

In [5]:
# Read the hourly weather table (observed and forecast values per city).
weather_data = pd.read_csv("../data/weather_per_city.csv", low_memory=False)

# Print a quick overview: first rows, missing values per column, column names.
for overview in (weather_data.head(2), weather_data.isna().sum(), weather_data.columns.tolist()):
    print(overview)

                        date  temperature_2m  apparent_temperature  rain  \
0  2012-12-30 23:00:00+00:00          5.8285              1.619927   0.0   
1  2012-12-31 00:00:00+00:00          5.8285              1.688656   0.0   

   snowfall  forecast_temperature_2m  forecast_apparent_temperature  \
0       0.0                      NaN                            NaN   
1       0.0                      NaN                            NaN   

   forecast_rain  forecast_snowfall                        city  
0            NaN                NaN  Landeshauptstadt Stuttgart  
1            NaN                NaN  Landeshauptstadt Stuttgart  
date                                   0
temperature_2m                         0
apparent_temperature                   0
rain                                   0
snowfall                               0
forecast_temperature_2m          1586134
forecast_apparent_temperature    1586134
forecast_rain                    1586156
forecast_snowfall              

In [6]:
# Distinct city names present in the counter data; the later per-city loops
# iterate over this array.
cities = residual_data['city'].unique()
print(cities)

['Landeshauptstadt Stuttgart' 'Stadt Freiburg' 'Stadt Heidelberg'
 'Stadt Ludwigsburg' 'Stadt Mannheim' 'Stadt Reutlingen' 'Stadt Tübingen']


In [7]:
def split_data(X_weather: pd.DataFrame, X_forecast: pd.DataFrame, y: pd.Series, site_col: str = "id", time_col: str = "timestamp_number", train_frac: float = 0.8,):
    """Split the data chronologically into train and test parts, per site.

    The rows of ``X_weather`` are grouped by ``site_col``. Within each group
    they are ordered by ``time_col``; the first ``int(train_frac * n)`` rows
    go to the training set and the remaining rows to the test set. The same
    index labels are then used to slice ``X_forecast`` and ``y``.

    Parameters
    ----------
    X_weather : pd.DataFrame
        Feature table that defines the split; must contain ``site_col`` and
        ``time_col``.
    X_forecast : pd.DataFrame
        Second feature table sharing the index labels of ``X_weather``; must
        contain ``site_col``.
    y : pd.Series
        Target values sharing the index labels of ``X_weather``.
    site_col : str
        Column identifying the site.
    time_col : str
        Column used for the chronological ordering.
    train_frac : float
        Fraction of each site's rows assigned to the training set.

    Returns
    -------
    tuple
        ``(X_weather_train, X_weather_test, X_forecast_train,
        X_forecast_test, y_train, y_test)``, each an independent copy.

    Raises
    ------
    AssertionError
        If the set of sites differs between the train and the test part, or
        if the lengths of the parts do not match.
    """
    train_labels = []
    test_labels = []

    # Collect the index labels of both parts, one site at a time.
    for _, site_rows in X_weather.groupby(site_col):
        ordered_labels = site_rows.sort_values(time_col).index
        cut = int(train_frac * len(site_rows))
        train_labels.extend(ordered_labels[:cut])
        test_labels.extend(ordered_labels[cut:])

    def _select(table, labels):
        # Label-based selection; the copy decouples the result from the input.
        return table.loc[labels].copy()

    X_weather_train = _select(X_weather, train_labels)
    X_weather_test = _select(X_weather, test_labels)
    X_forecast_train = _select(X_forecast, train_labels)
    X_forecast_test = _select(X_forecast, test_labels)
    y_train = _select(y, train_labels)
    y_test = _select(y, test_labels)

    # Every site has to appear on both sides of the split.
    for train_part, test_part in (
        (X_weather_train, X_weather_test),
        (X_forecast_train, X_forecast_test),
    ):
        assert set(train_part[site_col]) == set(test_part[site_col])

    # Features and targets must stay aligned in length.
    assert len(y_train) == len(X_weather_train) == len(X_forecast_train)
    assert len(y_test) == len(X_weather_test) == len(X_forecast_test)

    return X_weather_train, X_weather_test, X_forecast_train, X_forecast_test, y_train, y_test

In [8]:
# Scaling statistics (mean and standard deviation) are estimated on the
# training rows only and then applied to all rows, so no information from the
# test period leaks into the features.
#
# For every city:
#   1. merge the counter data with the city's weather data,
#   2. drop rows with missing target / weather / forecast values,
#   3. split chronologically per counter site (see `split_data`),
#   4. z-score the weather columns with the training statistics,
#   5. store all pieces in `prepared_data_dict[city]`.

# Column order matters: the GAM terms address features by position.
feature_cols_weather = ['timestamp_number', 'id', 'temperature_2m', 'rain', 'snowfall', 'weekday', 'hour']
feature_cols_forecast = ['timestamp_number', 'id', 'forecast_temperature_2m', 'forecast_rain', 'forecast_snowfall', 'weekday', 'hour']

# Columns that get standardised (time and id columns stay untouched).
WEATHER_VALUE_COLS = ['temperature_2m', 'rain', 'snowfall']
FORECAST_VALUE_COLS = ['forecast_temperature_2m', 'forecast_rain', 'forecast_snowfall']


def standardize_with_train_stats(train_df, test_df, full_df, columns):
    """Z-score `columns` of all three frames in place using training statistics.

    Mean and (sample) standard deviation are computed on `train_df` only and
    applied to `train_df`, `test_df` and `full_df`.
    """
    for col in columns:
        mean = train_df[col].mean()
        std = train_df[col].std()
        # A constant training column (e.g. a period without any snowfall) has
        # std 0, and a single training row gives NaN. Dividing by that would
        # fill the feature with NaN/inf, so only centre the column in that case.
        if not std > 0:
            std = 1.0
        for frame in (train_df, test_df, full_df):
            frame[col] = (frame[col] - mean) / std


prepared_data_dict = {}

for city in cities:

    city_data = residual_data[residual_data['city'] == city]
    w = weather_data[weather_data['city'] == city].copy()

    # Left join: keep every counter row and attach the weather of the same
    # city and hour (`timestamp` in the counter data, `date` in the weather data).
    merged_data = pd.merge(
        city_data,
        w,
        left_on=['city', 'timestamp'],
        right_on=['city', 'date'],
        how='left',
    )

    # Features and target.
    X_weather = merged_data[feature_cols_weather].astype(float)
    X_forecast = merged_data[feature_cols_forecast].astype(float)
    y = merged_data['year_plus_res']

    # Keep only rows that are complete in the target and in BOTH feature sets,
    # so the observed and the forecast model see exactly the same rows.
    valid_idx = (
        y.notna()
        & X_weather.notna().all(axis=1)
        & X_forecast.notna().all(axis=1)
    )
    X_weather = X_weather.loc[valid_idx].copy()
    X_forecast = X_forecast.loc[valid_idx].copy()
    y = y.loc[valid_idx].copy()

    X_weather_train, X_weather_test, X_forecast_train, X_forecast_test, y_train, y_test = split_data(X_weather, X_forecast, y)

    # Full (unsplit) feature tables in the original row order, scaled like the
    # train/test parts.
    X_weather_normalized = X_weather.copy()
    X_forecast_normalized = X_forecast.copy()

    standardize_with_train_stats(X_weather_train, X_weather_test, X_weather_normalized, WEATHER_VALUE_COLS)
    standardize_with_train_stats(X_forecast_train, X_forecast_test, X_forecast_normalized, FORECAST_VALUE_COLS)

    # Remember the column names before the frames are turned into arrays.
    feature_names_weather = X_weather_train.columns.tolist()
    feature_names_forecast = X_forecast_train.columns.tolist()

    # The models are fitted on plain NumPy arrays.
    X_weather_train = X_weather_train.values
    X_weather_test = X_weather_test.values

    X_forecast_train = X_forecast_train.values
    X_forecast_test = X_forecast_test.values

    y_train = y_train.values
    y_test = y_test.values

    # Consistency checks before the data is stored.
    assert len(X_weather_train) == len(y_train)
    assert len(X_weather_test) == len(y_test)
    assert len(X_forecast_train) == len(y_train)
    assert len(X_forecast_test) == len(y_test)
    assert len(X_weather_train) + len(X_weather_test) == len(X_weather_normalized)
    assert len(X_forecast_train) + len(X_forecast_test) == len(X_forecast_normalized)

    prepared_data_dict[city] = {
        'X_weather_train': X_weather_train,
        'X_weather_test': X_weather_test,
        'X_forecast_train': X_forecast_train,
        'X_forecast_test': X_forecast_test,
        'y_train': y_train,
        'y_test': y_test,
        'feature_cols_weather': feature_names_weather,
        'feature_cols_forecast': feature_names_forecast,
        'X_weather_normalized': X_weather_normalized,
        'X_forecast_normalized': X_forecast_normalized,
    }

In [9]:
# Show the column order of the observed-weather feature matrix for one city;
# the GAM terms refer to these columns by position.
prepared_data_dict['Stadt Tübingen']['feature_cols_weather']

['timestamp_number',
 'id',
 'temperature_2m',
 'rain',
 'snowfall',
 'weekday',
 'hour']

In [10]:
# Shapes of the prepared arrays for one example city. Everything is read from
# `prepared_data_dict`, so the cell does not depend on variables left over
# from the last iteration of the preparation loop.
tuebingen_data = prepared_data_dict['Stadt Tübingen']
print(tuebingen_data['X_weather_train'].shape)
print(tuebingen_data['X_weather_test'].shape)
print(tuebingen_data['X_weather_normalized'].shape)

(67375, 7)
(16845, 7)
(84220, 7)


# Natalies Modell

In [11]:
# Train two GAMs per city on the same rows:
#   * "observed": weather features are the measured values,
#   * "forecast": weather features are the forecast values.
# Metrics go to `metrics_dict` (keys "<city> weather" / "<city> forecast"),
# fitted models plus their data go to `city_models` for the effect plots.
#
# To debug on a subset, restrict `cities` before running this cell, e.g.
# cities = ["Stadt Tübingen", "Stadt Heidelberg"]
metrics_dict = {}
city_models = {"observed": {}, "forecast": {}}


def build_weather_gam():
    """Return an unfitted LinearGAM for the 7-column feature layout.

    Column positions (identical for observed and forecast features):
    0 timestamp_number, 1 id, 2 temperature, 3 rain, 4 snowfall,
    5 weekday, 6 hour.
    """
    return LinearGAM(
        f(1) +                        # counter-site id as factor
        # weather main effects
        s(2, n_splines=6) +           # temperature
        s(3, n_splines=6) +           # rain
        s(4, n_splines=6) +           # snowfall
        # interactions; weekday and hour use a cyclic ('cp') basis
        te(2, 3, n_splines=6) +                             # temperature x rain
        te(2, 5, n_splines=[6, 7], basis=['ps', 'cp']) +    # temperature x weekday
        te(3, 5, n_splines=[6, 7], basis=['ps', 'cp']) +    # rain x weekday
        te(2, 6, n_splines=[6, 7], basis=['ps', 'cp']) +    # temperature x hour
        te(3, 6, n_splines=[6, 7], basis=['ps', 'cp'])      # rain x hour
    )


def evaluate_gam(gam, X_train, y_train, X_test, y_test):
    """Collect fit statistics and train/test scores of a fitted GAM in a dict."""
    stats = gam.statistics_
    y_train_hat = gam.predict(X_train)
    y_test_hat = gam.predict(X_test)
    return {
        'lambdas': gam.lam,
        'AIC': stats.get('AIC'),
        'edof': stats.get('edof'),
        'GCV': stats.get('GCV'),
        'r2_train': r2_score(y_train, y_train_hat),
        'r2_test': r2_score(y_test, y_test_hat),
        'rmse_train': np.sqrt(mean_squared_error(y_train, y_train_hat)),
        'rmse_test': np.sqrt(mean_squared_error(y_test, y_test_hat)),
        'y_train_hat': y_train_hat,
        'y_test_hat': y_test_hat,
    }


def stack_train_test(X_train, X_test, y_train, y_test, columns):
    """Return all rows of one city as (X DataFrame, y array), row-aligned.

    Rows are ordered train part first, then test part. The features carry the
    same training-based scaling as the arrays the model was fitted on.
    """
    X_all = pd.DataFrame(np.vstack([X_train, X_test]), columns=columns)
    y_all = np.concatenate([y_train, y_test])
    return X_all, y_all


for city in cities:
    city_arrays = prepared_data_dict[city]
    y_train = city_arrays['y_train']
    y_test = city_arrays['y_test']

    # (key in city_models, suffix of the metrics key, train X, test X, column names)
    variants = (
        ("observed", " weather",
         city_arrays['X_weather_train'], city_arrays['X_weather_test'],
         city_arrays['feature_cols_weather']),
        ("forecast", " forecast",
         city_arrays['X_forecast_train'], city_arrays['X_forecast_test'],
         city_arrays['feature_cols_forecast']),
    )

    for model_key, metrics_suffix, X_train, X_test, columns in variants:
        gam = build_weather_gam()
        # Grid search over the smoothing penalty; fits the model in place.
        gam.gridsearch(X_train, y_train)

        # Store the model together with THIS city's data. The previous code
        # stored a global `y` left over from the data-preparation loop, i.e.
        # the target of the last prepared city, for every city.
        X_all, y_all = stack_train_test(X_train, X_test, y_train, y_test, columns)
        city_models[model_key][city] = {
            "gam": gam,
            "X": X_all,
            "y": y_all,
        }

        metrics_dict[city + metrics_suffix] = evaluate_gam(gam, X_train, y_train, X_test, y_test)

    print(f"Completed GAM training and metrics calculation for city: {city}")

  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                       | Elapsed Time: 0:00:21 ETA:   0:03:32
 18% (2 of 11) |####                     | Elapsed Time: 0:00:43 ETA:   0:03:15
 27% (3 of 11) |######                   | Elapsed Time: 0:01:15 ETA:   0:03:20
 36% (4 of 11) |#########                | Elapsed Time: 0:01:42 ETA:   0:02:59
 45% (5 of 11) |###########              | Elapsed Time: 0:02:13 ETA:   0:02:39
 54% (6 of 11) |#############            | Elapsed Time: 0:02:39 ETA:   0:02:13
 63% (7 of 11) |###############          | Elapsed Time: 0:03:06 ETA:   0:01:46
 72% (8 of 11) |##################       | Elapsed Time: 0:03:34 ETA:   0:01:20
 81% (9 of 11) |####################     | Elapsed Time: 0:04:03 ETA:   0:00:54
 90% (10 of 11) |#####################   | Elapsed Time: 0:04:29 ETA:   0:00:26
100% (11 of 11) |########################| Elapsed Time: 0:04:52 Time:  0:04:52
  0% (0 of 11) |                        

Completed GAM training and metrics calculation for city: Landeshauptstadt Stuttgart


  9% (1 of 11) |##                       | Elapsed Time: 0:00:03 ETA:   0:00:36
 18% (2 of 11) |####                     | Elapsed Time: 0:00:07 ETA:   0:00:34
 27% (3 of 11) |######                   | Elapsed Time: 0:00:11 ETA:   0:00:31
 36% (4 of 11) |#########                | Elapsed Time: 0:00:15 ETA:   0:00:27
 45% (5 of 11) |###########              | Elapsed Time: 0:00:19 ETA:   0:00:23
 54% (6 of 11) |#############            | Elapsed Time: 0:00:23 ETA:   0:00:19
 63% (7 of 11) |###############          | Elapsed Time: 0:00:27 ETA:   0:00:15
 72% (8 of 11) |##################       | Elapsed Time: 0:00:31 ETA:   0:00:11
 81% (9 of 11) |####################     | Elapsed Time: 0:00:35 ETA:   0:00:07
 90% (10 of 11) |#####################   | Elapsed Time: 0:00:39 ETA:   0:00:03
100% (11 of 11) |########################| Elapsed Time: 0:00:43 Time:  0:00:43
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Freiburg


  9% (1 of 11) |##                       | Elapsed Time: 0:00:10 ETA:   0:01:48
 18% (2 of 11) |####                     | Elapsed Time: 0:00:21 ETA:   0:01:38
 27% (3 of 11) |######                   | Elapsed Time: 0:00:32 ETA:   0:01:27
 36% (4 of 11) |#########                | Elapsed Time: 0:00:43 ETA:   0:01:16
 45% (5 of 11) |###########              | Elapsed Time: 0:00:55 ETA:   0:01:06
 54% (6 of 11) |#############            | Elapsed Time: 0:01:06 ETA:   0:00:55
 63% (7 of 11) |###############          | Elapsed Time: 0:01:17 ETA:   0:00:44
 72% (8 of 11) |##################       | Elapsed Time: 0:01:28 ETA:   0:00:33
 81% (9 of 11) |####################     | Elapsed Time: 0:01:39 ETA:   0:00:22
 90% (10 of 11) |#####################   | Elapsed Time: 0:01:50 ETA:   0:00:11
100% (11 of 11) |########################| Elapsed Time: 0:02:02 Time:  0:02:02
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Heidelberg


  9% (1 of 11) |##                       | Elapsed Time: 0:00:19 ETA:   0:03:12
 18% (2 of 11) |####                     | Elapsed Time: 0:00:38 ETA:   0:02:54
 27% (3 of 11) |######                   | Elapsed Time: 0:00:57 ETA:   0:02:34
 36% (4 of 11) |#########                | Elapsed Time: 0:01:17 ETA:   0:02:14
 45% (5 of 11) |###########              | Elapsed Time: 0:01:36 ETA:   0:01:55
 54% (6 of 11) |#############            | Elapsed Time: 0:01:55 ETA:   0:01:35
 63% (7 of 11) |###############          | Elapsed Time: 0:02:14 ETA:   0:01:16
 72% (8 of 11) |##################       | Elapsed Time: 0:02:33 ETA:   0:00:57
 81% (9 of 11) |####################     | Elapsed Time: 0:02:52 ETA:   0:00:38
 90% (10 of 11) |#####################   | Elapsed Time: 0:03:11 ETA:   0:00:19
100% (11 of 11) |########################| Elapsed Time: 0:03:30 Time:  0:03:30
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Ludwigsburg


  9% (1 of 11) |##                       | Elapsed Time: 0:00:11 ETA:   0:01:55
 18% (2 of 11) |####                     | Elapsed Time: 0:00:23 ETA:   0:01:45
 27% (3 of 11) |######                   | Elapsed Time: 0:00:35 ETA:   0:01:34
 36% (4 of 11) |#########                | Elapsed Time: 0:00:47 ETA:   0:01:22
 45% (5 of 11) |###########              | Elapsed Time: 0:00:59 ETA:   0:01:11
 54% (6 of 11) |#############            | Elapsed Time: 0:01:11 ETA:   0:00:59
 63% (7 of 11) |###############          | Elapsed Time: 0:01:23 ETA:   0:00:47
 72% (8 of 11) |##################       | Elapsed Time: 0:01:35 ETA:   0:00:35
 81% (9 of 11) |####################     | Elapsed Time: 0:01:47 ETA:   0:00:23
 90% (10 of 11) |#####################   | Elapsed Time: 0:01:59 ETA:   0:00:11
100% (11 of 11) |########################| Elapsed Time: 0:02:11 Time:  0:02:11
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Mannheim


  9% (1 of 11) |##                       | Elapsed Time: 0:00:11 ETA:   0:01:50
 18% (2 of 11) |####                     | Elapsed Time: 0:00:22 ETA:   0:01:39
 27% (3 of 11) |######                   | Elapsed Time: 0:00:33 ETA:   0:01:28
 36% (4 of 11) |#########                | Elapsed Time: 0:00:44 ETA:   0:01:17
 45% (5 of 11) |###########              | Elapsed Time: 0:00:55 ETA:   0:01:06
 54% (6 of 11) |#############            | Elapsed Time: 0:01:07 ETA:   0:00:56
 63% (7 of 11) |###############          | Elapsed Time: 0:01:20 ETA:   0:00:45
 72% (8 of 11) |##################       | Elapsed Time: 0:01:31 ETA:   0:00:34
 81% (9 of 11) |####################     | Elapsed Time: 0:01:43 ETA:   0:00:22
 90% (10 of 11) |#####################   | Elapsed Time: 0:01:56 ETA:   0:00:11
100% (11 of 11) |########################| Elapsed Time: 0:02:08 Time:  0:02:08
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Reutlingen


  9% (1 of 11) |##                       | Elapsed Time: 0:00:02 ETA:   0:00:25
 18% (2 of 11) |####                     | Elapsed Time: 0:00:05 ETA:   0:00:24
 27% (3 of 11) |######                   | Elapsed Time: 0:00:08 ETA:   0:00:22
 36% (4 of 11) |#########                | Elapsed Time: 0:00:11 ETA:   0:00:19
 45% (5 of 11) |###########              | Elapsed Time: 0:00:14 ETA:   0:00:16
 54% (6 of 11) |#############            | Elapsed Time: 0:00:16 ETA:   0:00:14
 63% (7 of 11) |###############          | Elapsed Time: 0:00:19 ETA:   0:00:11
 72% (8 of 11) |##################       | Elapsed Time: 0:00:22 ETA:   0:00:08
 81% (9 of 11) |####################     | Elapsed Time: 0:00:25 ETA:   0:00:05
 90% (10 of 11) |#####################   | Elapsed Time: 0:00:28 ETA:   0:00:02
100% (11 of 11) |########################| Elapsed Time: 0:00:31 Time:  0:00:31
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Tübingen


In [12]:
import copy
import json

# Convert a deep copy only, so that `metrics_dict` keeps its NumPy arrays.
metrics_json = copy.deepcopy(metrics_dict)

# NumPy arrays are not JSON serialisable; turn the predictions into lists.
for city_key, city_metrics in metrics_json.items():
    for k in ["y_train_hat", "y_test_hat"]:
        if k in city_metrics and isinstance(city_metrics[k], np.ndarray):
            city_metrics[k] = city_metrics[k].tolist()

# The file handle must not be called `f`: that name is pyGAM's factor term
# imported at the top of the notebook, and overwriting it breaks any later
# re-run of the training cell.
with open("metrics.json", "w") as metrics_file:
    json.dump(metrics_json, metrics_file, indent=4)

In [13]:
from utils.gam_result_utils import save_temp_effect_results, save_rain_effect_results

# Expected structure of city_models:
#
# city_models = {
#     "observed": {
#         "<city_name>": {
#             "gam": <trained GAM model>,
#             "X": np.ndarray of shape (n_samples, n_features),
#             "y": np.ndarray of shape (n_samples,)   # optional for plotting
#         },
#         ...
#     },
#     "forecast": {
#         "<city_name>": {
#             "gam": <trained GAM model>,
#             "X": np.ndarray of shape (n_samples, n_features),
#             "y": np.ndarray of shape (n_samples,)   # optional for plotting
#         },
#         ...
#     }
# }
#
# NOTE(review): the training cell stores "X" as a pandas DataFrame rather than
# an ndarray — confirm that the helpers accept both.
# temp_col / rain_col / snow_col are the positions of temperature, rain and
# snowfall in the feature column list (2, 3 and 4).
save_temp_effect_results(city_models, out_path="../data/temp_effect_results.pkl", temp_col=2, rain_col=3, snow_col=4)
save_rain_effect_results(city_models, out_path="../data/rain_effect_results.pkl")

  X_base[temp_col] = global_mean_temp
  X_base[rain_col] = 0.0
  X_base[snow_col] = snow_fixed


Saved temperature effect results to ../data/temp_effect_results.pkl


  X_base[snow_col] = snow_fixed
  X_base_temp[temp_col] = T
  X_base_temp[rain_col] = 0.0


Saved rain effect results to ../data/rain_effect_results.pkl


In [14]:
# Text overview of all metrics, one paragraph per model.
for model_name, model_metrics in metrics_dict.items():
    print(f"{model_name}:")
    for metric_name in model_metrics:
        # The smoothing parameters are long nested lists; leave them out.
        if metric_name == 'lambdas':
            continue
        print(f"  {metric_name}: {model_metrics[metric_name]}")
    print()

Landeshauptstadt Stuttgart weather:
  AIC: 614775.645948556
  edof: 132.9395955290275
  GCV: 0.20405890869879081
  r2_train: 0.3784127317875543
  r2_test: 0.35194038087129487
  rmse_train: 0.4515580860278547
  rmse_test: 0.42970313864282766
  y_train_hat: [-0.26503225 -0.25151786 -0.23127566 ... -0.16470612 -0.13598063
 -0.20434688]
  y_test_hat: [-0.46273063 -0.49471938 -0.52336929 ... -0.37202941 -0.35942416
 -0.34702651]

Landeshauptstadt Stuttgart forecast:
  AIC: 608708.3107169901
  edof: 126.68992334152122
  GCV: 0.20155830024159016
  r2_train: 0.386008052950082
  r2_test: 0.35172057755304853
  rmse_train: 0.4487907591109719
  rmse_test: 0.4297760039823768
  y_train_hat: [-0.31596634 -0.29566958 -0.27540366 ... -0.22393245 -0.15566739
 -0.21172502]
  y_test_hat: [-0.41603784 -0.4323226  -0.45249828 ... -0.38379908 -0.36313845
 -0.3560189 ]

Stadt Freiburg weather:
  AIC: 63326.38763313735
  edof: 113.61921315224426
  GCV: 0.11591952270775645
  r2_train: 0.2651506436936383
  r2_te

In [16]:
# Disabled cell: the code below is wrapped in a string literal, so no lambdas
# are printed; the notebook only echoes the string itself as the cell result.
"""# print all lambdas
for key, value in metrics_dict.items():
    print(f"{key} lambdas: {value['lambdas']}")"""

'# print all lambdas\nfor key, value in metrics_dict.items():\n    print(f"{key} lambdas: {value[\'lambdas\']}")'

In [17]:
# Show all scalar metrics as one table (one row per model).

# Entries that are not scalar metrics: predictions and smoothing parameters.
non_scalar_entries = ['y_train_hat', 'y_test_hat', 'lambdas']

metrics_rows = [
    {
        'Model': model_name,
        **{name: val for name, val in model_metrics.items() if name not in non_scalar_entries},
    }
    for model_name, model_metrics in metrics_dict.items()
]

# Build the frame and sort by model name (i.e. by city, then model type).
metrics_df = pd.DataFrame(metrics_rows).sort_values('Model')

# Round the numeric columns for readability.
numeric_cols = metrics_df.select_dtypes(include=[np.number]).columns
metrics_df[numeric_cols] = metrics_df[numeric_cols].round(4)

# Render the table.
display(metrics_df)

Unnamed: 0,Model,AIC,edof,GCV,r2_train,r2_test,rmse_train,rmse_test
1,Landeshauptstadt Stuttgart forecast,608708.3107,126.6899,0.2016,0.386,0.3517,0.4488,0.4298
0,Landeshauptstadt Stuttgart weather,614775.6459,132.9396,0.2041,0.3784,0.3519,0.4516,0.4297
3,Stadt Freiburg forecast,63823.6086,108.318,0.1165,0.2611,0.0711,0.3408,0.3346
2,Stadt Freiburg weather,63326.3876,113.6192,0.1159,0.2652,0.1495,0.3399,0.3202
5,Stadt Heidelberg forecast,211747.7526,126.2528,0.1361,0.2413,0.0096,0.3687,0.6591
4,Stadt Heidelberg weather,210440.5293,132.9943,0.1354,0.2453,0.016,0.3677,0.657
7,Stadt Ludwigsburg forecast,507815.0469,127.0991,0.1844,0.3537,0.3297,0.4293,0.4155
6,Stadt Ludwigsburg weather,513198.6446,133.6228,0.1867,0.3458,0.3211,0.4319,0.4182
9,Stadt Mannheim forecast,247142.8227,128.4995,0.1434,0.2588,-0.3563,0.3784,0.619
8,Stadt Mannheim weather,248119.8945,132.1135,0.1439,0.2562,0.0341,0.3791,0.5224


In [18]:
# Plain-text rendering of the metrics data frame (without the index column).
print(metrics_df.to_string(index=False))

                              Model         AIC     edof    GCV  r2_train  r2_test  rmse_train  rmse_test
Landeshauptstadt Stuttgart forecast 608708.3107 126.6899 0.2016    0.3860   0.3517      0.4488     0.4298
 Landeshauptstadt Stuttgart weather 614775.6459 132.9396 0.2041    0.3784   0.3519      0.4516     0.4297
            Stadt Freiburg forecast  63823.6086 108.3180 0.1165    0.2611   0.0711      0.3408     0.3346
             Stadt Freiburg weather  63326.3876 113.6192 0.1159    0.2652   0.1495      0.3399     0.3202
          Stadt Heidelberg forecast 211747.7526 126.2528 0.1361    0.2413   0.0096      0.3687     0.6591
           Stadt Heidelberg weather 210440.5293 132.9943 0.1354    0.2453   0.0160      0.3677     0.6570
         Stadt Ludwigsburg forecast 507815.0469 127.0991 0.1844    0.3537   0.3297      0.4293     0.4155
          Stadt Ludwigsburg weather 513198.6446 133.6228 0.1867    0.3458   0.3211      0.4319     0.4182
            Stadt Mannheim forecast 247142.822

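Mittelwert und Standardabweichung aller Metriken je Modelltyp über die Städte hinweg berechnen (derzeit sind nur die Modelltypen „weather“ und „forecast“ trainiert; das kombinierte Modell „both“ ist deaktiviert)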

In [19]:
# Mean and standard deviation of every metric across cities, per model type.

# "<city> <type>" -> city and model type; split once at the last blank.
model_name_parts = metrics_df['Model'].str.rsplit(' ', n=1)
metrics_df['City'] = model_name_parts.str[0]
metrics_df['Type'] = model_name_parts.str[1]

metric_cols = [col for col in metrics_df.columns if col not in ['Model', 'City', 'Type']]
available_types = set(metrics_df['Type'])

summary_stats = []

# Fixed display order. Model types that were not trained (e.g. 'both' while
# the combined model is disabled) are skipped instead of producing all-NaN rows.
for model_type in ['weather', 'forecast', 'both']:
    if model_type not in available_types:
        continue
    model_data = metrics_df[metrics_df['Type'] == model_type]

    for metric in metric_cols:
        summary_stats.append({
            'Model_Type': model_type,
            'Metric': metric,
            'Mean': model_data[metric].mean(),
            'Std': model_data[metric].std(),
        })

# Explicit columns keep the frame well-formed even if no model type matched.
summary_df = pd.DataFrame(summary_stats, columns=['Model_Type', 'Metric', 'Mean', 'Std'])

# Round to 4 decimals.
summary_df['Mean'] = summary_df['Mean'].round(4)
summary_df['Std'] = summary_df['Std'].round(4)

# Show the summary.
display(summary_df)



Unnamed: 0,Model_Type,Metric,Mean,Std
0,weather,AIC,286557.4723,212021.2147
1,weather,edof,128.5106,7.272
2,weather,GCV,0.1582,0.0339
3,weather,r2_train,0.2976,0.0618
4,weather,r2_test,0.1904,0.1413
5,weather,rmse_train,0.3955,0.0427
6,weather,rmse_test,0.4508,0.1085
7,forecast,AIC,285400.2128,209242.9116
8,forecast,edof,120.4168,9.7012
9,forecast,GCV,0.1581,0.0329


In [20]:
# Pivot for readability: one row per metric, Mean/Std columns per model type.
pivot_summary = pd.pivot_table(
    summary_df,
    index='Metric',
    columns='Model_Type',
    values=['Mean', 'Std'],
)
print("\nZusammenfassung Mean und Std nach Modelltyp:")
print(pivot_summary.round(4))


Zusammenfassung Mean und Std nach Modelltyp:
                   Mean                       Std             
Model_Type     forecast      weather     forecast      weather
Metric                                                        
AIC         285400.2128  286557.4723  209242.9116  212021.2147
GCV              0.1581       0.1582       0.0329       0.0339
edof           120.4168     128.5106       9.7012       7.2720
r2_test          0.1189       0.1904       0.2506       0.1413
r2_train         0.2972       0.2976       0.0649       0.0618
rmse_test        0.4678       0.4508       0.1214       0.1085
rmse_train       0.3954       0.3955       0.0414       0.0427
