In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.cm as cm
import matplotlib.dates as mdates
from scipy.signal import find_peaks
import openmeteo_requests
import requests_cache
from datetime import date, timedelta
import seaborn as sns
import numpy as np
import os
import sys
import pygam
import sklearn
sys.path.append("..")
from pygam import LinearGAM, f, s, te, l 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error

Citation for pyGAM: Servén, D., & Brummitt, C. (2018). pyGAM: Generalized Additive Models in Python. Zenodo. DOI: 10.5281/zenodo.1208723

Prepare MSTL data for the GAMs

In [78]:
# Read the cleaned MSTL decomposition results from disk
mstl_data = pd.read_csv("../data/mstl_results_clean.csv", low_memory=False)

# Overview of the columns available in mstl_data
print(list(mstl_data.columns))

['timestamp', 'city', 'counter_site', 'longitude', 'latitude', 'count', 'count_capped', 'log_count', 'trend', 'seasonal_24', 'seasonal_168', 'seasonal_8766', 'residual']


In [77]:
# Keep only the columns that the GAMs need
gam_columns = ['timestamp', 'city', 'counter_site', 'seasonal_8766', 'residual']
residual_data = mstl_data[gam_columns].copy()

# Drop rows in which the column names appear instead of values
is_header_row = residual_data['timestamp'] == 'timestamp'
residual_data = residual_data[~is_header_row]

# Number every (city, counter_site) combination
residual_data['id'] = residual_data.groupby(['city', 'counter_site']).ngroup()

# Target: yearly seasonal component plus residual, both cast to float first
yearly_component = residual_data['seasonal_8766'].astype(float).values
residual_component = residual_data['residual'].astype(float).values
residual_data['year_plus_res'] = residual_component + yearly_component
residual_data.head(2)


Unnamed: 0,timestamp,city,counter_site,seasonal_8766,residual,id,year_plus_res
0,2012-12-31 23:00:00+00:00,Landeshauptstadt Stuttgart,König-Karls-Brücke Barometer,-1.8545503347710663,0.3062034432133602,5,-1.548347
1,2013-01-01 00:00:00+00:00,Landeshauptstadt Stuttgart,König-Karls-Brücke Barometer,-0.9312113949920524,0.7172102469645241,5,-0.214001


In [79]:
# 1) Parse the timestamp strings once into timezone-aware UTC datetimes.
#    utc=True yields a single tz-aware dtype; values that cannot be parsed become NaT.
t_utc = pd.to_datetime(residual_data['timestamp'], errors='coerce', utc=True)

# Reference instant (UTC): the earliest parsed observation. Taking the minimum of the
# parsed values orders them chronologically instead of comparing the raw strings.
ref = t_utc.min()
print(ref)

# Local time view, used for the calendar features below
t_0 = t_utc.dt.tz_convert('Europe/Berlin')

# 2) Hours elapsed since the reference instant (vectorised). Elapsed time does not
#    depend on the display timezone, so it is computed on the UTC values.
residual_data['timestamp_number'] = (t_utc - ref).dt.total_seconds() / 3600.0

# Calendar features in Europe/Berlin local time
residual_data['weekday'] = t_0.dt.dayofweek.astype(float).values            # 0-6
residual_data['hour'] = t_0.dt.hour.astype(float).values                    # 0-23

# assertion: values in weekday are in [0, 1, 2, 3, 4, 5, 6]
assert set(residual_data['weekday']) == set(range(7))
# assertion: values in hour are in [0, 1, ..., 22, 23]
assert set(residual_data['hour']) == set(range(24))

2012-12-31 23:00:00+00:00


In [80]:
# One-hot encode the counter-site id into columns id_0 ... id_{num_ids-1}.

# Remove indicator columns left over from an earlier execution of this cell, so that
# re-running it does not append a second set of id_* columns.
stale_indicator_cols = [c for c in residual_data.columns if str(c).startswith('id_')]
residual_data = residual_data.drop(columns=stale_indicator_cols).reset_index(drop=True)

ids = residual_data['id'].astype(int).values
num_ids = residual_data['id'].nunique()

print(ids.min(), ids.max(), num_ids)
print(residual_data.shape)

# The encoding below uses the id directly as a column position, so the ids must be
# exactly 0 .. num_ids-1.
assert ids.min() == 0 and ids.max() == num_ids - 1

# One row per observation, a single 1.0 in the column of that observation's id
id_indicators = np.zeros((residual_data.shape[0], num_ids))
id_indicators[np.arange(len(ids)), ids] = 1.0
# residual_data and id_indicators have the same number of rows
assert residual_data.shape[0] == id_indicators.shape[0]

indicator_cols = [f'id_{i}' for i in range(num_ids)]
residual_data = pd.concat(
    [residual_data,
     pd.DataFrame(id_indicators, columns=indicator_cols)],
    axis=1
)

print(residual_data.shape)
print(residual_data.head(3))

# Every row must belong to exactly one id
row_sums = residual_data[indicator_cols].sum(axis=1)

print("Min:", row_sums.min())
print("Max:", row_sums.max())

bad_rows = residual_data[row_sums != 1]
print(bad_rows.shape)
assert bad_rows.empty, "each row must have exactly one id indicator set"


0 76 77
(3678215, 10)
(3678215, 87)
                   timestamp                        city  \
0  2012-12-31 23:00:00+00:00  Landeshauptstadt Stuttgart   
1  2013-01-01 00:00:00+00:00  Landeshauptstadt Stuttgart   
2  2013-01-01 01:00:00+00:00  Landeshauptstadt Stuttgart   

                   counter_site        seasonal_8766             residual  id  \
0  König-Karls-Brücke Barometer  -1.8545503347710663  0.30620344321336024   5   
1  König-Karls-Brücke Barometer  -0.9312113949920525   0.7172102469645241   5   
2  König-Karls-Brücke Barometer   -1.151514238943645   0.6027354440757091   5   

   year_plus_res  timestamp_number  weekday  hour  ...  id_67  id_68  id_69  \
0      -1.548347               0.0      1.0   0.0  ...    0.0    0.0    0.0   
1      -0.214001               1.0      1.0   1.0  ...    0.0    0.0    0.0   
2      -0.548779               2.0      1.0   2.0  ...    0.0    0.0    0.0   

   id_70  id_71  id_72  id_73  id_74  id_75  id_76  
0    0.0    0.0    0.0    0.

Prepare weather data for GAM

In [81]:
# Read the per-city weather table and print a short overview:
# first rows, missing values per column, column names
weather_data = pd.read_csv("../data/weather_per_city.csv", low_memory=False)
for overview in (weather_data.head(2), weather_data.isna().sum(), weather_data.columns.tolist()):
    print(overview)

                        date  temperature_2m  apparent_temperature  rain  \
0  2012-12-30 23:00:00+00:00          5.8285              1.619927   0.0   
1  2012-12-31 00:00:00+00:00          5.8285              1.688656   0.0   

   snowfall  forecast_temperature_2m  forecast_apparent_temperature  \
0       0.0                      NaN                            NaN   
1       0.0                      NaN                            NaN   

   forecast_rain  forecast_snowfall                        city  
0            NaN                NaN  Landeshauptstadt Stuttgart  
1            NaN                NaN  Landeshauptstadt Stuttgart  
date                                   0
temperature_2m                         0
apparent_temperature                   0
rain                                   0
snowfall                               0
forecast_temperature_2m          1586134
forecast_apparent_temperature    1586134
forecast_rain                    1586156
forecast_snowfall              

In [104]:
# Distinct city names found in residual_data
cities = pd.unique(residual_data['city'])
print(cities)

['Landeshauptstadt Stuttgart' 'Stadt Freiburg' 'Stadt Heidelberg'
 'Stadt Ludwigsburg' 'Stadt Mannheim' 'Stadt Reutlingen' 'Stadt Tübingen']


In [83]:
def split_data(X_weather: pd.DataFrame, X_forecast: pd.DataFrame, y: pd.Series, site_col: str = "id", time_col: str = "timestamp_number", train_frac: float = 0.8,):
    """Split the data into a training and a test part, separately for every site.

    The rows of ``X_weather`` are grouped by ``site_col``; within each group they are
    ordered by ``time_col`` and the first ``int(train_frac * n)`` rows go to the
    training part, the remaining rows to the test part. The resulting index labels are
    then used to select the matching rows of ``X_forecast`` and ``y``.

    Returns:
        (X_weather_train, X_weather_test, X_forecast_train, X_forecast_test,
        y_train, y_test) - copies of the selected rows.

    Raises:
        AssertionError: if the set of sites differs between training and test part,
            or if the parts do not have matching lengths.
    """
    train_labels = []
    test_labels = []

    for _, site_rows in X_weather.groupby(site_col):
        ordered_labels = site_rows.sort_values(time_col).index
        cut = int(train_frac * len(site_rows))

        train_labels += list(ordered_labels[:cut])
        test_labels += list(ordered_labels[cut:])

    # Select the same rows from all three inputs: train part first, then test part
    parts = []
    for data in (X_weather, X_forecast, y):
        parts.append(data.loc[train_labels].copy())
        parts.append(data.loc[test_labels].copy())
    X_weather_train, X_weather_test, X_forecast_train, X_forecast_test, y_train, y_test = parts

    # Every site has to show up on both sides of the split
    assert set(X_weather_train[site_col]) == set(X_weather_test[site_col])
    assert set(X_forecast_train[site_col]) == set(X_forecast_test[site_col])

    # Features and target must stay row-aligned
    assert len(y_train) == len(X_weather_train) == len(X_forecast_train)
    assert len(y_test) == len(X_weather_test) == len(X_forecast_test)

    return X_weather_train, X_weather_test, X_forecast_train, X_forecast_test, y_train, y_test

In [90]:
# Build the model inputs for every city and collect them in prepared_data_dict.
# Per city: join the weather table, drop incomplete rows, split chronologically per
# counter site, then standardise the weather columns. Mean and std are computed on the
# training rows only and applied to the training rows, the test rows and the full table.

# The column layout is the same for every city, so it is defined once.
id_indicator_cols = [f'id_{i}' for i in range(num_ids)]
feature_cols_weather = ['timestamp_number', 'id', 'temperature_2m', 'rain', 'snowfall', 'weekday', 'hour'] + id_indicator_cols
feature_cols_forecast = ['timestamp_number', 'id', 'forecast_temperature_2m', 'forecast_rain', 'forecast_snowfall', 'weekday', 'hour'] + id_indicator_cols
observed_weather_cols = ['temperature_2m', 'rain', 'snowfall']
forecast_weather_cols = ['forecast_temperature_2m', 'forecast_rain', 'forecast_snowfall']

prepared_data_dict = {}

for city in cities:

    city_data = residual_data[residual_data['city'] == city]
    w = weather_data[weather_data['city'] == city].copy()

    # Attach the weather of the same city and hour: 'timestamp' (counter data) is
    # matched against 'date' (weather data). Rows without weather are kept (left join).
    merged_data = city_data.merge(w, left_on=['city', 'timestamp'], right_on=['city', 'date'], how='left')

    # Features and target
    X_weather = merged_data[feature_cols_weather].astype(float)
    X_forecast = merged_data[feature_cols_forecast].astype(float)
    y = merged_data['year_plus_res']

    # Keep only rows that are complete in the target and in both feature tables
    valid_idx = (y.notna() &
                 X_weather.notna().all(axis=1) &
                 X_forecast.notna().all(axis=1)
                 )
    X_weather = X_weather.loc[valid_idx].copy()
    X_forecast = X_forecast.loc[valid_idx].copy()
    y = y.loc[valid_idx].copy()

    X_weather_train, X_weather_test, X_forecast_train, X_forecast_test, y_train, y_test = split_data(X_weather, X_forecast, y)
    X_weather_normalized = X_weather.copy()
    X_forecast_normalized = X_forecast.copy()
    city_means_stds = []

    # Standardise the weather columns; observed columns first, then forecast columns,
    # so city_means_stds holds the three observed entries followed by the three
    # forecast entries.
    for train_part, test_part, full_part, weather_cols in (
        (X_weather_train, X_weather_test, X_weather_normalized, observed_weather_cols),
        (X_forecast_train, X_forecast_test, X_forecast_normalized, forecast_weather_cols),
    ):
        for col in weather_cols:
            mean = train_part[col].mean()
            std = train_part[col].std()
            if not std > 0:
                # Constant (or single-row) training column: dividing by 0/NaN would
                # fill the column with NaN/inf, so only centre it.
                std = 1.0
            city_means_stds.append((col, mean, std))

            train_part[col] = (train_part[col] - mean) / std
            test_part[col] = (test_part[col] - mean) / std
            full_part[col] = (full_part[col] - mean) / std

    # Remember the column names before the tables are turned into arrays
    feature_names_weather = X_weather_train.columns.tolist()
    feature_names_forecast = X_forecast_train.columns.tolist()

    # Convert to numpy arrays
    X_weather_train = X_weather_train.values
    X_weather_test  = X_weather_test.values

    X_forecast_train = X_forecast_train.values
    X_forecast_test = X_forecast_test.values

    y_train = y_train.values
    y_test  = y_test.values

    prepared_data_dict[city] = {
        'X_weather_train': X_weather_train,
        'X_weather_test': X_weather_test,
        'X_forecast_train': X_forecast_train,
        'X_forecast_test': X_forecast_test,
        'y_train': y_train,
        'y_test': y_test,
        'feature_cols_weather': feature_names_weather,
        'feature_cols_forecast': feature_names_forecast,
        'X_weather_normalized': X_weather_normalized,
        'X_forecast_normalized': X_forecast_normalized,
        'city_means_stds': city_means_stds

    }

    assert len(X_weather_train) == len(y_train)
    assert len(X_weather_test) == len(y_test)
    assert len(X_forecast_train) == len(y_train)
    assert len(X_forecast_test) == len(y_test)
    assert len(X_weather_train) + len(X_weather_test) == len(X_weather_normalized)
    assert len(X_forecast_train) + len(X_forecast_test) == len(X_forecast_normalized)
    assert len(city_means_stds) == 6  # 3 weather + 3 forecast features, each with mean and std

In [91]:
# Column order of the observed-weather design matrix, shown for Tübingen
tuebingen_prepared = prepared_data_dict['Stadt Tübingen']
print(tuebingen_prepared['feature_cols_weather'])

['timestamp_number', 'id', 'temperature_2m', 'rain', 'snowfall', 'weekday', 'hour', 'id_0', 'id_1', 'id_2', 'id_3', 'id_4', 'id_5', 'id_6', 'id_7', 'id_8', 'id_9', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'id_39', 'id_40', 'id_41', 'id_42', 'id_43', 'id_44', 'id_45', 'id_46', 'id_47', 'id_48', 'id_49', 'id_50', 'id_51', 'id_52', 'id_53', 'id_54', 'id_55', 'id_56', 'id_57', 'id_58', 'id_59', 'id_60', 'id_61', 'id_62', 'id_63', 'id_64', 'id_65', 'id_66', 'id_67', 'id_68', 'id_69', 'id_70', 'id_71', 'id_72', 'id_73', 'id_74', 'id_75', 'id_76']


In [92]:
# Shapes of the Tübingen training matrix, test matrix and full normalised table.
# All three are read from prepared_data_dict, so the cell does not depend on which
# city happened to be processed last by the preparation loop.
tuebingen_shapes = prepared_data_dict['Stadt Tübingen']
print(tuebingen_shapes['X_weather_train'].shape)
print(tuebingen_shapes['X_weather_test'].shape)
print(tuebingen_shapes['X_weather_normalized'].shape)

(67375, 84)
(16845, 84)
(84220, 84)


# Natalies Modell

In [106]:
# For every city, fit three GAMs on the city's training rows:
#   gam_1 - observed weather, gam_2 - weather forecast, gam_3 - both together.
# Metrics go to metrics_dict, the fitted single-source models to city_models.
metrics_dict = {}
city_models = {"observed": {}, "forecast": {}}


def _single_source_formula():
    """Return a fresh pyGAM term list for a model with one weather source.

    Column positions refer to the weather / forecast design matrices:
    2 = temperature, 3 = rain, 4 = snow, 5 = weekday, 6 = hour.

    A new term list is built on every call on purpose.
    NOTE(review): pyGAM terms are mutable and appear to keep data-dependent state
    (the spline knot range) from the first data set they are compiled on, so two
    models must not share the same term objects - confirm against the pyGAM source.
    """
    return (
        # Weather main effects
        s(2, n_splines=6) +           # temp
        s(3, n_splines=6) +           # rain
        s(4, n_splines=6) +           # snow

        # Interactions
        te(2, 3, n_splines=6) +                             # temp x rain
        te(2, 5, n_splines=[6, 7], basis=['ps', 'cp']) +    # temp x weekday (cyclic)
        te(3, 5, n_splines=[6, 7], basis=['ps', 'cp']) +    # rain x weekday (cyclic)
        te(2, 6, n_splines=[6, 7], basis=['ps', 'cp']) +    # temp x hour (cyclic)
        te(3, 6, n_splines=[6, 7], basis=['ps', 'cp'])      # rain x hour (cyclic)
    )


def _fit_metrics(gam, y_train, y_train_hat, y_test, y_test_hat):
    """Collect model statistics and train/test scores of one fitted GAM in a dict."""
    stats = gam.statistics_
    return {
        'lambdas': gam.lam,
        'AIC': stats.get('AIC'),
        'edof': stats.get('edof'),
        'GCV': stats.get('GCV'),
        'r2_train': r2_score(y_train, y_train_hat),
        'r2_test': r2_score(y_test, y_test_hat),
        'rmse_train': np.sqrt(mean_squared_error(y_train, y_train_hat)),
        'rmse_test': np.sqrt(mean_squared_error(y_test, y_test_hat)),
        'y_train_hat': y_train_hat,
        'y_test_hat': y_test_hat
    }


def _target_in_frame_order(frame, y_train, y_test):
    """Return the city's full target as a Series aligned with the rows of ``frame``.

    ``y_train`` / ``y_test`` are stored in the order produced by ``split_data``, while
    the normalised feature tables keep their original row order. The split only looks
    at the 'id' and 'timestamp_number' columns, so it is repeated here on those two
    columns with row positions as payload; the positions tell where every training and
    test value belongs.
    """
    keys = frame[['id', 'timestamp_number']]
    positions = pd.Series(np.arange(len(keys)), index=keys.index)
    *_, pos_train, pos_test = split_data(keys, keys, positions)
    assert len(pos_train) == len(y_train) and len(pos_test) == len(y_test)

    y_full = np.empty(len(keys), dtype=float)
    y_full[pos_train.to_numpy()] = y_train
    y_full[pos_test.to_numpy()] = y_test
    return pd.Series(y_full, index=frame.index, name='year_plus_res')


for city in cities:
    prepared = prepared_data_dict[city]

    X_weather_train = prepared['X_weather_train']
    X_weather_test  = prepared['X_weather_test']

    X_forecast_train = prepared['X_forecast_train']
    X_forecast_test  = prepared['X_forecast_test']

    y_train = prepared['y_train']
    y_test  = prepared['y_test']

    # Column order in X_both_...:
    # 'timestamp_number', 'id', 'temperature_2m', 'rain', 'snowfall',
    # 'forecast_temperature_2m', 'forecast_rain', 'forecast_snowfall',
    # 'weekday', 'hour', 'id_0', 'id_1', 'id_2', ...
    X_both_train = np.concatenate([X_weather_train[:, :5], X_forecast_train[:, 2:5], X_weather_train[:, 5:]], axis=1)
    X_both_test = np.concatenate([X_weather_test[:, :5], X_forecast_test[:, 2:5], X_weather_test[:, 5:]], axis=1)

    # X_both has exactly the three forecast columns more than X_weather
    assert X_both_train.shape[1] == X_weather_train.shape[1] + 3

    # Target of THIS city in the row order of the normalised tables that are stored
    # for plotting (both tables share the same rows).
    y_city = _target_in_frame_order(prepared['X_weather_normalized'], y_train, y_test)

    # ------------------------------------------------
    # Model 1: observed weather
    gam_1_formula = _single_source_formula()
    gam_1 = LinearGAM(
        gam_1_formula
        )
    gam_1.gridsearch(X_weather_train, y_train)

    # Model 2: weather forecast (same structure, its own term objects)
    gam_2_formula = _single_source_formula()
    gam_2 = LinearGAM(
        gam_2_formula
        )
    gam_2.gridsearch(X_forecast_train, y_train)

    # Model 3: observed weather and forecast together (columns of X_both)
    # Site-specific smooths (s(..., by=<id indicator column>)) are not part of any of
    # the three models; the id indicator columns start at position 7 in the
    # single-source matrices and at position 10 in X_both.
    gam_3_formula = (
        # Observed weather
        s(2, n_splines=6) +           # temp
        s(3, n_splines=6) +           # rain
        s(4, n_splines=6) +           # snow
        # Forecast weather
        s(5, n_splines=6) +           # forecast_temp
        s(6, n_splines=6) +           # forecast_rain
        s(7, n_splines=6) +           # forecast_snow
        # Interactions of the observed weather
        te(2, 3, n_splines=6) +                             # temp x rain
        te(2, 8, n_splines=[6, 7], basis=['ps', 'cp']) +    # temp x weekday (cyclic)
        te(3, 8, n_splines=[6, 7], basis=['ps', 'cp']) +    # rain x weekday (cyclic)
        te(2, 9, n_splines=[6, 7], basis=['ps', 'cp']) +    # temp x hour (cyclic)
        te(3, 9, n_splines=[6, 7], basis=['ps', 'cp']) +    # rain x hour (cyclic)
        # Interactions of the forecast
        te(5, 6, n_splines=6) +                             # forecast_temp x forecast_rain
        te(5, 8, n_splines=[6, 7], basis=['ps', 'cp']) +    # forecast_temp x weekday (cyclic)
        te(6, 8, n_splines=[6, 7], basis=['ps', 'cp']) +    # forecast_rain x weekday (cyclic)
        te(5, 9, n_splines=[6, 7], basis=['ps', 'cp']) +    # forecast_temp x hour (cyclic)
        te(6, 9, n_splines=[6, 7], basis=['ps', 'cp'])      # forecast_rain x hour (cyclic)
    )
    gam_3 = LinearGAM(
        gam_3_formula
        )
    gam_3.gridsearch(X_both_train, y_train)

    # ------------------------------------------------
    # Save models for plots
    city_models["observed"][city] = {
        "gam": gam_1,
        "X": prepared["X_weather_normalized"],
        "y": y_city,
        "city_means_stds": prepared["city_means_stds"][0:3]
    }
    city_models["forecast"][city] = {
        "gam": gam_2,
        "X": prepared["X_forecast_normalized"],
        "y": y_city,
        "city_means_stds": prepared["city_means_stds"][3:6]
    }

    # ------------------------------------------------
    # Compute the metrics and store them in metrics_dict

    stats_1 = gam_1.statistics_
    y_test_hat_1 = gam_1.predict(X_weather_test)
    y_train_hat_1 = gam_1.predict(X_weather_train)
    metrics_dict[city + ' weather'] = _fit_metrics(gam_1, y_train, y_train_hat_1, y_test, y_test_hat_1)

    stats_2 = gam_2.statistics_
    y_test_hat_2 = gam_2.predict(X_forecast_test)
    y_train_hat_2 = gam_2.predict(X_forecast_train)
    metrics_dict[city + ' forecast'] = _fit_metrics(gam_2, y_train, y_train_hat_2, y_test, y_test_hat_2)

    stats_3 = gam_3.statistics_
    y_test_hat_3 = gam_3.predict(X_both_test)
    y_train_hat_3 = gam_3.predict(X_both_train)
    metrics_dict[city + ' both'] = _fit_metrics(gam_3, y_train, y_train_hat_3, y_test, y_test_hat_3)

    print(f"Completed GAM training and metrics calculation for city: {city}")
    

  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                       | Elapsed Time: 0:00:22 ETA:   0:03:47
 18% (2 of 11) |####                     | Elapsed Time: 0:00:42 ETA:   0:03:11
 27% (3 of 11) |######                   | Elapsed Time: 0:01:00 ETA:   0:02:41
 36% (4 of 11) |#########                | Elapsed Time: 0:01:18 ETA:   0:02:18
 45% (5 of 11) |###########              | Elapsed Time: 0:01:37 ETA:   0:01:57
 54% (6 of 11) |#############            | Elapsed Time: 0:01:57 ETA:   0:01:37
 63% (7 of 11) |###############          | Elapsed Time: 0:02:16 ETA:   0:01:18
 72% (8 of 11) |##################       | Elapsed Time: 0:02:35 ETA:   0:00:58
 81% (9 of 11) |####################     | Elapsed Time: 0:02:55 ETA:   0:00:38
 90% (10 of 11) |#####################   | Elapsed Time: 0:03:15 ETA:   0:00:19
100% (11 of 11) |########################| Elapsed Time: 0:03:35 Time:  0:03:35
  0% (0 of 11) |                        

Completed GAM training and metrics calculation for city: Landeshauptstadt Stuttgart


  9% (1 of 11) |##                       | Elapsed Time: 0:00:03 ETA:   0:00:35
 18% (2 of 11) |####                     | Elapsed Time: 0:00:07 ETA:   0:00:34
 27% (3 of 11) |######                   | Elapsed Time: 0:00:11 ETA:   0:00:30
 36% (4 of 11) |#########                | Elapsed Time: 0:00:15 ETA:   0:00:27
 45% (5 of 11) |###########              | Elapsed Time: 0:00:19 ETA:   0:00:23
 54% (6 of 11) |#############            | Elapsed Time: 0:00:23 ETA:   0:00:19
 63% (7 of 11) |###############          | Elapsed Time: 0:00:27 ETA:   0:00:15
 72% (8 of 11) |##################       | Elapsed Time: 0:00:31 ETA:   0:00:11
 81% (9 of 11) |####################     | Elapsed Time: 0:00:35 ETA:   0:00:07
 90% (10 of 11) |#####################   | Elapsed Time: 0:00:39 ETA:   0:00:03
100% (11 of 11) |########################| Elapsed Time: 0:00:43 Time:  0:00:43
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Freiburg


  9% (1 of 11) |##                       | Elapsed Time: 0:00:09 ETA:   0:01:38
 18% (2 of 11) |####                     | Elapsed Time: 0:00:20 ETA:   0:01:30
 27% (3 of 11) |######                   | Elapsed Time: 0:00:30 ETA:   0:01:20
 36% (4 of 11) |#########                | Elapsed Time: 0:00:40 ETA:   0:01:10
 45% (5 of 11) |###########              | Elapsed Time: 0:00:50 ETA:   0:01:00
 54% (6 of 11) |#############            | Elapsed Time: 0:01:00 ETA:   0:00:50
 63% (7 of 11) |###############          | Elapsed Time: 0:01:10 ETA:   0:00:40
 72% (8 of 11) |##################       | Elapsed Time: 0:01:20 ETA:   0:00:30
 81% (9 of 11) |####################     | Elapsed Time: 0:01:31 ETA:   0:00:20
 90% (10 of 11) |#####################   | Elapsed Time: 0:01:41 ETA:   0:00:10
100% (11 of 11) |########################| Elapsed Time: 0:01:51 Time:  0:01:51
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Heidelberg


  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                       | Elapsed Time: 0:00:17 ETA:   0:02:58
 18% (2 of 11) |####                     | Elapsed Time: 0:00:35 ETA:   0:02:39
 27% (3 of 11) |######                   | Elapsed Time: 0:00:53 ETA:   0:02:22
 36% (4 of 11) |#########                | Elapsed Time: 0:01:10 ETA:   0:02:04
 45% (5 of 11) |###########              | Elapsed Time: 0:01:28 ETA:   0:01:46
 54% (6 of 11) |#############            | Elapsed Time: 0:01:46 ETA:   0:01:28
 63% (7 of 11) |###############          | Elapsed Time: 0:02:03 ETA:   0:01:10
 72% (8 of 11) |##################       | Elapsed Time: 0:02:21 ETA:   0:00:53
 81% (9 of 11) |####################     | Elapsed Time: 0:02:39 ETA:   0:00:35
 90% (10 of 11) |#####################   | Elapsed Time: 0:02:57 ETA:   0:00:17
100% (11 of 11) |########################| Elapsed Time: 0:03:14 Time:  0:03:14
  0% (0 of 11) |                        

Completed GAM training and metrics calculation for city: Stadt Ludwigsburg


  9% (1 of 11) |##                       | Elapsed Time: 0:00:11 ETA:   0:01:51
 18% (2 of 11) |####                     | Elapsed Time: 0:00:22 ETA:   0:01:40
 27% (3 of 11) |######                   | Elapsed Time: 0:00:33 ETA:   0:01:29
 36% (4 of 11) |#########                | Elapsed Time: 0:00:44 ETA:   0:01:18
 45% (5 of 11) |###########              | Elapsed Time: 0:00:55 ETA:   0:01:06
 54% (6 of 11) |#############            | Elapsed Time: 0:01:07 ETA:   0:00:55
 63% (7 of 11) |###############          | Elapsed Time: 0:01:18 ETA:   0:00:44
 72% (8 of 11) |##################       | Elapsed Time: 0:01:29 ETA:   0:00:33
 81% (9 of 11) |####################     | Elapsed Time: 0:01:40 ETA:   0:00:22
 90% (10 of 11) |#####################   | Elapsed Time: 0:01:51 ETA:   0:00:11
100% (11 of 11) |########################| Elapsed Time: 0:02:03 Time:  0:02:03
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Mannheim


  9% (1 of 11) |##                       | Elapsed Time: 0:00:10 ETA:   0:01:42
 18% (2 of 11) |####                     | Elapsed Time: 0:00:20 ETA:   0:01:33
 27% (3 of 11) |######                   | Elapsed Time: 0:00:31 ETA:   0:01:23
 36% (4 of 11) |#########                | Elapsed Time: 0:00:41 ETA:   0:01:12
 45% (5 of 11) |###########              | Elapsed Time: 0:00:52 ETA:   0:01:02
 54% (6 of 11) |#############            | Elapsed Time: 0:01:02 ETA:   0:00:52
 63% (7 of 11) |###############          | Elapsed Time: 0:01:13 ETA:   0:00:41
 72% (8 of 11) |##################       | Elapsed Time: 0:01:23 ETA:   0:00:31
 81% (9 of 11) |####################     | Elapsed Time: 0:01:34 ETA:   0:00:20
 90% (10 of 11) |#####################   | Elapsed Time: 0:01:44 ETA:   0:00:10
100% (11 of 11) |########################| Elapsed Time: 0:01:54 Time:  0:01:54
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Reutlingen


  9% (1 of 11) |##                       | Elapsed Time: 0:00:02 ETA:   0:00:26
 18% (2 of 11) |####                     | Elapsed Time: 0:00:05 ETA:   0:00:24
 27% (3 of 11) |######                   | Elapsed Time: 0:00:08 ETA:   0:00:22
 36% (4 of 11) |#########                | Elapsed Time: 0:00:11 ETA:   0:00:19
 45% (5 of 11) |###########              | Elapsed Time: 0:00:14 ETA:   0:00:16
 54% (6 of 11) |#############            | Elapsed Time: 0:00:16 ETA:   0:00:14
 63% (7 of 11) |###############          | Elapsed Time: 0:00:19 ETA:   0:00:11
 72% (8 of 11) |##################       | Elapsed Time: 0:00:22 ETA:   0:00:08
 81% (9 of 11) |####################     | Elapsed Time: 0:00:25 ETA:   0:00:05
 90% (10 of 11) |#####################   | Elapsed Time: 0:00:28 ETA:   0:00:02
100% (11 of 11) |########################| Elapsed Time: 0:00:30 Time:  0:00:30
  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                      

Completed GAM training and metrics calculation for city: Stadt Tübingen


In [107]:
import copy
import json

# Work on a deep copy so that metrics_dict keeps its NumPy arrays;
# only the copy is made JSON-serialisable.
metrics_json = copy.deepcopy(metrics_dict)

for city_key, city_metrics in metrics_json.items():
    for k in ["y_train_hat", "y_test_hat"]:
        if isinstance(city_metrics.get(k), np.ndarray):
            city_metrics[k] = city_metrics[k].tolist()

# The handle is deliberately not called `f`: that name is the pyGAM factor term
# imported at the top of the notebook and must not be overwritten.
with open("metrics.json", "w") as metrics_file:
    json.dump(metrics_json, metrics_file, indent=4)

In [108]:
from utils.gam_result_utils import save_temp_effect_results, save_rain_effect_results

# Persist the temperature- and rain-effect results derived from the fitted
# city models as pickle files under ../data.
#
# Expected structure of city_models:
#
# city_models = {
#     "observed": {
#         "<city_name>": {
#             "gam": <trained GAM model>,
#             "X": np.ndarray of shape (n_samples, n_features),
#             "y": np.ndarray of shape (n_samples,)   # optional for plotting
#         },
#         ...
#     },
#     "forecast": {
#         "<city_name>": {
#             "gam": <trained GAM model>,
#             "X": np.ndarray of shape (n_samples, n_features),
#             "y": np.ndarray of shape (n_samples,)   # optional for plotting
#         },
#         ...
#     }
# }
#
# NOTE(review): temp_col / rain_col / snow_col are presumably the column
# positions of temperature, rain and snow in X — confirm against
# utils.gam_result_utils. The rain call passes no column arguments and
# therefore uses whatever defaults that helper defines.
save_temp_effect_results(city_models, out_path="../data/temp_effect_results.pkl", temp_col=2, rain_col=3, snow_col=4)
save_rain_effect_results(city_models, out_path="../data/rain_effect_results.pkl")

  X_base[temp_col] = global_mean_temp
  X_base[rain_col] = 0.0
  X_base[snow_col] = snow_fixed


Saved temperature effect results to ../data/temp_effect_results.pkl


  X_base[snow_col] = snow_fixed
  X_base_temp[temp_col] = T
  X_base_temp[rain_col] = 0.0


Saved rain effect results to ../data/rain_effect_results.pkl


In [109]:
# Text overview: one paragraph per model key, one line per stored metric.
for model_name, model_metrics in metrics_dict.items():
    print(f"{model_name}:")
    for name, val in model_metrics.items():
        if name == 'lambdas':
            # The lambdas are left out to keep the overview short.
            continue
        print(f"  {name}: {val}")
    print()

Landeshauptstadt Stuttgart weather:
  AIC: 614946.2840563246
  edof: 119.93960017903817
  GCV: 0.2041253096588796
  r2_train: 0.37816449130177343
  r2_test: 0.3527809122881794
  rmse_train: 0.4516482453786669
  rmse_test: 0.4294243863758076
  y_train_hat: [-0.26788352 -0.25435579 -0.23409535 ... -0.14340439 -0.11467734
 -0.18307692]
  y_test_hat: [-0.46610757 -0.49812007 -0.52679792 ... -0.35101254 -0.33837473
 -0.32594707]

Landeshauptstadt Stuttgart forecast:
  AIC: 608341.1053476667
  edof: 122.23510968243679
  GCV: 0.20140661144798505
  r2_train: 0.38645458613783357
  r2_test: 0.36187298144589175
  rmse_train: 0.4486275351319341
  rmse_test: 0.42639746283368685
  y_train_hat: [-0.32747775 -0.30711377 -0.28518289 ... -0.20230028 -0.13160007
 -0.17392469]
  y_test_hat: [-0.42882124 -0.44773713 -0.47154502 ... -0.36725169 -0.34635137
 -0.3410275 ]

Landeshauptstadt Stuttgart both:
  AIC: 590333.9756577432
  edof: 219.14354524829122
  GCV: 0.19420600257334672
  r2_train: 0.408715845313

In [111]:
# Disabled helper for inspecting the lambdas of every model. The snippet is
# wrapped in a string literal, so nothing is printed; the cell merely evaluates
# to that string, which the notebook echoes as its output. Remove the triple
# quotes to actually run it.
"""# print all lambdas
for key, value in metrics_dict.items():
    print(f"{key} lambdas: {value['lambdas']}")"""

'# print all lambdas\nfor key, value in metrics_dict.items():\n    print(f"{key} lambdas: {value[\'lambdas\']}")'

In [112]:
# Show all metrics as a table: one row per model key.
# Prediction arrays and lambdas are skipped; every other entry becomes a column.
excluded_keys = ('y_train_hat', 'y_test_hat', 'lambdas')
metrics_rows = [
    {
        'Model': model_name,
        **{name: val for name, val in model_metrics.items() if name not in excluded_keys},
    }
    for model_name, model_metrics in metrics_dict.items()
]

# Build the frame in one go and order it by the 'Model' label.
metrics_df = pd.DataFrame(metrics_rows).sort_values('Model')

# Round the numeric columns to 4 decimals for readability.
numeric_cols = metrics_df.select_dtypes(include=[np.number]).columns
metrics_df[numeric_cols] = metrics_df[numeric_cols].round(4)

# Render the table.
display(metrics_df)

Unnamed: 0,Model,AIC,edof,GCV,r2_train,r2_test,rmse_train,rmse_test
2,Landeshauptstadt Stuttgart both,590333.9757,219.1435,0.1942,0.4087,0.3653,0.4404,0.4253
1,Landeshauptstadt Stuttgart forecast,608341.1053,122.2351,0.2014,0.3865,0.3619,0.4486,0.4264
0,Landeshauptstadt Stuttgart weather,614946.2841,119.9396,0.2041,0.3782,0.3528,0.4516,0.4294
5,Stadt Freiburg both,61476.865,203.2385,0.1137,0.281,0.124,0.3362,0.3249
4,Stadt Freiburg forecast,63786.8252,112.9835,0.1165,0.2615,0.1312,0.3407,0.3236
3,Stadt Freiburg weather,63334.3157,111.6192,0.1159,0.2651,0.1507,0.3399,0.3199
8,Stadt Heidelberg both,209783.448,218.3035,0.1351,0.2478,0.0661,0.3671,0.6401
7,Stadt Heidelberg forecast,216048.1437,121.4061,0.1385,0.2282,0.0593,0.3719,0.6424
6,Stadt Heidelberg weather,214889.3921,119.9945,0.1378,0.2317,0.0613,0.371,0.6417
11,Stadt Ludwigsburg both,491878.3229,218.1116,0.1779,0.3768,0.3399,0.4215,0.4124


In [113]:
# Plain-text rendering of the metrics data frame, without the index column
print(metrics_df.to_string(index=False))

                              Model         AIC     edof    GCV  r2_train  r2_test  rmse_train  rmse_test
    Landeshauptstadt Stuttgart both 590333.9757 219.1435 0.1942    0.4087   0.3653      0.4404     0.4253
Landeshauptstadt Stuttgart forecast 608341.1053 122.2351 0.2014    0.3865   0.3619      0.4486     0.4264
 Landeshauptstadt Stuttgart weather 614946.2841 119.9396 0.2041    0.3782   0.3528      0.4516     0.4294
                Stadt Freiburg both  61476.8650 203.2385 0.1137    0.2810   0.1240      0.3362     0.3249
            Stadt Freiburg forecast  63786.8252 112.9835 0.1165    0.2615   0.1312      0.3407     0.3236
             Stadt Freiburg weather  63334.3157 111.6192 0.1159    0.2651   0.1507      0.3399     0.3199
              Stadt Heidelberg both 209783.4480 218.3035 0.1351    0.2478   0.0661      0.3671     0.6401
          Stadt Heidelberg forecast 216048.1437 121.4061 0.1385    0.2282   0.0593      0.3719     0.6424
           Stadt Heidelberg weather 214889.392

Mittelwert (Mean) und Standardabweichung (Std) aller Metriken für die 3 Modelltypen über die Städte hinweg berechnen

In [114]:
# Mean and std of every metric per model type, taken across the cities.
# Each 'Model' label is split at its last space: the text before it becomes
# 'City', the text after it becomes 'Type'.
model_parts = metrics_df['Model'].str.rsplit(' ', n=1)
metrics_df['City'] = model_parts.str[0]
metrics_df['Type'] = model_parts.str[1]

# Every column except the label columns is treated as a metric.
metric_names = [col for col in metrics_df.columns if col not in ('Model', 'City', 'Type')]

# One record per (model type, metric) pair.
summary_stats = []
for model_type in ['weather', 'forecast', 'both']:
    model_data = metrics_df.loc[metrics_df['Type'] == model_type]
    summary_stats.extend(
        {
            'Model_Type': model_type,
            'Metric': metric,
            'Mean': model_data[metric].mean(),
            'Std': model_data[metric].std(),
        }
        for metric in metric_names
    )

summary_df = pd.DataFrame(summary_stats)

# Round both statistics to 4 decimals.
for stat_col in ('Mean', 'Std'):
    summary_df[stat_col] = summary_df[stat_col].round(4)

# Render the summary.
display(summary_df)



Unnamed: 0,Model_Type,Metric,Mean,Std
0,weather,AIC,287297.8635,211799.0435
1,weather,edof,119.6535,3.7584
2,weather,GCV,0.1586,0.0336
3,weather,r2_train,0.2953,0.0636
4,weather,r2_test,0.1969,0.1319
5,weather,rmse_train,0.396,0.0423
6,weather,rmse_test,0.4487,0.1036
7,forecast,AIC,285754.5578,208805.3925
8,forecast,edof,123.0082,5.094
9,forecast,GCV,0.1583,0.0325


In [115]:
# Pivot the summary for readability: metrics as rows, and Mean/Std per model
# type as columns.
pivot_summary = summary_df.pivot_table(
    values=['Mean', 'Std'],
    index='Metric',
    columns='Model_Type',
)
pivot_rounded = pivot_summary.round(4)
print("\nZusammenfassung Mean und Std nach Modelltyp:", pivot_rounded, sep="\n")


Zusammenfassung Mean und Std nach Modelltyp:
                   Mean                                    Std               \
Model_Type         both     forecast      weather         both     forecast   
Metric                                                                        
AIC         277620.2098  285754.5578  287297.8635  202777.2973  208805.3925   
GCV              0.1537       0.1583       0.1586       0.0311       0.0325   
edof           216.3908     123.0082     119.6535       7.0564       5.0940   
r2_test          0.1874       0.1831       0.1969       0.1596       0.1535   
r2_train         0.3165       0.2960       0.2953       0.0688       0.0672   
rmse_test        0.4506       0.4521       0.4487       0.1059       0.1059   
rmse_train       0.3897       0.3956       0.3960       0.0400       0.0411   

                         
Model_Type      weather  
Metric                   
AIC         211799.0435  
GCV              0.0336  
edof             3.7584  
r2_test