## XGBoost Modelltraining auf finalem Datensatz - MIT Inflationsdaten

Das Notebook konnte aus Zeitgründen nicht fortgeführt werden

In [19]:
import pandas as pd
import os

file_path = os.path.join( "..", "Daten",  "Merged_BSR_24Uhr_23mittel.csv")

df = pd.read_csv(file_path)  
df.tail()

Unnamed: 0,Schicht,Tour,Tonnage,Temperature_Max (°C),Rain_Sum (mm),Snowfall_Sum (cm),Wind_Speed_Max (km/h),Daylight_Duration (s),Temperature_Max (°C) 3-Day Avg,Rain_Sum (mm) 3-Day Avg,...,Feiertag_Pfingstmontag_shift_1,Feiertag_Pfingstmontag_shift_2,Feiertag_Tag der Arbeit_shift_1,Feiertag_Tag der Arbeit_shift_2,Feiertag_Tag der Befreiung_shift_1,Feiertag_Tag der Befreiung_shift_2,Feiertag_Tag der Deutschen Einheit_shift_1,Feiertag_Tag der Deutschen Einheit_shift_2,Feiertag_Zweiter Weihnachtstag_shift_1,Feiertag_Zweiter Weihnachtstag_shift_2
381656,1,5508,2.9,8.5,2.6,0.0,29.8,27757.76,9.8,3.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
381657,1,2101,7.86,8.5,2.6,0.0,29.8,27757.76,9.8,3.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
381658,1,2102,1.81,8.5,2.6,0.0,29.8,27757.76,9.8,3.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
381659,1,3301,0.35,8.5,2.6,0.0,29.8,27757.76,9.8,3.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
381660,1,3302,4.13,8.5,2.6,0.0,29.8,27757.76,9.8,3.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Löschen aller Zeilen des Jahres 2019**  
Da Inflationsdaten nur für 2020-2023 vorhanden sind, wird das Jahr 2019 für das Training entfernt.

In [12]:
df = df.drop(df[df["year"] == 2019].index)
print(df["year"].unique())

[2020 2021 2022 2023]


-----------------------
**Vorbereitung der Daten für Training**

In [13]:
X = df.drop(columns=['Tonnage'])  
y = df['Tonnage']  

In [14]:
X = X.drop(columns=['year']) 

**Splitten der Daten**

In [15]:
# Anteile für die Aufteilung
train_size = int(len(X) * 0.7)  # 70% Training
val_size = int(len(X) * 0.15)   # 15% Validation
test_size = len(X) - train_size - val_size  # 15% Test

# Train/Val/Test anhand der Zeitachse splitten
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:train_size + val_size], y.iloc[train_size:train_size + val_size]
X_test, y_test = X.iloc[train_size + val_size:], y.iloc[train_size + val_size:]

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")


Train: (217030, 89), Validation: (46506, 89), Test: (46508, 89)


---
**Modelltraining mit TimSeriesSplit**

In [20]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Beispiel: CSV-Datei einlesen
df = pd.read_csv(file_path)

# Zielvariable und Features definieren
X = df.drop(columns=['Tonnage', 'year'])  # Jahrspalte nach der ersten Aufteilung bereits entfernt
y = df['Tonnage']

# Aufteilung in Train/Val/Test (bisher korrekt)
train_size = int(len(X) * 0.7)  # 70% Training
val_size = int(len(X) * 0.15)   # 15% Validation
test_size = len(X) - train_size - val_size  # 15% Test

X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:train_size + val_size], y.iloc[train_size:train_size + val_size]
X_test, y_test = X.iloc[train_size + val_size:], y.iloc[train_size + val_size:]

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

# TimeSeriesSplit definieren (z. B. 5-fache Kreuzvalidierung)
tscv = TimeSeriesSplit(n_splits=5)

# XGBoost-Regressor definieren
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Listen für Metriken, um sie zu sammeln
mae_list = []
mse_list = []
rmse_list = []
r2_list = []

# Cross-Validation-Schleife mit TimeSeriesSplit
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    # Aufteilen der Daten in Trainings- und Validierungsdaten für dieses Fold
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Modell auf dem Trainings-Fold trainieren
    model.fit(X_train_fold, y_train_fold)
    
    # Vorhersagen auf dem Validation-Fold
    y_val_pred = model.predict(X_val_fold)
    
    # Berechnung der verschiedenen Metriken
    mae = mean_absolute_error(y_val_fold, y_val_pred)
    mse = mean_squared_error(y_val_fold, y_val_pred)
    rmse = np.sqrt(mse)  # RMSE ist einfach die Quadratwurzel des MSE
    r2 = r2_score(y_val_fold, y_val_pred)
    
    # Metriken zur Liste hinzufügen
    mae_list.append(mae)
    mse_list.append(mse)
    rmse_list.append(rmse)
    r2_list.append(r2)
    
    # Ausgabe der Metriken für jedes Fold
    print(f"Fold {fold + 1}:")
    print(f"  MAE: {mae:.4f}")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R² Score: {r2:.4f}")
    print("-" * 50)

# Berechnung des Durchschnitts der Metriken über alle Folds
avg_mae = np.mean(mae_list)
avg_mse = np.mean(mse_list)
avg_rmse = np.mean(rmse_list)
avg_r2 = np.mean(r2_list)

# Ausgabe der durchschnittlichen Metriken
print("Durchschnittliche Metriken über alle Folds:")
print(f"  Durchschnittlicher MAE: {avg_mae:.4f}")
print(f"  Durchschnittlicher MSE: {avg_mse:.4f}")
print(f"  Durchschnittlicher RMSE: {avg_rmse:.4f}")
print(f"  Durchschnittlicher R² Score: {avg_r2:.4f}")


Train: (267162, 89), Validation: (57249, 89), Test: (57250, 89)
Fold 1:
  MAE: 2.4956
  MSE: 10.8394
  RMSE: 3.2923
  R² Score: 0.6896
--------------------------------------------------
Fold 2:
  MAE: 2.5671
  MSE: 11.5021
  RMSE: 3.3915
  R² Score: 0.6697
--------------------------------------------------
Fold 3:
  MAE: 2.6882
  MSE: 12.3452
  RMSE: 3.5136
  R² Score: 0.6781
--------------------------------------------------
Fold 4:
  MAE: 2.4164
  MSE: 9.9641
  RMSE: 3.1566
  R² Score: 0.7106
--------------------------------------------------
Fold 5:
  MAE: 2.5009
  MSE: 10.5851
  RMSE: 3.2535
  R² Score: 0.6980
--------------------------------------------------
Durchschnittliche Metriken über alle Folds:
  Durchschnittlicher MAE: 2.5336
  Durchschnittlicher MSE: 11.0472
  Durchschnittlicher RMSE: 3.3215
  Durchschnittlicher R² Score: 0.6892
