<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/Modeling_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM

In diesem Skript wird ein erstes Modell erzeugt, um aus den gemssenen Globalstrahlungsdaten den UVI zu berechen.

In [1]:
# Verbinden mit der Google-Drive
import os

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install tensorflow



In [3]:
import tensorflow.keras as keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Pfad zur CSV-Datei auf Google Drive
name_Messwerte = 'Messdaten_CAMS_GHI.csv'
name_Vorhersage = 'Vorhersagedaten_CAMS_VarIdx.csv'
folder_import = '/content/drive/My Drive/Colab_Notebooks/Clean_Data/'

In [5]:
# Import der Messdaten
df_Messdaten = pd.read_csv(folder_import + name_Messwerte)

In [6]:
# Import der Vorhersagedaten
df_Vorhersage = pd.read_csv(folder_import + name_Vorhersage)

In [7]:
# Konvertiere die Zeitspalten in ein gemeinsames Format und führe einen Merge der Datensätze durch
df_Messdaten['Datetime'] = pd.to_datetime(df_Messdaten['Datetime'])
df_Vorhersage['Datetime'] = pd.to_datetime(df_Vorhersage['Datetime'])

In [8]:
df_Messdaten.columns

Index(['Datetime', 'Observation_period', 'Clear_sky_GHI', 'Clear_sky_BHI',
       'GHI', 'BHI', 'UVI', 'UVA', 'UVB', 'erythem', 'Datum', 'Uhrzeit',
       'Messzeitpunkt', 'ghi', 'Dif', 'Glo_SPLite', 'Dir', 'Temp',
       'DiffGreater2', 'SZA', 'time_sin', 'time_cos', 'date_sin', 'date_cos',
       'Date', 'Hour'],
      dtype='object')

In [9]:
# Relevante Spalten
indipendent_variablen = ['ghi', 'SZA', 'time_sin', 'time_cos', 'date_sin', 'date_cos']
dependent_variablen = ['UVI']

columns = indipendent_variablen + dependent_variablen

Diese Liste enthält die Spaltennamen der Merkmale, die als Inputs verwendet werden.

In [10]:
#Applying a scaler for regression
MMS = MinMaxScaler()
df_Messdaten[columns] = MMS.fit_transform(df_Messdaten[columns])

Die unabhängigen Variablen werden durch den MinMaxScaler skaliert.

In [11]:
#Splitting in training and testing samples
training_size = round(len(df_Messdaten) * 0.80)
train_data = df_Messdaten[columns][:training_size]
test_data  = df_Messdaten[columns][training_size:]

print(train_data, test_data)

            ghi       SZA  time_sin  time_cos  date_sin  date_cos       UVI
0      0.449763  0.394655  0.969096  0.403006  0.641161  0.020303  0.281919
1      0.452447  0.390193  0.969096  0.403006  0.641161  0.020303  0.281919
2      0.455624  0.385734  0.967568  0.397968  0.641161  0.020303  0.288240
3      0.458118  0.381277  0.966004  0.392947  0.641161  0.020303  0.290191
4      0.463850  0.376825  0.964405  0.387943  0.641161  0.020303  0.297739
...         ...       ...       ...       ...       ...       ...       ...
77381  0.342308  0.574718  0.058526  0.326979  0.971797  0.334079  0.093193
77382  0.152062  0.579150  0.056495  0.331739  0.971797  0.334079  0.078514
77383  0.125504  0.583587  0.054497  0.336521  0.971797  0.334079  0.070488
77384  0.108314  0.588028  0.052533  0.341323  0.971797  0.334079  0.067234
77385  0.095241  0.592473  0.050603  0.346147  0.971797  0.334079  0.062044

[77386 rows x 7 columns]             ghi       SZA  time_sin  time_cos  date_sin  date_

In [None]:
def save_validation_data(X_val, y_val, save_path, file_name='validation_data.csv'):
    """
    Speichert den Validierungsdatensatz als CSV-Datei.

    Parameters:
    - X_val: numpy.ndarray, Eingabedaten des Validierungsdatensatzes (3D-Array: Anzahl der Sequenzen x Sequenzlänge x Features)
    - y_val: numpy.ndarray, Zielwerte des Validierungsdatensatzes (1D-Array)
    - save_path: str, Verzeichnis, in dem die Datei gespeichert werden soll
    - file_name: str, Name der CSV-Datei (Standard: 'validation_data.csv')

    Returns:
    - full_path: str, vollständiger Pfad der gespeicherten Datei
    """
    # Stelle sicher, dass das Verzeichnis existiert
    os.makedirs(save_path, exist_ok=True)

    # Konvertiere X_val in ein flaches Format
    X_val_flat = X_val.reshape(X_val.shape[0], -1)  # (Anzahl der Sequenzen, Sequenzlänge * Features)

    # Kombiniere X_val und y_val in einen DataFrame
    validation_data = pd.DataFrame(X_val_flat)
    validation_data['y_val'] = y_val  # Zielwerte hinzufügen

    # Speichere die Datei als CSV
    full_path = os.path.join(save_path, file_name)
    validation_data.to_csv(full_path, index=False)

    print(f"Validation data saved to: {full_path}")
    return full_path

In [12]:
def create_sequence(dataset, sequence_len):
    sequences = []
    labels = []
    for start_idx in range(len(dataset) - sequence_len):
        stop_idx = start_idx + sequence_len
        # Sequenz (Fenster mit Länge sequence_len)
        sequences.append(dataset.iloc[start_idx:stop_idx].values)
        # Zielwert (nur die Zielspalte)
        labels.append(dataset.iloc[stop_idx]['UVI'])  # 'UVI' ist die Zielspalte
    return np.array(sequences), np.array(labels)

Hier werden die Eingabedaten (X_train und X_val) als Sequenzen mit einer Länge von 100 Schritten erstellt. Die endgültige Dimension der Input-Daten für das Modell ist:

X_train.shape = (77286, 100, 7)
Dabei:
77286: Anzahl der Trainingssequenzen
100: Zeitfenster (Sequenzlänge)
7: Anzahl der Features (Input-Parameter)

In [13]:
# Erstellen des Trainigs- und Validierungsdatensatzes
X_train, y_train = create_sequence(train_data,100)
X_val, y_val = create_sequence(test_data,100)

In [14]:
print('X_train:', X_train.shape, '\n y_train:', y_train.shape, '\n X_val:', X_val.shape, '\n y_val:', y_val.shape)

X_train: (77286, 100, 7) 
 y_train: (77286,) 
 X_val: (19246, 100, 7) 
 y_val: (19246,)


In [15]:
#Creating LSTM for regression

model = Sequential()
model.add(LSTM(units = 64, return_sequences=True, input_shape = (X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.1))
model.add(LSTM(units = 32))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

  super().__init__(**kwargs)


In [16]:
history = model.fit(X_train, y_train, epochs = 4, validation_data = (X_val, y_val), verbose = 1)

Epoch 1/4
[1m2416/2416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 123ms/step - loss: 0.0023 - mean_absolute_error: 0.0254 - val_loss: 0.0017 - val_mean_absolute_error: 0.0235
Epoch 2/4
[1m2416/2416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m311s[0m 119ms/step - loss: 5.3585e-04 - mean_absolute_error: 0.0129 - val_loss: 0.0015 - val_mean_absolute_error: 0.0190
Epoch 3/4
[1m2416/2416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 122ms/step - loss: 4.6414e-04 - mean_absolute_error: 0.0115 - val_loss: 0.0015 - val_mean_absolute_error: 0.0190
Epoch 4/4
[1m2416/2416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 121ms/step - loss: 4.3228e-04 - mean_absolute_error: 0.0108 - val_loss: 0.0015 - val_mean_absolute_error: 0.0187


In [17]:
# Speichern des Modells in einem Verzeichnis in Google Drive

model_dir = '/content/drive/My Drive/Colab_Notebooks/LSTM_Model/'
model_name = 'full_model.keras'
model_path = os.path.join(model_dir, model_name)
model.save(model_path)

print(f"Model saved to: {model_path}")

Model saved to: /content/drive/My Drive/Colab_Notebooks/LSTM_Model/full_model.keras


In [18]:
# Gewichte speichern
weights_dir = '/content/drive/My Drive/Colab_Notebooks/LSTM_Model/'
weights_name = 'model_weights.weights.h5'
weights_path = os.path.join(weights_dir, weights_name)
model.save_weights(weights_path)

print(f"weights saved to: {weights_path}")

weights saved to: /content/drive/My Drive/Colab_Notebooks/LSTM_Model/model_weights.weights.h5


In [19]:
print(X_val.shape, y_val.shape)

(19246, 100, 7) (19246,)


In [20]:
# Konvertiere X_val in ein flaches Format
X_val_flat = X_val.reshape(X_val.shape[0], -1)  # (Anzahl der Sequenzen, Sequenzlänge * Anzahl der Features)

# Kombiniere X_val und y_val in einen DataFrame
validation_data = pd.DataFrame(X_val_flat)
validation_data['y_val'] = y_val.flatten()  # Füge die Zielwerte hinzu

validation_data_path = '/content/drive/My Drive/Colab_Notebooks/LSTM_Model/validation_data.csv'
validation_data.to_csv(validation_data_path, index=False)

In [None]:
X_val_flat_path = save_validation_data(
    X_val=X_val,
    y_val=y_val,
    save_path='/content/drive/My Drive/Colab_Notebooks/LSTM_Model',
    file_name='validation_data.csv'
)

In [21]:
print("Shape of X_val_flat:", X_val_flat.shape)
print("Length of y_val:", len(y_val.flatten()))

Shape of X_val_flat: (19246, 700)
Length of y_val: 19246
