<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/2.1_Modeling_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modeling and Predictions
[Notebook 1: EDA and Data Cleaning](./1_EDA_and_DataCleaning.ipynb)

[Notebook 2: Modeling and Predictions](./2_Modeling_and_Predictions.ipynb)

Notebook 3: Discussion and Technical Report

... Was geschieht in diesem Dokument? ...

1. Features
2. Resampling
3. Modeling
4. Scaling
5. Lagged features
6. Train test split, fit models, evaluate

## Verbinden mit Google Drive und Import der benötigten Module:

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## import modules
import os
import pandas as pd
import tensorflow as tf
import numpy as np

from tensorflow import keras
from keras.utils import to_categorical, normalize
from keras.layers import Input, Dense, LSTM
from keras.models import Model
from sklearn.model_selection import train_test_split

In [3]:
## Directory on the mounted Google Drive where the data files are stored
drive_path = '/content/drive/My Drive/Colab_Notebooks/Clean_Data/'

# CSV file names; they are appended directly to drive_path when loading,
# so drive_path has to keep its trailing slash.
name_Messwerte = 'UVI_Messdaten.csv'
name_Vorhersage = 'Vorhersage.csv'
name_Solar_Messwerte = 'Solys_Messdaten.csv'

In [4]:
# Report the TensorFlow version in use
print("tensorflow version:", tf.__version__)
# List the GPUs TensorFlow can see (an empty list means none are visible)
gpus =  tf.config.list_physical_devices('GPU')
print("available gpus:", gpus)
# Enable memory growth per GPU so a simple model does not allocate all GPU memory
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# Report the CPU count returned by os.cpu_count()
print("available cpus:", os.cpu_count())

tensorflow version: 2.15.0
available gpus: []
available cpus: 2


In [5]:
# change model name for different models
# NOTE(review): the lines below are disabled. They reference `pardir`, which no
# visible cell defines — define it before re-enabling them.

# model_name = 'SUNSET_forecast_2017_2019_data'
# output_folder = os.path.join(pardir,"model_output", model_name)
# if os.path.isdir(output_folder)==False:
#     os.makedirs(output_folder)

## Import der Daten:

In [6]:
# Load the three CSV inputs used for modelling
def _read_indexed_csv(file_name):
    """
    Read one CSV file from drive_path and index it by its 'Datetime' column.

    :param file_name: Name of the CSV file inside drive_path.
    :return: DataFrame whose index is the 'Datetime' column converted with pd.to_datetime.
    """
    frame = pd.read_csv(drive_path + file_name).set_index('Datetime')
    # The column is read as plain values; convert the index to datetimes
    frame.index = pd.to_datetime(frame.index)
    return frame


df_UVI = _read_indexed_csv(name_Messwerte)
df_Solys = _read_indexed_csv(name_Solar_Messwerte)
df_CAMS = _read_indexed_csv(name_Vorhersage)

## Allgemeine Informationen:

In [7]:
# Summary statistics of df_UVI, rounded to two decimals and transposed (one row per column)
df_UVI.describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,120028.0,60013.5,34649.24,0.0,30006.75,60013.5,90020.25,120027.0
Messzeitpunkt,120028.0,40465.35,13493.43,12360.0,29460.0,40500.0,51480.0,68220.0
erythem,120026.0,0.03,0.04,0.0,0.0,0.02,0.05,0.21
UVI,120026.0,1.33,1.65,0.0,0.15,0.61,1.9,8.54
DiffGreater2,120028.0,0.01,0.1,0.0,0.0,0.0,0.0,1.0
SZA,120028.0,64.29,17.05,27.36,51.1,67.07,77.85,97.87
time_sin,120028.0,0.12,0.7,-1.0,-0.57,0.2,0.83,1.0
time_cos,120028.0,-0.58,0.4,-1.0,-0.92,-0.7,-0.32,0.62
date_sin,120028.0,-0.07,0.7,-1.0,-0.7,-0.25,0.7,1.0
date_cos,120028.0,-0.18,0.69,-1.0,-0.81,-0.33,0.48,1.0


In [8]:
# Summary statistics of df_Solys, rounded to two decimals and transposed (one row per column)
df_Solys.describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Glo,97260.0,254.13,250.23,0.0,52.9,167.03,387.52,1232.83
Dif,97260.0,125.54,109.78,0.0,45.45,91.53,178.38,655.32
Glo_SPLite,97260.0,259.03,248.57,0.0,57.96,173.37,397.06,1189.92
Dir,97260.0,249.7,324.94,0.0,2.04,17.96,531.48,1016.76
Temp,97260.0,16.69,8.59,-7.8,10.6,16.76,23.0,54.32


In [9]:
# Summary statistics of the forecast data (df_CAMS), rounded to two decimals and transposed
df_CAMS.describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
aod469,4307.0,0.2,0.15,0.02,0.1,0.15,0.26,1.31
aod550,4307.0,0.16,0.12,0.01,0.08,0.13,0.21,1.08
aod670,4307.0,0.12,0.09,0.01,0.06,0.1,0.16,0.82
aod865,4307.0,0.01,0.0,0.0,0.01,0.01,0.01,0.01
uvbed,4307.0,1.26,1.56,-0.0,0.12,0.6,1.83,7.56
uvbedcs,4307.0,1.73,1.91,-0.0,0.21,0.9,2.79,7.67
hcc,4307.0,0.43,0.42,0.0,0.0,0.29,0.94,1.0
lcc,4307.0,0.41,0.4,0.0,0.01,0.28,0.87,1.0
mcc,4307.0,0.32,0.36,0.0,0.0,0.16,0.61,1.0
tcc,4307.0,0.72,0.35,0.0,0.46,0.93,1.0,1.0


In [10]:
# Merge the selected df_UVI columns with the 'Glo' and 'Glo_SPLite' columns of df_Solys, side by side.
# NOTE(review): pd.concat aligns on the index with its default outer join, so timestamps
# present in only one frame get NaN in the other frame's columns — confirm this is intended.
df_model = pd.concat(
    [
        df_UVI.loc[:, ['UVI', 'SZA', 'time_sin', 'time_cos', 'date_sin', 'date_cos']],
        df_Solys.loc[:, ['Glo', 'Glo_SPLite']],
    ],
    axis=1,
)

In [11]:
df_model.dtypes     # as long as all columns share the same floating-point dtype, the DataFrame can be treated like an np.array

UVI           float64
SZA           float64
time_sin      float64
time_cos      float64
date_sin      float64
date_cos      float64
Glo           float64
Glo_SPLite    float64
dtype: object

In [12]:
# Names of the columns to be used for the model: every column except the first one
cols = df_model.columns[1:].tolist()

In [13]:
cols  # display the selected column names

['SZA', 'time_sin', 'time_cos', 'date_sin', 'date_cos', 'Glo', 'Glo_SPLite']

## Normalisieren der Daten:

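`keras.utils.normalize()`:
- L1-Norm = Summennorm (auch als Manhattan-Distanz bekannt)
- L2-Norm = Euklidische Norm (Wurzel aus der Summe der quadrierten Komponenten); ihr Analogon für Funktionen ist die Norm auf dem Raum quadratintegrierbarer Funktionen (Hilbertraum)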

In [67]:
def normalize_ZScore(df):
    """
    Normalize a DataFrame column-wise with the Z-score, (x - mean) / std.

    Columns with a standard deviation of exactly zero (constant columns) are
    divided by 1 instead of 0, so they become all zeros rather than NaN.

    :param df: The DataFrame to normalize (numeric columns).
    :return: Tuple (normalized_df, mean, std). mean and std are the per-column
             statistics computed from df; std is returned unmodified, so a
             constant column still reports 0 there.
    """
    # Per-column mean and (sample) standard deviation as computed by pandas
    mean = df.mean(axis=0)
    std = df.std(axis=0)

    # Guard against division by zero for constant columns; NaN entries are kept as they are
    safe_std = std.where(std != 0, 1.0)

    # Apply the Z-score normalization
    normalized_df = (df - mean) / safe_std

    return normalized_df, mean, std

In [68]:
def normalize_MinMax(df):
    """
    Normalize a DataFrame column-wise with Min-Max scaling, (x - min) / (max - min).

    Columns whose maximum equals their minimum (constant columns) are divided
    by 1 instead of 0, so they become all zeros rather than NaN.

    :param df: The DataFrame to normalize (numeric columns).
    :return: Tuple (normalized_df, min_val, max_val) with the per-column minima
             and maxima computed from df.
    """
    # Per-column minimum and maximum
    min_val = df.min(axis=0)
    max_val = df.max(axis=0)

    # Guard against division by zero for constant columns; NaN entries are kept as they are
    value_range = max_val - min_val
    safe_range = value_range.where(value_range != 0, 1.0)

    # Apply the Min-Max normalization
    normalized_df = (df - min_val) / safe_range

    return normalized_df, min_val, max_val

In [69]:
# Compute both normalizations on the full df_model and keep the statistics they return.
# NOTE(review): the statistics are computed before the train/validation/test split done
# further down, so validation and test rows influence the scaling — confirm this is acceptable.
normalized_ZS_df, mean, std = normalize_ZScore(df_model)
normalized_MinMax_df, Min, Max = normalize_MinMax(df_model)
# Preview the first rows of the Min-Max-normalized frame
normalized_MinMax_df.head()

Unnamed: 0_level_0,UVI,SZA,time_sin,time_cos,date_sin,date_cos,Glo,Glo_SPLite
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-06-15 07:21:00,0.281919,0.39243,0.969096,0.403006,0.640625,0.020185,0.449763,0.476942
2022-06-15 07:23:00,0.28824,0.387969,0.967568,0.397968,0.640623,0.020184,0.452447,0.480058
2022-06-15 07:25:00,0.290191,0.383511,0.966004,0.392947,0.640621,0.020184,0.455624,0.483028
2022-06-15 07:27:00,0.297739,0.379057,0.964405,0.387943,0.640619,0.020183,0.458118,0.48564
2022-06-15 07:29:00,0.301597,0.374606,0.96277,0.382956,0.640618,0.020183,0.46385,0.491593


## Aufteilen der Daten:

In [28]:
# Number of timesteps per input sequence, and number of training features
# (all columns of the normalized frame minus one).
n_timesteps = 15
n_features = normalized_MinMax_df.shape[1] - 1

In [46]:
def create_sequences(df, n_timesteps, target_column):
    """
    Build flattened sliding-window samples for a neural network from a DataFrame.

    For every row position i >= n_timesteps, the feature values (all columns
    except target_column) of the n_timesteps preceding rows are flattened
    row by row and the target value of row i is appended at the end.

    :param df: DataFrame containing the time series data.
    :param n_timesteps: Number of preceding rows used as input for each sample.
    :param target_column: Name of the column used as the target variable.
    :return: A list of 1-D arrays; each holds the flattened window followed by
             the target value. The list is empty if df has no more than
             n_timesteps rows.
    """
    # Extract features and targets once; doing the drop/row lookup inside the
    # loop would repeat the same DataFrame work for every single sample.
    feature_values = df.drop(columns=target_column).values
    target_values = df[target_column].values

    transformed_data = []
    for i in range(n_timesteps, len(df)):
        # Window of the n_timesteps rows directly before row i, flattened row by row
        window = feature_values[i - n_timesteps:i].ravel()
        # Append the target of row i as the last element of the sample
        transformed_data.append(np.append(window, target_values[i]))

    return transformed_data

In [47]:
# Build the flattened sequences with the configured window length (n_timesteps) instead of a
# repeated literal, then wrap them in a DataFrame with one row per sequence.
transformed_data = create_sequences(normalized_MinMax_df, n_timesteps=n_timesteps, target_column='UVI')
transformed_df = pd.DataFrame(transformed_data)

In [52]:
# Maximum of the target values: create_sequences appends the target as the last element of
# every sequence, so select the last column by position instead of a hard-coded label.
transformed_df.iloc[:, -1].max()

0.01327319331213285

In [49]:
# Split sizes: 70 % of the sequences for training and 15 % for validation
n_sequences = len(transformed_df)
train_size = int(0.7 * n_sequences)
val_size = int(0.15 * n_sequences)

In [50]:

# Split by row position without shuffling: training rows first, then validation, the rest is test
val_end = train_size + val_size
train_data = transformed_df.iloc[:train_size]
val_data = transformed_df.iloc[train_size:val_end]
test_data = transformed_df.iloc[val_end:]

In [51]:
# Number of sequences in each split
len_train, len_val, len_test = map(len, (train_data, val_data, test_data))
len_train, len_val, len_test

(84009, 18001, 18003)

## Building a Network:

Datenformatierung:
**Input**: shape: 15 timesteps und 8 features

In [None]:
# define the model architecture using tf.keras API
def simple_model(n_timesteps=15, n_features=8, lstm_units=32):
    """
    Build a minimal LSTM regression network with the Keras functional API.

    The original version returned the undefined name `model`; this version
    actually constructs and returns a model.

    NOTE(review): the layer choice (one LSTM layer followed by a single linear
    output unit) is a placeholder to make the function usable — confirm or
    replace the architecture. The default input shape (15, 8) is kept from the
    original code; the data-preparation cells derive n_features as the column
    count minus one, so confirm which feature count is intended.

    :param n_timesteps: Number of timesteps per input sequence.
    :param n_features: Number of features per timestep.
    :param lstm_units: Number of units in the LSTM layer.
    :return: An uncompiled keras Model mapping (n_timesteps, n_features) inputs to one output value.
    """
    # Input layer of the network
    x_in = Input(shape=(n_timesteps, n_features), name="input_layer")

    # Recurrent layer that reduces the sequence to a single feature vector
    x = LSTM(lstm_units, name="lstm_layer")(x_in)

    # Single linear output unit for the regression target
    y_out = Dense(1, name="output_layer")(x)

    model = Model(inputs=x_in, outputs=y_out, name="simple_model")
    return model

In [None]:
# Build a model instance with simple_model() and show its layer summary
simple_model().summary()

## Modell fitten:

In [None]:
# Model characteristics
# NOTE(review): num_filters, kernel_size, pool_size and strides look like convolutional-layer
# settings, and no visible cell uses them — confirm they are still needed.
num_filters = 24
kernel_size = [3,3]
pool_size = [2,2]
strides = 2
dense_size = 1024
drop_rate = 0.4

# Training-time parameters
num_epochs = 200 # maximum number of epochs; early stopping on the validation loss may end training sooner
num_fold = 10 # 10-fold cross-validation
batch_size = 256
learning_rate = 3e-06

## Validierung des Netzwerks: