<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/2.1_Modeling_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modeling and Predictions
[Notebook 1: EDA and Data Cleaning](./1_EDA_and_DataCleaning.ipynb)

[Notebook 2: Modeling and Predictions](./2_Modeling_and_Predictions.ipynb)

Notebook 3: Discussion and Technical Report

... Was geschieht in diesem Dokument? ...

1. Features
2. Resampling
3. Modeling
4. Scaling
5. Lagged features
6. Train test split, fit models, evaluate

## Verbinden mit der Google-Drive und Import der benötigten Module:

In [1]:
# Verbinden mit der Google-Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
## import modules
import os
import pandas as pd
import tensorflow as tf
import numpy as np

from tensorflow import keras
from keras.utils import to_categorical, normalize
from keras.layers import Input, Dense, LSTM
from keras.models import Model
from sklearn.model_selection import train_test_split

In [3]:
## Pfad an dem die Daten liegen
drive_path = '/content/drive/My Drive/Colab_Notebooks/Clean_Data/'

name_Messwerte = 'UVI_Messdaten.csv'
name_Vorhersage = 'Vorhersage.csv'
name_Solar_Messwerte = 'Solys_Messdaten.csv'

In [4]:
# check tensorflow version
print("tensorflow version:", tf.__version__)
# check available gpu
gpus =  tf.config.list_physical_devices('GPU')
print("available gpus:", gpus)
# limit the gpu usage, prevent it from allocating all gpu memory for a simple model
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
# check number of cpus available
print("available cpus:", os.cpu_count())

tensorflow version: 2.15.0
available gpus: []
available cpus: 2


In [5]:
# change model name for different models

# model_name = 'SUNSET_forecast_2017_2019_data'
# output_folder = os.path.join(pardir,"model_output", model_name)
# if os.path.isdir(output_folder)==False:
#     os.makedirs(output_folder)

## Import der Daten:

In [6]:
# Import der Daten für die Modellbildung
df_UVI = pd.read_csv(drive_path + name_Messwerte)
df_Solys = pd.read_csv(drive_path + name_Solar_Messwerte)
df_CAMS = pd.read_csv(drive_path + name_Vorhersage)

# Messzeitpunkt in Datetime umwandeln
df_UVI.set_index('Datetime', inplace=True)
df_Solys.set_index('Datetime', inplace=True)
df_CAMS.set_index('Datetime', inplace=True)

# Index in Datetime umwandeln
df_UVI.index = pd.to_datetime(df_UVI.index)
df_Solys.index = pd.to_datetime(df_Solys.index)
df_CAMS.index = pd.to_datetime(df_CAMS.index)

## Allgemeine Informationen:

In [7]:
# Allgemeine Informationen über die Messdaten
round(df_UVI.describe(), 2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,120028.0,60013.5,34649.24,0.0,30006.75,60013.5,90020.25,120027.0
Messzeitpunkt,120028.0,40465.35,13493.43,12360.0,29460.0,40500.0,51480.0,68220.0
erythem,120026.0,0.03,0.04,0.0,0.0,0.02,0.05,0.21
UVI,120026.0,1.33,1.65,0.0,0.15,0.61,1.9,8.54
DiffGreater2,120028.0,0.01,0.1,0.0,0.0,0.0,0.0,1.0
SZA,120028.0,64.29,17.05,27.36,51.1,67.07,77.85,97.87
time_sin,120028.0,0.12,0.7,-1.0,-0.57,0.2,0.83,1.0
time_cos,120028.0,-0.58,0.4,-1.0,-0.92,-0.7,-0.32,0.62
date_sin,120028.0,-0.07,0.7,-1.0,-0.7,-0.25,0.7,1.0
date_cos,120028.0,-0.18,0.69,-1.0,-0.81,-0.33,0.48,1.0


In [8]:
# Allgemeine Informationen über die Messdaten
round(df_Solys.describe(), 2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Glo,97260.0,254.13,250.23,0.0,52.9,167.03,387.52,1232.83
Dif,97260.0,125.54,109.78,0.0,45.45,91.53,178.38,655.32
Glo_SPLite,97260.0,259.03,248.57,0.0,57.96,173.37,397.06,1189.92
Dir,97260.0,249.7,324.94,0.0,2.04,17.96,531.48,1016.76
Temp,97260.0,16.69,8.59,-7.8,10.6,16.76,23.0,54.32


In [9]:
# Allgemeine Informationen über die Vorhersagedaten
round(df_CAMS.describe(), 2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
aod469,4307.0,0.2,0.15,0.02,0.1,0.15,0.26,1.31
aod550,4307.0,0.16,0.12,0.01,0.08,0.13,0.21,1.08
aod670,4307.0,0.12,0.09,0.01,0.06,0.1,0.16,0.82
aod865,4307.0,0.01,0.0,0.0,0.01,0.01,0.01,0.01
uvbed,4307.0,1.26,1.56,-0.0,0.12,0.6,1.83,7.56
uvbedcs,4307.0,1.73,1.91,-0.0,0.21,0.9,2.79,7.67
hcc,4307.0,0.43,0.42,0.0,0.0,0.29,0.94,1.0
lcc,4307.0,0.41,0.4,0.0,0.01,0.28,0.87,1.0
mcc,4307.0,0.32,0.36,0.0,0.0,0.16,0.61,1.0
tcc,4307.0,0.72,0.35,0.0,0.46,0.93,1.0,1.0


In [10]:
# Daten zusammen führen
df_model = pd.concat([df_UVI[['UVI', 'SZA', 'time_sin', 'time_cos', 'date_sin', 'date_cos']], df_Solys[['Glo', 'Glo_SPLite']]], axis = 1)

In [25]:
df_model.dtypes     # solange alle Splaten das slebe Gleikommeformat haben kann der Dataframe wie ein np.array angesehen werden.

UVI           float64
SZA           float64
time_sin      float64
time_cos      float64
date_sin      float64
date_cos      float64
Glo           float64
Glo_SPLite    float64
dtype: object

## Normalisieren der Daten:

In [11]:
# Normalisieren der Daten
normalize(df_model, order = 1)

Unnamed: 0_level_0,UVI,SZA,time_sin,time_cos,date_sin,date_cos,Glo,Glo_SPLite
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-06-15 07:21:00,0.002038,0.046560,0.000794,-0.000293,0.000238,-0.000812,0.469116,0.480150
2022-06-15 07:23:00,0.002071,0.046030,0.000787,-0.000298,0.000237,-0.000807,0.469231,0.480539
2022-06-15 07:25:00,0.002073,0.045492,0.000779,-0.000303,0.000235,-0.000802,0.469698,0.480617
2022-06-15 07:27:00,0.002116,0.045006,0.000773,-0.000308,0.000234,-0.000798,0.469936,0.480829
2022-06-15 07:29:00,0.002119,0.044234,0.000761,-0.000311,0.000231,-0.000789,0.470385,0.481168
...,...,...,...,...,...,...,...,...
2023-08-19 05:48:00,0.000543,0.138086,0.001767,0.000093,-0.001309,-0.001190,0.411438,0.445574
2023-08-19 05:50:00,0.000551,0.137119,0.001763,0.000077,-0.001305,-0.001187,0.411886,0.446112
2023-08-19 05:52:00,0.000595,0.146476,0.001891,0.000066,-0.001400,-0.001273,0.407513,0.440786
2023-08-19 05:54:00,0.000600,0.145958,0.001893,0.000050,-0.001401,-0.001274,0.407681,0.441145


## Aufteilen der Daten:

In [39]:
# Aufteilen der Messdaten in Train-, Test und Validierungsset

# Zuerst teilen Sie den DataFrame in Trainingsdaten (70%) und einen kombinierten Validierungs-Test-Datensatz (30%).
df_train, df_val_test = train_test_split(df_model, test_size=0.3, random_state=42)
# Dann teilen Sie den kombinierten Validierungs-Test-Datensatz in Validierungs- und Testdatensätze (jeweils 50% davon).
df_val, df_test = train_test_split(df_val_test, test_size=0.5, random_state=42)

In [40]:
target = df_train.pop("UVI")

In [41]:
target = tf.convert_to_tensor(target)
numeric_features = tf.convert_to_tensor(df_train)

In [42]:
numeric_dataset = tf.data.Dataset.from_tensor_slices((numeric_features, target))

for row in numeric_dataset.take(3):
  print(row)

(<tf.Tensor: shape=(7,), dtype=float64, numpy=
array([55.38613605, -0.55193699, -0.83388582,  0.99661576, -0.08220115,
               nan,         nan])>, <tf.Tensor: shape=(), dtype=float64, numpy=0.4507529354028416>)
(<tf.Tensor: shape=(7,), dtype=float64, numpy=
array([ 8.47859806e+01, -9.96917334e-01, -7.84590957e-02, -8.53933854e-01,
       -5.20381564e-01,             nan,             nan])>, <tf.Tensor: shape=(), dtype=float64, numpy=0.0642799606304195>)
(<tf.Tensor: shape=(7,), dtype=float64, numpy=
array([61.08088896,  0.62592347, -0.77988448,  0.94567813,  0.32510441,
               nan,         nan])>, <tf.Tensor: shape=(), dtype=float64, numpy=0.3102470702866486>)


In [43]:
BATCH_SIZE = 15
numeric_batches = numeric_dataset.shuffle(1000).batch(BATCH_SIZE)

In [44]:
numeric_batches

<_BatchDataset element_spec=(TensorSpec(shape=(None, 7), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

## Building a Network:

Datenformatierung:
**Input**: shape: 15 timesteps und 8 features

In [16]:
# define the model architecture using tf.keras API
def simple_model():
    # Definition der Inputlayer für das Netzwerk
    x_in = Input(shape = (15,8), name = "input_layer", )

    return model

NameError: ignored

In [None]:
# show model architecture
simple_model().summary()

## Modell fitten:

In [15]:
# define model characteristics
num_filters = 24
kernel_size = [3,3]
pool_size = [2,2]
strides = 2
dense_size = 1024
drop_rate = 0.4

# define training time parameters
num_epochs = 200 #(The maximum epoches set to 200 and there might be early stopping depends on validation loss)
num_fold = 10 # 10-fold cross-validation
batch_size = 256
learning_rate = 3e-06

## Validierung des Netzwerks: