<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/1_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting UVI with LSTMs

[Notebook 0: Data Cleaning](./0_DataCleaning.ipynb)

[Notebook 1: EDA](./1_EDA.ipynb)

[Notebook 2: Modeling and Predictions](./2_Modeling_and_Predictions.ipynb)

[Notebook 3: Technical Report](./3_Technical_Report.ipynb)

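**Was wird in diesem Notebook gemacht?** Explorative Datenanalyse (EDA) der bereinigten UVI-Messdaten, der Solys-Strahlungsmessdaten und der CAMS-Vorhersagedaten: allgemeine Kennzahlen, Ausreißersuche, Zeitreihenplots und Korrelationen.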

## Verbinden mit Google Drive und Import der benötigten Module:

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install factor_analyzer

In [None]:
## import modules

import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

from datetime import datetime
from datetime import timedelta

from scipy.fft import fft

In [None]:
# Default figure size (width, height) applied to all subsequent plots
matplotlib.rcParams.update({'figure.figsize': (13, 8)})

In [None]:
# Geographic position in decimal degrees
# (presumably the measurement site - TODO confirm)
latitude, longitude = 50.2, 7.8

# Durations expressed in seconds; a year is taken as 365.2425 days
seconds_in_day = 60 * 60 * 24
seconds_in_year = seconds_in_day * 365.2425

## Import der Daten:

In [None]:
# Folders on the mounted Google Drive
drive_path = '/content/drive/My Drive/Colab_Notebooks/Clean_Data/'
# presumably the target folder for exported daily plots - TODO confirm
output_plot_path = '/content/drive/My Drive/Colab_Notebooks/plot_daily_UVI/'

# Names of the CSV files that are read from drive_path
name_Messwerte = 'UVI_Messdaten.csv'
name_Solar_Messwerte = 'Solys_Messdaten.csv'
name_Vorhersage = 'Vorhersage.csv'

In [None]:
# Load the three data sets used in the EDA and index each by its
# 'Datetime' column (no in-place mutation, so the cell is re-runnable).
df_UVI = pd.read_csv(drive_path + name_Messwerte).set_index('Datetime')
df_Solys = pd.read_csv(drive_path + name_Solar_Messwerte).set_index('Datetime')
df_CAMS = pd.read_csv(drive_path + name_Vorhersage).set_index('Datetime')

# Parse the index values into a DatetimeIndex (required for resampling later)
df_UVI.index = pd.to_datetime(df_UVI.index)
df_Solys.index = pd.to_datetime(df_Solys.index)
df_CAMS.index = pd.to_datetime(df_CAMS.index)

# Replace NaN entries with 0
df_UVI['UVI'] = df_UVI['UVI'].fillna(0)
df_UVI['erythem'] = df_UVI['erythem'].fillna(0)
# fillna returns a new frame; without assigning it back df_Solys would keep
# all of its NaN values.
df_Solys = df_Solys.fillna(0)

In [None]:
# Hourly means computed only from samples whose UVI is non-zero
df_UVI_hourly = (
    df_UVI.loc[df_UVI['UVI'].ne(0), ['UVI', 'erythem', 'SZA']]
    .resample('H')
    .mean()
)

In [None]:
# Hourly means over all samples, zero readings included
df_UVI_hourly_withZero = (
    df_UVI.loc[:, ['UVI', 'erythem', 'SZA']]
    .resample('H')
    .mean()
)

## Allgemeine Informationen:

In [None]:
# Column dtypes of the three data sets, printed one after another
print(*(frame.dtypes for frame in (df_UVI, df_Solys, df_CAMS)))

In [None]:
# Summary statistics of the UVI measurement frame, rounded to two decimals,
# with one row per column
df_UVI.describe().round(2).T

In [None]:
# Summary statistics of the Solys measurement frame, rounded to two decimals,
# with one row per column
df_Solys.describe().round(2).T

In [None]:
# Summary statistics of the forecast frame, rounded to two decimals,
# with one row per column
df_CAMS.describe().round(2).T

In [None]:
# Check whether the measurement data contains missing values.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
df_UVI.info()
print('\n --------------------- \n', df_UVI.isnull().sum())

In [None]:
# General information about the hourly-resolved measurement data
df_UVI_hourly.info()

In [None]:
# General information about the hourly means that include zero readings
df_UVI_hourly_withZero.info()

In [None]:
# General information about the forecast data
df_CAMS.info()

## Ausreißer (Outlier) finden:

In [None]:
# Quartiles and interquartile range (IQR) of the UVI measurements

Q1_UVI = df_UVI['UVI'].quantile(0.25)
Q2_UVI = df_UVI['UVI'].quantile(0.5)
Q3_UVI = df_UVI['UVI'].quantile(0.75)

IQR_UVI = Q3_UVI - Q1_UVI

# Q2 is the median, i.e. the 50th percentile (previously mislabelled as "5.")
print(f"25. Perzentil: {round(Q1_UVI, 3)}, 50. Perzentil: {round(Q2_UVI, 3)}, 75. Perzentil: {round(Q3_UVI, 3)}, IQR: {round(IQR_UVI, 3)}")

In [None]:
# Standardise the UVI series: (value - mean) / standard deviation

mittelwert = df_UVI['UVI'].mean()
standardabweichung = df_UVI['UVI'].std()

# Single-column frame holding the z-score of every sample
df_ZScore = pd.DataFrame({'Z-Score': (df_UVI['UVI'] - mittelwert) / standardabweichung})
plt.plot(df_ZScore)

## Plotten der Dataframes:

### Plot der Tagesdurchschnitte:

In [None]:
# Plot of the daily UVI sum.
# The column is selected before aggregating so that only 'UVI' is summed
# instead of every column of the frame (including any non-numeric ones).
df_UVI.groupby('Date')[['UVI']].sum().plot()
plt.title('Plot der Tagessumme')
plt.xticks(rotation=45);

In [None]:
# Mean UVI for each time of day ('Uhrzeit').
# Selecting 'UVI' before aggregating avoids averaging every column, which
# raises a TypeError on non-numeric columns in pandas >= 2.0.
df_UVI.groupby('Uhrzeit')[['UVI']].mean().plot()
# The title now describes what is plotted (a mean of UVI, not a sum of GHI/DNI/DHI)
plt.title('Mittlerer UVI nach Uhrzeit');

In [None]:
# Full UVI time series at its original resolution
df_UVI.loc[:, ['UVI']].plot()
plt.title('UVI über die Zeit');

In [None]:
def plot_daily_mean_data(df1, df2, value_column1='UVI', value_column2='Wert2'):
    """Plot the daily means of one column from each of two frames in one figure.

    Args:
        df1: DataFrame that can be resampled by day and contains ``value_column1``.
        df2: DataFrame that can be resampled by day and contains ``value_column2``.
        value_column1: Column of ``df1`` to average per day.
        value_column2: Column of ``df2`` to average per day.

    Raises:
        KeyError: If a requested column is missing from its frame.
    """
    # Average only the requested columns. Resampling the whole frame is
    # wasteful and fails on non-numeric columns in recent pandas versions.
    daily1 = df1[value_column1].resample('D').mean()
    daily2 = df2[value_column2].resample('D').mean()

    plt.figure(figsize=(12, 6))
    plt.plot(daily1.index, daily1, label='DataFrame 1', marker='o')
    plt.plot(daily2.index, daily2, label='DataFrame 2', marker='x')

    plt.title('Tägliche Messwerte')
    plt.xlabel('Datum')
    plt.ylabel('Durchschnittlicher Messwert')
    plt.legend()
    plt.grid(True)

    # Rotate the x tick labels for better readability
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# plot_daily_mean_data(df_UVI, df_CAMS)

### Plot der täglichen Verläufe:

In [None]:
# Measured UVI/SZA side by side with the selected forecast columns,
# aligned on the datetime index
df_plot_daily = pd.concat(
    objs=[
        df_UVI[['UVI', 'SZA']],
        df_CAMS[['aod469', 'uvbed', 'uvbedcs', 'tcc']],
    ],
    axis='columns',
)

In [None]:
def create_daily_plots(df, columns_to_plot):
    """Draw one figure per calendar day with each requested column on its own y-axis.

    Args:
        df: DataFrame with a DatetimeIndex; rows are grouped by ``df.index.date``.
        columns_to_plot: Names of the columns to draw. The first column uses the
            left y-axis, every further column gets an additional right-hand axis.

    Raises:
        KeyError / ValueError: If a requested column is not present in ``df``
            (raised by the plotting call).
    """
    columns = list(columns_to_plot)
    if not columns:
        return

    sns.set(style="whitegrid")
    colors = ['b', 'r', 'g', 'k', 'y', 'm', 'c']

    for date, group in df.groupby(df.index.date):
        fig, ax_main = plt.subplots(figsize=(10, 6))
        handles, labels = [], []
        ax = ax_main

        for pos, column in enumerate(columns):
            if pos > 0:
                ax = ax_main.twinx()
                # Shift every extra spine further right so tick labels do not overlap
                ax.spines.right.set_position(("axes", 1 + 0.11 * (pos - 1)))

            # Plot only the rows of the current day (`group`), not the whole frame
            sns.lineplot(data=group, x=group.index, y=column, label=column,
                         marker='o', color=colors[pos % len(colors)],
                         ax=ax, legend=False)

            axis_handles, axis_labels = ax.get_legend_handles_labels()
            handles += axis_handles
            labels += axis_labels

        # The x-axis of a twin axes is hidden, so the label goes on the main axes
        ax_main.set_xlabel('Uhrzeit')
        ax_main.set_title(f'Tagesverlauf für den {date}')
        # One combined legend on the top-most axes so no line is drawn over it
        ax.legend(handles, labels, loc='upper left')

        plt.show()
        # Release the figure; one figure per day adds up quickly otherwise
        plt.close(fig)

In [None]:
# Columns handed to the plotting helper
cols_to_plot = ['UVI', 'SZA', 'aod469', 'uvbed', 'uvbedcs', 'tcc']
# Restrict the plots to the first 300 rows of the combined frame
create_daily_plots(df_plot_daily.iloc[:300], cols_to_plot)

## Korrelation der stündlichen Mittelwerte:

Berechnen der Variabilitätsindizes:

In [None]:
# Hourly UVI/SZA means (zero readings excluded) next to the forecast features,
# aligned on the datetime index
df_corr_hour = pd.concat(
    objs=[
        df_UVI_hourly[['UVI', 'SZA']],
        df_CAMS[['aod469', 'aod550', 'hcc', 'mcc', 'lcc', 'tcc', 'uvbed', 'uvbedcs']],
    ],
    axis='columns',
)

In [None]:
# Annotated heatmap of the full correlation matrix on an enlarged figure.
plt.figure(figsize=(18, 8))
# The colour scale is pinned to [-1, 1]; every cell shows its coefficient with
# two decimals. The returned Axes is kept in `heatmap` for further tweaks.
heatmap = sns.heatmap(
    data=df_corr_hour.corr(),
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    square=True,
)
# `pad` is the distance between the title and the top of the heatmap.
heatmap.set_title(
    'Korrelation zwischen UVI und meteorologischen Daten',
    fontdict=dict(fontsize=12),
    pad=12,
);

In [None]:
# Only the 'UVI' column of the correlation matrix: UVI against every feature
sns.heatmap(data=df_corr_hour.corr().loc[:, ['UVI']], annot=True)
plt.title('Korrelation zwischen UVI und Features');

## Korrelation zwischen den Solarstrahlungsdaten und dem UVI:

In [None]:
# UVI/SZA at measurement resolution next to the selected Solys columns,
# aligned on the datetime index
df_corr = pd.concat(
    objs=[
        df_UVI[['UVI', 'SZA']],
        df_Solys[['Glo', 'Dif', 'Glo_SPLite', 'Dir', 'Temp']],
    ],
    axis='columns',
)

In [None]:
# Only the 'UVI' column of the correlation matrix: UVI against every Solys feature
sns.heatmap(data=df_corr.corr().loc[:, ['UVI']], annot=True)
plt.title('Korrelation zwischen UVI und Features');

## Paarweiser Vergleich des UVI und der Vorhersagewerte:

In [None]:
# Scatter-plot matrix for every pair of columns in the hourly correlation frame
sns.set()  # switch seaborn back to its default theme
g = sns.pairplot(data=df_corr_hour)

# Figure-level title, placed slightly above the grid of panels
g.fig.suptitle("Pairwise Relations between Features", y=1.03);

## Effekt der Solarstrahlung auf den UVI:

In [None]:
# Monthly sums of UVI, SZA and all Solys columns, shown on a log10 scale.
# (A dead `df_Solys, df_UVI` expression statement was removed; it had no effect.)
sns.set()
df_Solys_UVI_1M = pd.concat([df_UVI[['UVI', 'SZA']], df_Solys], axis = 1).resample("1M").sum()
# NOTE: log10 turns zero sums into -inf and negative sums into NaN
df_Solys_UVI_1M = df_Solys_UVI_1M.agg(np.log10)

# Line plot of the log-scaled monthly sums
df_Solys_UVI_1M.plot(figsize=(7, 6), xlabel="Date", ylabel="Log Scale");

# The title describes the plotted data; the former text was copied from an
# unrelated energy-production analysis.
plt.title("Monatssummen von UVI, SZA und Solys-Messwerten (log10)");

# Anchor the legend at the upper-right corner of the axes
plt.legend(bbox_to_anchor=(1, 1));

## Effekt der gemessenen und vorhergesagten Temperatur auf den UVI:

In [None]:
# Proportions of the binned temperature categories
df_Solys_UVI = pd.concat([df_UVI[['UVI', 'SZA']], df_Solys], axis = 1)

# Bin 'Temp' into six labelled ranges. Plain column assignment replaces the
# column, so it takes on the categorical dtype. `.loc[:, "Temp"] = ...` instead
# tries to write the labels into the existing numeric column, which in
# pandas >= 2 upcasts with a warning or fails.
df_Solys_UVI["Temp"] = pd.cut(
    df_Solys_UVI["Temp"],
    bins=[-np.inf, 0, 15, 20, 25, 30, np.inf],  # 7 edges -> 6 labelled bins
    labels=['Very Cold', 'Cold', 'Standard', 'Ordinary', 'Room Temp', 'Moderate'],
).cat.remove_unused_categories()  # drop categories that never occur

sns.set()  # set the seaborn theme to default
# One bar per category, height = relative frequency
sns.barplot(data=df_Solys_UVI.Temp.value_counts(normalize=True).to_frame().T)

# Axis labels and title
plt.xlabel("Air Temperature")
plt.ylabel("Proportions")
plt.title("Distribution of Air Temperature");

In [None]:
# UVI per temperature category, split by the level of the 'Glo' column
sns.set()
# Bin 'Glo' into three equal-width ranges. Plain column assignment is used so
# the column becomes categorical instead of writing labels into the numeric
# column through `.loc[:, "Glo"]`.
df_Solys_UVI["Glo"] = pd.cut(
    df_Solys_UVI["Glo"],
    bins=3,
    labels=["Low", "Moderate", "High"],
).cat.remove_unused_categories()

plt.subplots(figsize=(15, 4))  # set the figure dimensions
# showfliers=False hides the outlier markers; it replaces the former
# matplotlib-specific `sym=""`, which is not a documented seaborn argument.
sns.boxplot(data=df_Solys_UVI, x="Temp", hue="Glo", y="UVI", showfliers=False)

# Axis labels and title
plt.xlabel("Air Temperature")
plt.ylabel("UVI")
plt.title("Effect of Air Temperature on UVI by Solar Radiation");

In [None]:
# Display the combined frame with the binned 'Temp' and 'Glo' columns
df_Solys_UVI

## Korrelation zwischen der Strahlungsmessung und dem UVI

In [None]:
# UVI/SZA combined with all Solys columns, aligned on the datetime index
df_Solys_UVI_corr = pd.concat(objs=[df_UVI[['UVI', 'SZA']], df_Solys], axis='columns')

sns.set()  # switch seaborn back to its default theme
# Scatter-plot matrix for every pair of columns
g = sns.pairplot(data=df_Solys_UVI_corr)

# Figure-level title, placed slightly above the grid of panels
g.fig.suptitle("Pairwise Relations between Features", y=1.03);

In [None]:
# Scatter plot of the 'Glo' column against 'UVI'
sns.scatterplot(x='UVI', y='Glo', data=df_Solys_UVI_corr)
plt.show()

## Finden von fehlenden Werten (NaN) in den kombinierten UVI- und Solys-Daten

In [None]:
# Rows of the combined UVI/Solys frame in which at least one column is NaN
df_Solys_UVI_corr.loc[df_Solys_UVI_corr.isna().any(axis='columns')]

In [None]:
# Display the combined UVI/Solys frame
df_Solys_UVI_corr