<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/1_2_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Verbinden mit der Google Drive und Import der benötigten Module:

In [1]:
# Verbinden mit der Google-Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import io, os, sys, setuptools, tokenize

In [3]:
# !pip install pvlib

In [4]:
## import modules

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import pvlib

from datetime import datetime
from datetime import timedelta
from prophet import Prophet
from math import sqrt

from scipy.fft import fft

In [5]:
# Größe der Plots festlegen
matplotlib.rcParams['figure.figsize'] = (13, 8)

In [6]:
latitude = 50.2
longitude = 7.8

seconds_in_day = 24*60*60
seconds_in_year = (365.2425)*seconds_in_day

tz, altitude, name = 'Europe/Berlin', 70, 'SanktAugustin'
tus = pvlib.location.Location(latitude, longitude, tz, altitude, name)

In [7]:
# Pfad zur CSV-Datei auf Google Drive
name_Messwerte = 'UVI_Messdaten.csv'
name_Vorhersage = 'Vorhersage.csv'
name_Solar_Messwerte = 'Solys_Messdaten.csv'
name_VarIdx = 'Var_Index.csv'

drive_path = '/content/drive/My Drive/Colab_Notebooks/Clean_Data/'
output_plot_path = '/content/drive/My Drive/Colab_Notebooks/plot_daily_UVI/'

In [8]:
# Import der Daten für die EDA
df_UVI = pd.read_csv(drive_path + name_Messwerte)
df_Solys = pd.read_csv(drive_path + name_Solar_Messwerte)
df_CAMS = pd.read_csv(drive_path + name_Vorhersage)
df_VarIdx = pd.read_csv(drive_path + name_VarIdx)

# Messzeitpunkt in Datetime umwandeln
df_UVI.set_index('Datetime', inplace=True)
df_Solys.set_index('Datetime', inplace=True)
df_CAMS.set_index('Datetime', inplace=True)
df_VarIdx.set_index('Datetime', inplace=True)

# Index in Datetime umwandeln
df_UVI.index = pd.to_datetime(df_UVI.index)
df_Solys.index = pd.to_datetime(df_Solys.index)
df_CAMS.index = pd.to_datetime(df_CAMS.index)
df_VarIdx.index = pd.to_datetime(df_VarIdx.index)

# NaN Einträge durch 0 ersetzen
df_UVI['UVI'] = df_UVI['UVI'].fillna(0)
df_UVI['erythem'] = df_UVI['erythem'].fillna(0)
df_Solys.dropna(inplace = True)

In [9]:
# Stündlicher Durchschitt ohne Nullen
df_UVI_hourly = df_UVI[df_UVI['UVI'] != 0][['UVI','erythem', 'SZA']].resample('H').mean()

In [10]:
# Stündlicher Durchschitt mit Nullen
df_UVI_hourly_withZero = df_UVI[['UVI','erythem', 'SZA']].resample('H').mean()

# Clear-Sky und All-Sky Tage gruppieren:

In [11]:
df_VarIdx_Day = df_VarIdx.resample('D').mean()

In [12]:
df_VarIdx_Day.describe()

Unnamed: 0,index_sigma,index_coimbra,index_stein
count,289.0,289.0,289.0
mean,0.345894,1.052174,9.053808
std,2.767702,6.029707,7.983077
min,0.001988,0.003466,0.386333
25%,0.017134,0.091783,2.673958
50%,0.03274,0.148616,6.33659
75%,0.085007,0.249296,13.126597
max,44.79187,72.502588,40.420145


In [13]:
# Funktion um die Messdaten nach ihrer Variabilität zu klassifizieren
def kategorie_zuordnen(row, avg_values, coimbra):
    avg = avg_values.get(row.name.floor('D'), np.nan)

    if pd.isna(avg):
        return 'Unbekannt'  # Für den Fall, dass kein Durchschnittswert vorhanden ist
    elif 1 < avg < 3:
        return 1, avg, coimbra
    elif 20 < avg < 41:
        return 3, avg, coimbra
    else:
        return 2, avg, coimbra

# Tägliche Durchschnittswerte aus DataFrame A in DataFrame B einfügen
# Anwenden der Funktion und Zuweisung der Ergebnisse zu neuen Spalten
temp_df = df_Solys.apply(kategorie_zuordnen, avg_values = df_VarIdx_Day['index_stein'].to_dict(), coimbra = df_VarIdx_Day['index_coimbra'].to_dict(), axis=1)

# Benenne die Spalten des temporären DataFrame um
temp_df = pd.DataFrame(temp_df.tolist(), index=temp_df.index)

# Füge die neuen Spalten zum ursprünglichen DataFrame hinzu
df_Solys = pd.concat([df_Solys, temp_df[[0, 1, 2]]], axis=1)
df_Solys.rename(columns={0: 'Var', 1: 'index_stein', 2: 'index_coimbra'}, inplace = True)

In [14]:
df_Solys

Unnamed: 0_level_0,Glo,Dif,Glo_SPLite,Dir,Temp,Var,index_stein,index_coimbra
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-06-15 07:21:00,554.483333,72.400000,567.525000,853.966667,27.258333,2,3.245913,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2022-06-15 07:23:00,557.791667,73.475000,571.233333,851.991667,28.058333,2,3.245913,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2022-06-15 07:25:00,561.708333,74.800000,574.766667,850.225000,28.616667,2,3.245913,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2022-06-15 07:27:00,564.783333,75.383333,577.875000,848.400000,26.791667,2,3.245913,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2022-06-15 07:29:00,571.850000,76.233333,584.958333,852.433333,28.600000,2,3.245913,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
...,...,...,...,...,...,...,...,...
2023-08-19 05:48:00,232.516667,148.866667,251.808333,419.441667,24.183333,1,2.673958,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2023-08-19 05:50:00,233.466667,145.483333,252.866667,426.958333,24.375000,1,2.673958,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2023-08-19 05:52:00,215.358333,140.125000,232.941667,356.283333,24.550000,1,2.673958,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."
2023-08-19 05:54:00,215.333333,134.258333,233.008333,372.275000,24.800000,1,2.673958,"{2022-06-15 00:00:00: 0.06926161090556342, 202..."


In [15]:
df_Solys_VarHigh = pd.merge(df_Solys[['Glo', 'Dir', 'Var', 'index_stein', 'index_coimbra']][df_Solys['Var'] == 3], df_UVI['UVI'], left_index=True, right_index=True, how='inner')
# df_Solys_VarHigh = pd.merge(df_Solys_VarHigh, df_CAMS[['uvbed', 'uvbedcs']], left_index=True, right_index=True, how='inner')

df_Solys_VarLow = pd.merge(df_Solys[['Glo', 'Dir', 'Var', 'index_stein', 'index_coimbra']][df_Solys['Var'] == 1], df_UVI['UVI'], left_index=True, right_index=True, how='inner')
# df_Solys_VarLow = pd.merge(df_Solys_VarLow, df_CAMS[['uvbed', 'uvbedcs']], left_index=True, right_index=True, how='inner')

In [16]:
def plot_Var_days(df, save_dir, Var):

    # Erstelle einen Ordner im Drive, falls nicht vorhanden
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Gruppiere die Daten nach Datum (Tag)
    grouped = df.groupby(df.index.date)

    for date, group in grouped:
        fig, ax1 = plt.subplots(figsize=(10, 6))

        # Erste Achse für 'Glo'
        color = 'tab:red'
        sns.lineplot(x=group.index, y=group['Glo'], ax=ax1, color=color, label='Glo')
        ax1.set_xlabel('Uhrzeit')
        ax1.set_ylabel('Globalstrahlung [W/m2]', color=color)
        ax1.tick_params(axis='y', labelcolor=color)

        ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))

        # Zweite Achse für 'UVI'
        ax2 = ax1.twinx()
        color = 'tab:blue'
        sns.lineplot(x=group.index, y=group['UVI'], ax=ax2, color=color, label='UVI')
        ax2.set_ylabel('UV-Index', color=color)
        ax2.tick_params(axis='y', labelcolor=color)

        index_stein = group['index_stein'].iloc[0]
        index_coimbra = group['index_coimbra'].iloc[0]

        plt.title(f'Daten für {Var}:{date} \n Index Stein: {index_stein}, Index Coimbra: {index_coimbra},')
        fig.tight_layout()

        # Speichere den Plot im vorher erstellten Ordner
        file_name = f'{date}.png'
        file_path = os.path.join(save_dir, file_name)
        plt.savefig(file_path)
        plt.close()  # Schließe die Figur, um Ressourcen freizugeben

In [17]:
plot_Var_days(df_Solys_VarHigh, '/content/drive/My Drive/Colab_Notebooks/HighVarDays_Plots', 'High-Var-Tag')

In [18]:
plot_Var_days(df_Solys_VarLow, '/content/drive/My Drive/Colab_Notebooks/LowVarDays_Plots', 'Low-Var-Tag')

KeyboardInterrupt: 

ValueError: Image size of 229340x568 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 1000x600 with 2 Axes>

## Dataframes hoher und niedriger Variabilität:

In [None]:
# Korrelation an Tagen mit hoher Variabilität
sns.heatmap(df_Solys_VarHigh[['UVI', 'Glo', 'Dir']].corr()[['UVI']], annot=True)
plt.title('Korrelation zwischen UVI und Glo/CAMS');
plt.savefig('/content/drive/My Drive/Colab_Notebooks/plot_TSA/UVI_Cams.png')

In [None]:
# Korrelation an Tagen mit niedriger Variabilität
sns.heatmap(df_Solys_VarLow[['UVI', 'Glo', 'Dir']].corr()[['UVI']], annot=True)

# Univarainte Analyse:

In [None]:
# Dateitypen der einzelnen Spalten
print(df_UVI.dtypes, df_Solys.dtypes, df_CAMS.dtypes)

In [None]:
# Allgemeine Informationen über die Messdaten
round(df_UVI.describe(), 2).T

In [None]:
# Allgemeine Informationen über die Messdaten
round(df_Solys.describe(), 2).T

In [None]:
# Allgemeine Informationen über die Vorhersagedaten
round(df_CAMS.describe(), 2).T

In [None]:
# Allgemeine Informationen über die Vorhersagedaten
round(df_VarIdx.describe(), 2).T

In [None]:
# Prüfen, ob es Nullwerte in den Daten gibt
print(df_UVI.info(), '\n --------------------- \n', df_UVI.isnull().sum())

In [None]:
# Allgemeine Informationen über die stündlich aufgelösten Messdaten
df_UVI_hourly.info()

In [None]:
df_UVI_hourly_withZero.info()

In [None]:
df_CAMS.info()

## Histogram der einzelnen Werte:

In [None]:
# Verteilung der gemessenen UVI-Werte
n, bins, patches = plt.hist(
x=df_UVI['UVI'], bins='auto',
alpha=0.7, rwidth=0.85
)
plt.grid(axis='y', alpha=0.75)
plt.title('Histogram UVI Verteilung')
plt.xlabel('UVI')
plt.ylabel('Frequency')

In [None]:
# Verteilung der SZA über alle Messdaten
n, bins, patches = plt.hist(
x=df_UVI['SZA'], bins='auto',
alpha=0.7, rwidth=0.85
)
plt.grid(axis='y', alpha=0.75)
plt.title('Histogram SZA Verteilung')
plt.xlabel('SZA')
plt.ylabel('Frequency')

## Outliner finden:

In [None]:
# Interquartile Range berechnen

Q1_UVI = df_UVI['UVI'].quantile(0.25)
Q2_UVI = df_UVI['UVI'].quantile(0.5)
Q3_UVI = df_UVI['UVI'].quantile(0.75)

IQR_UVI = Q3_UVI - Q1_UVI

print(f"25. Perzentil: {round(Q1_UVI, 3)}, 5. Perzentil: {round(Q2_UVI, 3)}, 75. Perzentil: {round(Q3_UVI, 3)}, IQR: {round(IQR_UVI, 3)}")

In [None]:
# Z-score über alle Messdaten berechnen

df_ZScore = pd.DataFrame()

mittelwert = df_UVI['UVI'].mean()
standardabweichung = df_UVI['UVI'].std()

df_ZScore['Z-Score'] = (df_UVI['UVI'] - mittelwert) / standardabweichung
plt.plot(df_ZScore)
plt.title('Z-Score des UVI')

## Plotten der Dataframes:

In [None]:
# Plot der Erythem-Tagessumme
df_UVI.groupby('Date').sum()[['erythem']].plot()
plt.title('Plot der Erythem-Tagessumme');
plt.xticks(rotation=45)

In [None]:
# Average DNI/GHI/DHI over DOY.

# df_UVI.groupby('Uhrzeit').mean()[['UVI']].plot()
# plt.title('Durchschnitt des UVI über die Tageszeit');

# Multivarainte Analyse:

## Korrelation der Stündlichen Mittelwerte:

In [None]:
# Dataframe für den Plot erstellen
df_corr_hour = pd.concat([df_UVI_hourly[['UVI', 'SZA']], df_CAMS[['aod469', 'aod550', 'hcc', 'mcc', 'lcc', 'tcc', 'uvbed', 'uvbedcs']]], axis = 1)

In [None]:
# Increase the size of the heatmap.
plt.figure(figsize=(18, 8))
# Store heatmap object in a variable to easily access it when you want to include more features (such as title).
# Set the range of values to be displayed on the colormap from -1 to 1, and set the annotation to True to display the correlation values on the heatmap.
heatmap = sns.heatmap(df_corr_hour.corr(), fmt='.2f', vmin=-1, vmax=1, annot=True, square=True)
# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Korrelation zwischen UVI und meteorologischen Daten', fontdict={'fontsize':12}, pad=12);
plt.savefig('/content/drive/My Drive/Colab_Notebooks/plot_TSA/UVI_Cams.png')

In [None]:
sns.heatmap(df_corr_hour[['UVI','aod469', 'aod550', 'hcc', 'mcc', 'lcc', 'tcc', 'uvbed', 'uvbedcs']].corr()[['UVI']], annot=True)
plt.title('Korrelation zwischen UVI und den meteorologischen Daten');
plt.savefig('/content/drive/My Drive/Colab_Notebooks/plot_TSA/UVI_Cams.png')

## Korrelation zwischen den Solarstrahlungsdaten und dem UVI:

In [None]:
df_corr = pd.concat([df_UVI[['UVI', 'SZA']], df_Solys[['Glo', 'Dif', 'Glo_SPLite', 'Dir', 'Temp']]], axis = 1)

In [None]:
sns.heatmap(df_corr.corr()[['UVI']], annot=True)
# plt.title('Korrelation zwischen UVI und SOLYS Daten');
plt.savefig('/content/drive/My Drive/Colab_Notebooks/plot_TSA/UVI_Solys.png')

## Paarweise Vergleich des UVI und der Vorhersagewerte:

In [None]:
# plot pairwise relationships of the dataset
sns.set()  # set the seaborn's theme as default
g = sns.pairplot(df_corr_hour)

# set the title
g.fig.suptitle("Pairwise Relations between Features", y=1.03);

## Effekt der Solarstrahlung auf den UVI:

In [None]:
df_Solys, df_UVI

# check the relationship between `Radiation` and `SystemProduction`
sns.set()
df_Solys_UVI_1M = pd.concat([df_UVI[['UVI', 'SZA']], df_Solys], axis = 1).resample("1M").sum()
df_Solys_UVI_1M = df_Solys_UVI_1M.agg(np.log10)

# plot the line graph
df_Solys_UVI_1M.plot(figsize=(7, 6), xlabel="Date", ylabel="Log Scale");

# set the title
plt.title("Relationship between Solar Radiation, Sunshine & Energy Production");

# set the legend to outside the grid
plt.legend(bbox_to_anchor=(1, 1));

## Effekt der gemessenen und Vorhergesagten Temperatur auf den UVI:

In [None]:
# check the proportions of the AirTemperature categories
df_Solys_UVI = pd.concat([df_UVI[['UVI', 'SZA']], df_Solys], axis = 1)

df_Solys_UVI.loc[:, "Temp"] = pd.cut(df_Solys_UVI.Temp,
                                         bins=[-np.inf, 0, 15, 20, 25, 30, np.inf], # get the bins for a total of 6 labels
                                         labels=['Very Cold', 'Cold', 'Standard', 'Ordinary', 'Room Temp', 'Moderate'])\
                                    .cat.remove_unused_categories() # remove the empty categories

sns.set()  # set the seaborn theme to default
sns.barplot(data=df_Solys_UVI.Temp.value_counts(normalize=True).to_frame().T)  # plotting a barplot

# set the labels and title
plt.xlabel("Air Temperature")
plt.ylabel("Proportions")
plt.title("Distribution of Air Temperature");

In [None]:
# Effect of Air Temperature on Power Generation by Sunlight
sns.set()

plt.subplots(figsize=(15, 4))  # set the figure dimensions
sns.boxplot(data=df_Solys_UVI, x="Temp", hue="Glo", y="UVI")  # plot a boxplot with no outliers

# set the labels and title
plt.xlabel("Air Temperature")
plt.ylabel("UVI")
plt.title("Effect of Air Temperature on UVI by Solar Radiation");

## Korrelation zwischen der Strahlungsmessung und dem UVI

In [None]:
df_Solys_UVI_corr = pd.concat([df_UVI[['UVI', 'SZA']], df_Solys], axis = 1)

sns.set()  # set the seaborn's theme as default
g = sns.pairplot(df_Solys_UVI_corr)

# set the title
g.fig.suptitle("Pairwise Relations between Features", y=1.03);

In [None]:
sns.scatterplot(data=df_Solys_UVI_corr, x='UVI', y='Glo')
plt.show()

## Finden von Nullwerten in den UVI Daten

In [None]:
# Zeilen an denen die Solys Daten NaN-Einträge besitzen
df_Solys_UVI_corr[df_Solys_UVI_corr.isnull().any(axis=1)]

In [None]:
df_Solys_UVI_corr