In [None]:
import pandas as pd
import os
import datetime as dt
import numpy as np

## Dataset Description

| column name | description | uom | type | format |
|---|---|---|---|---|
| STATIONS_ID | DWD Station ID | | NUMBER | |
| MESS_DATUM | reference date for the measurement | | NUMBER | YYYYMMDD |
| QN_592 | the code of the quality level reflects the quality control procedure applied on the data | | VARCHAR2 | numerical code |
| ATMO_STRAHL | longwave downward radiation | J/cm^2 | NUMBER | 999999999990 |
| FD_STRAHL | daily sum of diffuse solar radiation | J/cm^2 | NUMBER | 999999999990 |
| FG_STRAHL | daily sum of solar incoming radiation | J/cm^2 | NUMBER | 999999999990 |
| SD_STRAHL | daily sum of sunshine duration | h | NUMBER | 9990 |

In [None]:
import requests, zipfile
from io import BytesIO
from io import StringIO

# data source: https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/solar/
DWD_SOLAR_URL = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/solar/'

# overview of DWD stations
url = DWD_SOLAR_URL + 'ST_Tageswerte_Beschreibung_Stationen.txt'

# fixed-width station list; [1:] drops the first parsed row
# (presumably the separator line below the header - verify against the file)
df_dwd = pd.read_fwf(url,encoding = "ISO-8859-1",colspecs=[(0,5),(6,14),(15,23),(36,38),(43,51),(53,60),(61,84),(142,165)])[1:]
df_dwd.columns = ['Stations_id', 'von_datum', 'bis_datum', 'Stationshoehe', 'geoBreite', 'geoLaenge', 'Stationsname', 'Bundesland']

# getting id's for stations in Bavaria and with data within targeted timeframe
# (dates are compared as YYYYMMDD strings)
mask = (df_dwd.loc[:,'Bundesland'] == 'Bayern') & (df_dwd.loc[:,'von_datum'] < '20150101') & (df_dwd.loc[:,'bis_datum'] >= '20231231')
stations = list(df_dwd.loc[mask,'Stations_id'])

# calendar frame that every station's measurements are merged onto
# NOTE(review): this calendar covers 2023 only, although the stations are selected
# for 2015-2023 and later cells work with a 2015-2023 file - confirm the intended range.
df_stations = pd.DataFrame({'date': pd.date_range(start='1/1/2023', freq='1d', periods=365)})

# data gathering: one zip archive per station
for station in stations:
    # fail loudly on HTTP errors and never hang forever on a stalled connection
    response = requests.get(f'{DWD_SOLAR_URL}tageswerte_ST_{station}_row.zip', timeout=60)
    response.raise_for_status()

    # context managers close the archive and the member handle again;
    # the name `zip` is avoided on purpose so the builtin stays usable in later cells
    with zipfile.ZipFile(BytesIO(response.content)) as archive:
        # the last archive member is taken as the data file
        with archive.open(archive.namelist()[-1]) as member:
            text = member.read().decode('utf-8')

    df_stat = pd.read_csv(StringIO(text),sep=';',parse_dates=['MESS_DATUM'],usecols=['MESS_DATUM','FD_STRAHL','FG_STRAHL','SD_STRAHL'])

    # keep 2023 only (both bounds inclusive)
    in_2023 = (df_stat.loc[:,'MESS_DATUM'] >= '2023-01-01') & (df_stat.loc[:,'MESS_DATUM'] <= '2023-12-31')

    # rename by column name so the result does not depend on the column order in the file
    df_stat = df_stat.loc[in_2023,:].rename(columns={
        'MESS_DATUM': 'date',
        'FD_STRAHL': station+'_FD_STRAHL',
        'FG_STRAHL': station+'_FG_STRAHL',
        'SD_STRAHL': station+'_SD_STRAHL',
    })

    # NOTE(review): inner merge - a day missing for one station is dropped for all stations
    df_stations = df_stations.merge(df_stat,on='date')


df_stations.head()

In [None]:
# data source: https://netztransparenz.tennet.eu/de/strommarkt/transparenz/transparenz-deutschland/netzkennzahlen/tatsaechliche-und-prognostizierte-solarenergieeinspeisung/bayern/
# manually filtered for 2023-01-01 to 2023-12-31

df_en_raw = pd.read_table('../data/solarEnergyFeedIn_BY_2023-01-01_2023-12-31.csv',sep=';',parse_dates=['Datum'],decimal=',')

# daily sums of forecast and actual feed-in, keyed by date
df_en_daily = (
    df_en_raw
    .groupby('Datum', as_index=False)[['Prognostiziert in MW', 'Tatsaechlich in MW']]
    .sum()
    .rename(columns={'Datum': 'date',
                     'Prognostiziert in MW': 'Prog_in_MW',
                     'Tatsaechlich in MW': 'Act_in_MW'})
)

# Attach the sums to the 2023 calendar by date instead of by row position, so a
# missing or differently ordered day becomes NaN instead of silently shifting values.
# NOTE(review): requires 'Datum' to be parsed as datetime by read_table; if it stays
# a string column the merge raises instead of misaligning - check the date format.
df_en = pd.DataFrame({'date': pd.date_range(start='1/1/2023', freq='1d', periods=365)})
df_en = df_en.merge(df_en_daily, on='date', how='left')
df_en.head()

In [None]:
# data source: http://www.marktstammdatenregister.de/MaStR/Einheit/Einheiten/OeffentlicheEinheitenuebersicht?filter=Inbetriebnahmedatum%20der%20Einheit~lt~%2701.01.2024%27~and~Inbetriebnahmedatum%20der%20Einheit~gt~%2731.12.2014%27~and~Bundesland~eq~%271403%27~and~Energietr%C3%A4ger~eq~%272495%27
# filters for Bavaria, solar energy and targeted timeframe already applied
# 465169 data points in 94 csv files, manually downloaded on March 08, 2024 (daily changes to data occur)

def read_and_combine_files(start, end, folder_path, chunk_size=5000,
                           last_file_name='Stromerzeuger_465001_bis_465169.csv'):
    """Read the chunked 'Stromerzeuger' CSV exports and stack them into one DataFrame.

    The regular files are expected to be named
    ``Stromerzeuger_{i}_bis_{i + chunk_size - 1}.csv`` for
    ``i = start, start + chunk_size, ...`` up to and including ``end``.

    Parameters
    ----------
    start : int
        First record number of the first regular file.
    end : int
        Upper bound (inclusive) for the first record number of a regular file.
    folder_path : str
        Directory that contains the CSV files.
    chunk_size : int, default 5000
        Number of records covered by each regular file.
    last_file_name : str or None, default 'Stromerzeuger_465001_bis_465169.csv'
        Name of the final, shorter file that does not follow the regular
        naming scheme. Pass ``None`` to skip it.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise in reading order, with a fresh
        0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist (raised by ``pd.read_csv``).
    ValueError
        If no file is selected at all (raised by ``pd.concat``).
    """
    # names of the regular, full-size files
    file_names = [
        f"Stromerzeuger_{first}_bis_{first + chunk_size - 1}.csv"
        for first in range(start, end + 1, chunk_size)
    ]

    # the trailing partial file is read last
    if last_file_name is not None:
        file_names.append(last_file_name)

    # read every file (semicolon separated) and combine them row-wise
    dfs = [pd.read_csv(os.path.join(folder_path, name), delimiter=';') for name in file_names]
    return pd.concat(dfs, axis=0, ignore_index=True)

folder_path = r'../data/'

combined_df = read_and_combine_files(1, 464999, folder_path)

# daily calendar 2015-01-01 .. 2023-12-31 that the commissioning sums are attached to
df_modules = pd.DataFrame({'date': pd.date_range(start='1/1/2015', end='12/31/2023', freq='1d')})

# keep only the rows whose 'Betriebs-Status' is 'In Betrieb' (independent copy)
mask_combined_df = combined_df['Betriebs-Status'].eq('In Betrieb')
df_combined_filtered = combined_df[mask_combined_df].copy()

# convert 'Inbetriebnahmedatum der Einheit' to datetime; unparseable values become NaT
df_combined_filtered['Inbetriebnahmedatum der Einheit'] = pd.to_datetime(
    df_combined_filtered['Inbetriebnahmedatum der Einheit'],
    format="%d.%m.%Y",
    errors='coerce',
)


def _sum_decimal_comma(values):
    """Sum a string column that uses ',' as decimal separator."""
    return values.str.replace(',', '.').astype(float).sum()


# per commissioning date: total of 'Bruttoleistung der Einheit' and
# 'Nettonennleistung der Einheit', already under their short column names
grouped_sum = (
    df_combined_filtered
    .groupby('Inbetriebnahmedatum der Einheit')
    .agg(
        Bruttoleistung=('Bruttoleistung der Einheit', _sum_decimal_comma),
        Nettoleistung=('Nettonennleistung der Einheit', _sum_decimal_comma),
    )
    .reset_index()
)

# attach the sums to the calendar (days without commissioning get 0),
# then drop the helper key column coming from the right-hand frame
df_modules = (
    df_modules
    .merge(grouped_sum, how='left', left_on='date', right_on='Inbetriebnahmedatum der Einheit')
    .fillna(0)
    .drop(columns=['Inbetriebnahmedatum der Einheit'])
)

# show the result
df_modules.head()

In [None]:
# alternatively collecting data from 1 downloaded file:
# data source: https://download.marktstammdatenregister.de/Gesamtdatenexport_20240306_23.2.zip (or any current version from 'Gesamtdatenauszug vom Vortag' at https://www.marktstammdatenregister.de/MaStR/Datendownload)
# files: EinheitenSolar_1-39.xml, already unpacked in /data


#import xml.etree.ElementTree as ET
#
#files = list(range(1,40)) # manually setting numbers 1-39
# creating empty dataframe
#df_mod_raw = pd.DataFrame(columns=['Datum', 'PLZ', 'Bruttoleistung','Nettonennleistung','Inbetriebnahme'])
#counter = 0
#for file in files:
#    source = 'data/EinheitenSolar_'+str(file)+'.xml'
#    tree = ET.parse(source)
#    root = tree.getroot()
#    
#    for einheit in root.findall('EinheitSolar'):
#        try: 
#            land = einheit.find('Bundesland').text
#        except:
#            land = 'Unbekannt'
#        if land == '1403': # Bundesland 1403 = Bayern
#            try: 
#                inbetrieb = einheit.find('Inbetriebnahmedatum').text
#            except:
#                inbetrieb = 'Ausser_Betrieb'
#            
#            if inbetrieb.startswith(('2015','2016','2017','2018','2019','2020','2021','2022','2023')):  
#                plz = einheit.find('Postleitzahl').text 
#                brutto = float(einheit.find('Bruttoleistung').text)
#                netto = float(einheit.find('Nettonennleistung').text)
#                datum = einheit.find('DatumLetzteAktualisierung').text
#        
#                datum = datum[:10]
#                df_mod_raw.loc[counter] = [datum,plz,brutto,netto,inbetrieb]
#                counter = counter + 1

In [None]:
# merging dataframes on their shared 'date' column
# DataFrame.merge takes the *other* frame as its first argument; the previous call
# passed df_modules again as `right` and df_en in the position of `how`.
df_final_raw_2015_2023 = df_modules.merge(df_en, on = 'date')
df_final_raw_2015_2023 = df_final_raw_2015_2023.merge(df_stations, on = 'date')
# NOTE(review): both merges are inner joins, so only dates present in all three frames remain.
#df_final_raw_2015_2023.to_csv('../CSV/df_final_raw_2015_2023.csv')

In [None]:
# first EDA of raw dataframe
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

df_raw = pd.read_csv('../CSV/df_final_raw_2015_2023.csv',index_col=0,parse_dates=['date'])
df_raw.info()
df_raw.head()

In [None]:
df_raw.describe()

In [None]:
df_raw.plot(x='date',y='05792_FD_STRAHL',figsize=(10,6)) #example of stations-file for -999 values

In [None]:
df_raw.loc[:,'05792_FD_STRAHL'].plot(kind='hist',figsize=(10,6),bins=100)
# values to be imputed: -999, 0 , since no solar radiation at all seems physically not plausible

In [None]:
df_raw.plot(x='date',y='03668_SD_STRAHL',figsize=(10,6))

In [None]:
df_raw.loc[:,'03668_SD_STRAHL'].plot(kind='hist',figsize=(10,6),bins=100)
# values to be imputed: -999

In [None]:
df_raw.plot(x='date',y='Act_in_MW',figsize=(10,6)) 

In [None]:
df = pd.read_csv('../CSV/df_final_raw_2015_2023.csv',index_col=0,parse_dates=['date'])

df = df.drop('Prog_in_MW',axis=1) # similar to target column, high danger of data leakage

# Placeholder/implausible readings are replaced by NaN via Series.mask. This keeps the
# columns numeric (float) instead of writing pd.NA into them, which the later
# correlation and median steps depend on.

# sunshine duration columns: values < 0 (i.a. -999) become NaN
cols_sd = [col for col in df.columns if 'SD' in col]
for col in cols_sd:
    df[col] = df[col].mask(df[col] < 0)

# radiation columns: values < 1 (-999 and 0) become NaN
cols_fd_fg = [col for col in df.columns if 'FD' in col or 'FG' in col]
for col in cols_fd_fg:
    df[col] = df[col].mask(df[col] < 1)

df.info()
display(df.head())


In [None]:
# correlation matrix of all columns, drawn as an annotated heatmap
fig, ax = plt.subplots(figsize=(15, 15))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)

# tilt the x tick labels so the long column names stay readable
plt.xticks(rotation=-45)

# tighten the layout before rendering
plt.tight_layout()
plt.show()

In [None]:
display(df.plot(x='date',y='05792_FD_STRAHL',figsize=(10,6))) #with NaNs

# imputing implausible values
# Each NaN is replaced by the mean of two medians:
#   * the median over all columns of the same measurement type on that row (same day)
#   * the median of the affected column over all rows with the same day and month
# Values are written back immediately, so earlier imputations feed into later ones.
# NOTE(review): the last three columns are excluded from the NaN search - confirm this is intended.
day_of_month = df['date'].dt.day
month_of_year = df['date'].dt.month

# np.argwhere yields (row position, column position) pairs in row-major order
for row, col_pos in np.argwhere(df.iloc[:,:-3].isna().to_numpy()):
    col_name = df.columns[col_pos]

    # getting all SD, FG or FD columns if applicable (identical physical measurements in Bavaria):
    # everything after the first five characters of the column name is the shared suffix
    cols_na = [col for col in df.columns if col.endswith(str(col_name)[5:])]

    # rows that share day and month with the affected row (column 0 holds the date)
    row_date = df.iloc[row,0]
    same_day = (day_of_month == row_date.day) & (month_of_year == row_date.month)

    # getting the median of specific day from corresponding columns;
    # min_count=1 keeps all-NaN columns as NaN instead of turning them into 0,
    # so missing stations no longer pull the median towards zero
    med_row = df.loc[(df.index == row) & same_day, cols_na].sum(min_count=1).median()

    # getting the median of NaN column for specific day of the year
    med_col = df.loc[same_day, col_name].median()

    # calculating mean of both medians (NaN-aware)
    mean_v = np.nanmean(np.array([med_col,med_row]))

    # imputing
    df.iloc[row,col_pos] = mean_v

display(df.plot(x='date',y='05792_FD_STRAHL',figsize=(10,6))) #with imputed values
df.isna().sum()

In [None]:
# manually gathered from http://www.marktstammdatenregister.de/MaStR/Einheit/Einheiten/OeffentlicheEinheitenuebersicht?filter=Inbetriebnahmedatum%20der%20Einheit~lt~%2701.01.2015%27~and~Energietr%C3%A4ger~eq~%272495%27~and~Bundesland~eq~%271403%27
# brutto vor 2015: 10.941.816 kW
# netto vor 2015: 10.135.888 kW
BRUTTO_BEFORE_2015_KW = 10_941_816
NETTO_BEFORE_2015_KW = 10_135_888

# creating new features: cumulative installed capacity of solar modules for each day
# (capacity already installed before 2015 plus the running sum of the daily additions)
df.loc[:,'Bruttoleistung_kumulativ'] = df.loc[:,'Bruttoleistung'].cumsum()+BRUTTO_BEFORE_2015_KW
df.loc[:,'Nettoleistung_kumulativ'] = df.loc[:,'Nettoleistung'].cumsum()+NETTO_BEFORE_2015_KW

# brutto vor 2024: 22.454.441 kW
# netto vor 2024: 20.404.484 kW
BRUTTO_BEFORE_2024_KW = 22_454_441
NETTO_BEFORE_2024_KW = 20_404_484

# plausibility check: last cumulative value vs. the total reported on the website.
# The gap is given in kW (the unit of the reference values above) and as a real
# percentage (fraction * 100), matching the '%' sign in the output.
brutto_gap = BRUTTO_BEFORE_2024_KW-round(df['Bruttoleistung_kumulativ'].iloc[-1])
netto_gap = NETTO_BEFORE_2024_KW-round(df['Nettoleistung_kumulativ'].iloc[-1])
print('Difference in Brutto kW of extracted data and direct information from website:',brutto_gap,'(',round(100*brutto_gap/BRUTTO_BEFORE_2024_KW,1),'%)')
print('Difference in Netto kW of extracted data and direct information from website:',netto_gap,'(',round(100*netto_gap/NETTO_BEFORE_2024_KW,1),'%)')

# development of the cumulative installed capacity over the covered period
ax = df.plot(x='date',y='Bruttoleistung_kumulativ',figsize=(10,6),
             title='Cumulative gross capacity of solar modules')
ax.set(xlabel='date', ylabel='Bruttoleistung_kumulativ [kW]');

#df.to_csv('../CSV/df_solar_energy_2015_2023.csv')

In [None]:
# EDA for timeseries

# independent copy of the cleaned frame, indexed by date for the time-series steps
df_ts = df.copy().set_index('date')
df_ts.head()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
# Decompose the daily 'Act_in_MW' series into its trend, seasonal and residual
# components, using a seasonal period of 365 observations.
decomposition = seasonal_decompose(df_ts['Act_in_MW'], period=365)

# Plot the decomposed components.
# decomposition.plot() builds and returns its own figure, so no figure is created
# beforehand (a preceding plt.figure() only leaves an empty figure in the output).
fig = decomposition.plot()
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(timeseries, window=52):
    """Plot rolling statistics of a series and print the augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to inspect. Missing values are dropped before the
        Dickey-Fuller test is run.
    window : int, default 52
        Number of observations used for the rolling mean and the rolling
        standard deviation.

    Returns
    -------
    None
        The figure is shown and the test summary is printed.
    """
    # rolling statistics
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    # plot the original series together with its rolling statistics
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(timeseries, color='blue', label='Original')
    ax.plot(rolmean, color='red', label='Rolling Mean')
    ax.plot(rolstd, color='black', label='Rolling Std')
    ax.legend(loc='best')
    ax.set_title('Rolling Mean & Standard Deviation')
    plt.show()

    # augmented Dickey-Fuller test; NaNs (e.g. left over from differencing) are removed first
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.dropna(), autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    # the fifth element of the result maps significance levels to critical values
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

# example call with the 'Act_in_MW' column
test_stationarity(df_ts['Act_in_MW'])

In [None]:
# first difference: change of 'Act_in_MW' from one row to the next
df_ts['first_difference'] = df_ts['Act_in_MW'].diff()
test_stationarity(df_ts['first_difference'].dropna())

In [None]:
# example: computing the first difference
df_ts['first_difference'] = df_ts['Act_in_MW'] - df_ts['Act_in_MW'].shift(1)

# example: computing the seasonal difference (lag of 365 rows) of the first difference
df_ts['seasonal_first_difference'] = df_ts['first_difference'].diff(365)

# test for stationarity
test_stationarity(df_ts['seasonal_first_difference'].dropna())

In [None]:
import statsmodels.api as sm
# autocorrelation (top) and partial autocorrelation (bottom) of the seasonally
# differenced series; the first 366 rows are skipped
seasonal_diff = df_ts.seasonal_first_difference.iloc[366:]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,8))
fig = sm.graphics.tsa.plot_acf(seasonal_diff, lags=20, ax=ax1)
fig = sm.graphics.tsa.plot_pacf(seasonal_diff, lags=20, ax=ax2)

In [None]:
#todos:
#add markdown text for explanation
#pyplot for figures, improve labeling

#reorganize folders in repo (/data/)