In [None]:
# EDA of raw dataframe
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load the raw merged dataset: the first CSV column becomes the index and
# the 'date' column is parsed as datetime.
df_raw = pd.read_csv(
    '../Dataset/preprocessed_data/df_final_raw_2015_2023.csv',
    index_col=0,
    parse_dates=['date'],
)
df_raw.info()
df_raw.head()

In [None]:
# Summary statistics of the raw dataframe (rendered as the cell output).
df_raw.describe()

In [None]:
# Example of a station column that contains -999 values.
df_raw.plot.line(
    x='date',
    y='03668_FD_STRAHL',
    figsize=(10, 5),
)

In [None]:
# Histogram of one FD column.
# Values to be imputed: -999 and 0, since no solar radiation at all does not
# seem physically plausible.
df_raw['05792_FD_STRAHL'].plot.hist(bins=100, figsize=(10, 5))

In [None]:
# Time series of one SD column of the raw data.
df_raw.plot.line(
    x='date',
    y='03668_SD_STRAHL',
    figsize=(10, 5),
)

In [None]:
# Histogram of one SD column.
# Values to be imputed: -999.
df_raw['03668_SD_STRAHL'].plot.hist(bins=100, figsize=(10, 5))

In [None]:
# Time series of the 'Act_in_MW' column of the raw data.
df_raw.plot.line(
    x='date',
    y='Act_in_MW',
    figsize=(10, 5),
)

In [None]:
# Time series of the 'Bruttoleistung' column of the raw data.
df_raw.plot.line(
    x='date',
    y='Bruttoleistung',
    figsize=(10, 5),
)

In [None]:
# Build the working dataframe: drop the leakage-prone column and turn
# implausible measurements into NaN so they can be imputed afterwards.
# NOTE(review): this cell reads the file from '../CSV/', while the first cell reads
# the same file name from '../Dataset/preprocessed_data/' - confirm both are in sync.
df = pd.read_csv('../CSV/df_final_raw_2015_2023.csv',index_col=0,parse_dates=['date'])

df = df.drop('Prog_in_MW',axis=1) # similar to target column, high danger of data leakage

# SD columns: every value < 0 (among others the -999 placeholder) becomes NaN.
# Series.mask always yields NaN, independent of the column's original dtype,
# whereas assigning pd.NA through .loc relies on dtype-dependent coercion.
cols_sd = [col for col in df.columns if 'SD' in col]
for col in cols_sd:
    df[col] = df[col].mask(df[col] < 0)

# FD and FG columns: every value < 1 (-999 and 0) becomes NaN.
cols_fd_fg = [col for col in df.columns if 'FD' in col or 'FG' in col]
for col in cols_fd_fg:
    df[col] = df[col].mask(df[col] < 1)

df.info()
display(df.head())


In [None]:
# Manual check of the two medians for the FG columns, using 1 January rows
# and the row with index label 365.
cols_fg = [name for name in df.columns if 'FG' in name]
is_jan_first = (df['date'].dt.day == 1) & (df['date'].dt.month == 1)
display(df.loc[is_jan_first, cols_fg])

# median of one station column over all 1 January rows
col_median = df.loc[is_jan_first, '03668_FG_STRAHL'].median()
print(col_median)

# median across all FG columns for the single row with index label 365
# NOTE: .sum() over the selected row(s) skips NaN, so a missing value enters
# this median as 0.0.
row_median = df.loc[(df.index == 365) & is_jan_first, cols_fg].sum().median()
print(row_median)

# mean of both medians (shown as the cell output)
(col_median + row_median) / 2

In [None]:
# Annotated heatmap of the pairwise column correlations.
# NOTE(review): df still holds the datetime 'date' column at this point; depending
# on the pandas version df.corr() may need numeric_only=True - confirm.
fig, ax = plt.subplots(figsize=(15, 15))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap="coolwarm", annot=True, ax=ax)

plt.xticks(rotation=-45)  # rotate x-axis labels for readability
plt.tight_layout()  # adjust spacing for readability
plt.show()

In [None]:
display(df.plot(x='date',y='05792_FG_STRAHL',figsize=(10,5))) #with NaNs

# Imputing implausible values (all NaNs except in the last three columns).
# Each NaN is replaced by the mean of two estimates:
#   * med_row: median of the same row over all columns sharing the column-name
#     suffix after the first five characters (e.g. '_FG_STRAHL'),
#   * med_col: median of the NaN column over all rows with the same day and month.
# Imputation happens in place and in row-major order, so values filled earlier
# take part in the medians computed for later NaNs.

# The calendar parts of 'date' do not change during the loop, so compute them once.
day_of_month = df['date'].dt.day
month_of_year = df['date'].dt.month

r, c = np.where(df.iloc[:,:-3].isna())
for row, col in zip(r, c):
    col_name = df.columns[col]

    # all columns with the same suffix as the NaN column
    cols_na = [name for name in df.columns if name.endswith(str(col_name)[5:])]

    # Median across those columns for this row. The row is addressed by position
    # (r comes from np.where), and NaNs are skipped by median(). The former
    # `.sum().median()` turned every NaN - including the cell being imputed -
    # into 0.0 and thereby pulled the median down.
    med_row = df.iloc[[row]][cols_na].median(axis=1).iloc[0]

    # median of the NaN column for this day of the year
    same_day = (day_of_month == day_of_month.iloc[row]) & (month_of_year == month_of_year.iloc[row])
    med_col = df.loc[same_day, col_name].median()

    # mean of the available medians; if neither exists the cell stays NaN
    estimates = [value for value in (med_col, med_row) if not pd.isna(value)]
    if not estimates:
        continue
    mean_v = np.mean(estimates)

    # imputing
    df.iloc[row, col] = mean_v

display(df.plot(x='date',y='05792_FG_STRAHL',figsize=(10,5))) #with imputed values
df.isna().sum()

In [None]:
# manually gathered from http://www.marktstammdatenregister.de/MaStR/Einheit/Einheiten/OeffentlicheEinheitenuebersicht?filter=Inbetriebnahmedatum%20der%20Einheit~lt~%2701.01.2015%27~and~Energietr%C3%A4ger~eq~%272495%27~and~Bundesland~eq~%271403%27
# gross ("brutto") total of units commissioned before 2015: 10,941,816 kW
# net ("netto") total of units commissioned before 2015: 10,135,888 kW
BRUTTO_BEFORE_2015_KW = 10941816
NETTO_BEFORE_2015_KW = 10135888

# reference totals taken from the website for the end of the covered period
# gross before 2024: 22,454,441 kW
# net before 2024: 20,404,484 kW
BRUTTO_BEFORE_2024_KW = 22454441
NETTO_BEFORE_2024_KW = 20404484

# new features: running sum of the per-row 'Bruttoleistung' / 'Nettoleistung'
# values on top of the totals that already existed before 2015
df.loc[:,'Bruttoleistung_kumulativ'] = df.loc[:,'Bruttoleistung'].cumsum()+BRUTTO_BEFORE_2015_KW
df.loc[:,'Nettoleistung_kumulativ'] = df.loc[:,'Nettoleistung'].cumsum()+NETTO_BEFORE_2015_KW

# Sanity check of the final cumulative value against the website totals.
# The columns are addressed by name instead of by position, the deviation is
# reported as a real percentage (it used to be a fraction labelled '%'), and the
# unit is kW to match the reference totals above.
brutto_diff = BRUTTO_BEFORE_2024_KW - round(df['Bruttoleistung_kumulativ'].iloc[-1])
netto_diff = NETTO_BEFORE_2024_KW - round(df['Nettoleistung_kumulativ'].iloc[-1])
print('Difference in Brutto kW of extracted data and direct information from website:',brutto_diff,'(',round(100*brutto_diff/BRUTTO_BEFORE_2024_KW,2),'%)')
print('Difference in Netto kW of extracted data and direct information from website:',netto_diff,'(',round(100*netto_diff/NETTO_BEFORE_2024_KW,2),'%)')

# development of the cumulative gross value from 2015-2023
df.plot(x='date',y='Bruttoleistung_kumulativ',figsize=(10,5))

df.to_csv('../Dataset/preprocessed_data/df_solar_energy_2015_2023.csv')

In [None]:
# Time series of the 'Act_in_MW' column of the raw data.
df_raw.plot.line(
    x='date',
    y='Act_in_MW',
    figsize=(10, 5),
)

In [None]:
# Scatter of 'Act_in_MW' over time with a regression line through the June-August
# rows, and 'Bruttoleistung_kumulativ' on a secondary y axis.
#
# 'date' is converted to proleptic Gregorian ordinals (presumably because the
# regression plot needs a numeric x axis). As before, df['date'] holds ordinals
# after this cell. The conversion is guarded so that re-running the cell works:
# previously the second run failed because '.dt' is not available on integers.
if pd.api.types.is_datetime64_any_dtype(df['date']):
    dates = df['date']
    # plain column replacement gives the column the new integer dtype
    df['date'] = dates.map(pd.Timestamp.toordinal)
else:
    # already converted by an earlier run: rebuild timestamps for the month mask
    dates = df['date'].map(lambda ordinal: pd.Timestamp.fromordinal(int(ordinal)))

summer = (dates.dt.month <= 8) & (dates.dt.month >= 6)
df_s = df.loc[summer,:]

fig, ax1 = plt.subplots(figsize=(10,5))
sns.regplot(data=df_s, x='date', y='Act_in_MW', ax=ax1, color='magenta', scatter_kws={'s': 2})
df.plot.scatter(x='date',y='Act_in_MW',ax=ax1,alpha=0.2)
# limit the x axis to the first and last date of the data
ax1.set_xlim(df['date'].iloc[0], df['date'].iloc[-1])
ax2 = ax1.twinx()
df.plot(x='date',y='Bruttoleistung_kumulativ',ax=ax2,color='black')

# translate the ordinal tick positions back into calendar dates for the labels
xticks = ax1.get_xticks()
labels = [pd.Timestamp.fromordinal(int(label)).date() for label in xticks]
ax1.set_xticks(xticks)
ax1.xaxis.set_tick_params(labelrotation=45)
ax1.set_xticklabels(labels)

fig.tight_layout()
plt.show()