In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

**Import und Untersuchung der Trainingsdaten**

In [8]:
# Load the training data from the CSV file in the working directory
# (the file name suggests 2019 data at 1h resolution).
data_df = pd.read_csv('2019_Trainingsdaten_1h.csv')
print("Shape (rows, columns):", data_df.shape)

# Count the missing values per column.
# isnull() is merely an alias of isna(); adding both results together would
# report every missing value twice, so a single call is used here.
missing_values = data_df.isna().sum()

print("Number of missing values in each column:")
print(missing_values)

Shape (rows, columns): (43800, 17)
Number of missing values in each column:
MESS_DATUM     0
RWS_DAU_10     0
RWS_10         0
DS_10          0
GS_10          0
SD_10          0
FF_10          0
DD_10          0
PP_10          0
TT_10          0
TM5_10         0
RF_10          0
load           0
Weekday        0
Weekend        0
Month          0
Hour_of_Day    0
dtype: int64


**Trennen der unabhängigen und abhängigen Variablen für das Modelltraining**

In [None]:
# Feature matrix: every column except the target 'load' and 'MESS_DATUM'.
# The result stays a DataFrame so the column names remain available.
x = data_df.drop(columns=['load', 'MESS_DATUM'])
# Target vector: the raw values of the 'load' column.
y = data_df.loc[:, 'load'].values

**Recursive Feature Elimination (RFE)**<br>
Bei der [RFE (scikit-learn)](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html) wird das Modell zunächst mit allen vorhandenen Features trainiert; anschließend wird das Feature mit der geringsten Relevanz entfernt und der Vorgang wiederholt, bis die gewünschte Anzahl an Features übrig ist.

In [None]:
# Recursive feature elimination with a random forest as the ranking estimator.
feature_names = x.columns.tolist()

# A fixed seed makes the forest -- and therefore the resulting feature
# ranking -- reproducible across kernel restarts. 42 matches the seed used
# for the train/test split and the AdaBoost model in this notebook.
model = RandomForestRegressor(random_state=42)

# Instantiate RFE to keep the top k features
num_features_to_select = 8
rfe_selector = RFE(model, n_features_to_select=num_features_to_select)

# Fit the feature selector to the data (fit() returns the selector itself)
rfe_selector = rfe_selector.fit(x, y)

# Boolean mask over the input columns: True for every feature that was kept
selected_features_mask = rfe_selector.support_

# Translate the mask back into column names, preserving the column order
selected_features = [
    name for name, keep in zip(feature_names, selected_features_mask) if keep
]

# Ranking of all features, sorted ascending; selected features have rank 1
feature_ranking = rfe_selector.ranking_
feature_ranking_df = pd.DataFrame({'Feature': feature_names, 'Ranking': feature_ranking})
feature_ranking_df = feature_ranking_df.sort_values(by='Ranking')

print("Feature ranking:")
print(feature_ranking_df)

print("Selected features:")
print(selected_features)

**Modell-Training**<br>
Trainingsdaten: 80%<br>
Validierungsdaten: 20%

In [29]:
# Define the independent and dependent variables: every column except the
# target 'load' and 'MESS_DATUM' (presumably the timestamp -- confirm) is
# used as a feature.
X = data_df.drop(['load', 'MESS_DATUM'], axis=1)  # Keep X as a DataFrame
y = data_df['load'].values

# Split the data into training (80%) and testing (20%) sets.
# NOTE(review): train_test_split shuffles the rows by default; if the rows
# are time-ordered, a chronological split may be the more honest evaluation
# -- confirm which is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestRegressor model. The seed is fixed so that the metrics
# below are reproducible, consistent with the seeded split above and the
# seeded AdaBoost model in this notebook.
random_forest = RandomForestRegressor(n_estimators=15, random_state=42)

# Fit the model on the training data
random_forest.fit(X_train, y_train)

# Make predictions on the test set
predictions = random_forest.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, predictions)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, predictions)

print("RandomForestRegressor")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2 score): {r2}")

RandomForestRegressor
Mean Squared Error (MSE): 67543.53171475706
Root Mean Squared Error (RMSE): 259.89138445657846
R-squared (R2 score): 0.07914003300097772


**Linear Regression**

In [32]:
# Feature matrix (all columns except the target 'load' and 'MESS_DATUM')
# and target vector.
X = data_df.drop(columns=['load', 'MESS_DATUM'])
y = data_df.loc[:, 'load'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares; fit() returns the fitted estimator itself.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Model evaluation: one line per metric, computed on the held-out rows.
for metric_label, metric_fn in (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
    ('r2_score : ', r2_score),
):
    print(metric_label, metric_fn(y_test, predictions))

mean_squared_error :  60195.49047093228
mean_absolute_error :  183.47341661953482
r2_score :  0.17932012198228253


**AdaBoostRegressor**

In [6]:
# Feature matrix (all columns except the target 'load' and 'MESS_DATUM')
# and target vector.
X = data_df.drop(columns=['load', 'MESS_DATUM'])
y = data_df.loc[:, 'load'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seeded AdaBoost ensemble with 100 boosting stages.
ada_boost = AdaBoostRegressor(
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)

# Train on the training rows, then predict the held-out rows.
ada_boost.fit(X_train, y_train)
predictions = ada_boost.predict(X_test)

# Error metrics on the held-out rows: MSE, its square root, and R^2.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

# Emit the report as one block; the text is identical line for line.
print("\n".join([
    "AdaBoostRegressor",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R2 score): {r2}",
]))

AdaBoostRegressor
Mean Squared Error (MSE): 61855.778328636996
Root Mean Squared Error (RMSE): 248.70821926232554
R-squared (R2 score): 0.15668445898036099
