In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

### Pre-processing

In [None]:
# Read each raw CSV export into its own DataFrame.
# A029 is not loaded (its lines were disabled in this cell); add it to both
# tuples below to bring it back.
df_A005, df_A006, df_A028, df_A030 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'complete_A005.csv',
        'complete_A006.csv',
        'complete_A028.csv',
        'complete_A030.csv',
    )
)

# Name of the timestamp column in the raw files.
datetime_column = 'Intervallbeginn (Lokalzeit)'

# Convert the timestamp strings ('DD.MM.YYYY HH:MM:SS') to pandas datetimes,
# overwriting the column on each frame. No separate Date/Time columns are
# derived at this stage.
for raw_frame in (df_A005, df_A006, df_A028, df_A030):
    raw_frame[datetime_column] = pd.to_datetime(
        raw_frame[datetime_column], format='%d.%m.%Y %H:%M:%S'
    )

Aggregating the measurements to hourly sums and splitting the timestamp into separate Year, Month, Day and Hour columns that a model can use as features

In [None]:
def _to_hourly_features(frame, source_column='Intervallbeginn (Lokalzeit)'):
    """Aggregate one frame to hourly sums and add calendar columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose ``source_column`` already holds parsed datetimes.
    source_column : str
        Name of the timestamp column; it is renamed to ``'Datetime'``.
        If the frame already has a ``'Datetime'`` column instead (e.g. the
        cell is run a second time), the rename is a no-op.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per hour: the ``'Datetime'`` column, the
        hourly sums of the remaining columns, and ``Year``, ``Month``,
        ``Day`` and ``Hour`` columns taken from the timestamp. The input
        frame is left unmodified.
    """
    hourly = (
        frame
        .rename(columns={source_column: 'Datetime'})
        .set_index('Datetime')
        # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated
        # since pandas 2.2 and triggers a FutureWarning.
        .resample('h')
        .sum()
        .reset_index()
        # Kept from the original pipeline: replace any remaining NaN with 0.
        .fillna(0)
    )

    # Split the timestamp into plain columns appended after the sums.
    stamps = hourly['Datetime'].dt
    return hourly.assign(
        Year=stamps.year,
        Month=stamps.month,
        Day=stamps.day,
        Hour=stamps.hour,
    )


# Apply the same hourly aggregation to every loaded frame; each name is
# rebound to its aggregated version, as before.
df_A005 = _to_hourly_features(df_A005)
df_A006 = _to_hourly_features(df_A006)
df_A028 = _to_hourly_features(df_A028)
df_A030 = _to_hourly_features(df_A030)

Saving the pre-processed data

In [None]:
# Write every pre-processed frame to its own CSV file, without the row index.
# A029 is not exported (its export line was disabled in this cell).
for hourly_frame, out_name in (
    (df_A005, 'data_A005.csv'),
    (df_A006, 'data_A006.csv'),
    (df_A028, 'data_A028.csv'),
    (df_A030, 'data_A030.csv'),
):
    hourly_frame.to_csv(out_name, index=False)