In [12]:
import os
import subprocess

# Set the working directory to the root of the Git repository
current_dir = os.getcwd()
git_root = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], cwd=current_dir)
git_root = git_root.decode("utf-8").strip()
os.chdir(git_root)
cwd = os.getcwd()

In [27]:
import pandas as pd
import numpy as np
import math
from datetime import date
import calendar
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
import joblib
from sklearn.preprocessing import OneHotEncoder

### Read Data

In [14]:
df = pd.read_parquet('data/processed/data_building_n.parquet')

def project_date_to_unit_circle(input_date: date):
    year = input_date.year
    passed_days = (input_date - date(year, 1, 1)).days + 1
    nr_of_days_per_year = 366 if calendar.isleap(year) else 365
    position_within_year = passed_days / nr_of_days_per_year
    alpha = position_within_year * math.pi * 2
    year_circle_x = (math.sin(alpha) + 1) / 2
    year_circle_y = (math.cos(alpha) + 1) / 2
    return year_circle_x, year_circle_y

def project_day_of_week_to_unit_circle(input_day_of_week: int):
    alpha = input_day_of_week / 7 * math.pi * 2
    day_of_week_circle_x = (math.sin(alpha) + 1) / 2
    day_of_week_circle_y = (math.cos(alpha) + 1) / 2
    return day_of_week_circle_x, day_of_week_circle_y

# Project the date to a unit circle (year)
df['date_circle_x'], df['date_circle_y'] = zip(*df['date'].apply(project_date_to_unit_circle))

# Project the day_of_week to a unit circle (week)
df['day_of_week_circle_x'], df['day_of_week_circle_y'] = zip(*df['day_of_week'].apply(project_day_of_week_to_unit_circle))

#----- One hot encoding ----------------------------------
encoder = OneHotEncoder(sparse=False)
encoded_columns = encoder.fit_transform(df[['season', 'floor']])

# Neue Spaltennamen für One-Hot-Encoded-Spalten generieren
encoded_column_names = encoder.get_feature_names_out(['season', 'floor'])

# One-Hot-Encoded-Spalten in den DataFrame einfügen
df_encoded = pd.DataFrame(encoded_columns, columns=encoded_column_names)
df = pd.concat([df, df_encoded], axis=1).drop(['season', 'floor'], axis=1)

### Resampling

In [15]:
#----- Resampling ----------------------------------------
df.set_index('date_time', inplace=True)
df_daily = df.groupby('room').resample('D').mean().dropna()

df_daily.reset_index(inplace=True)
df_daily.set_index(['date_time'], inplace=True)

df_daily = df_daily[[
    'room', 'tmp', 'hum', 'CO2', 'VOC', 'outside_tmp', 'outside_hum', 'outside_rain',
    'outside_snowfall', 'outside_wind_speed', 'outside_pressure',
    'date_circle_x', 'date_circle_y', 'day_of_week_circle_x',
    'day_of_week_circle_y', 'season_autumn', 'season_spring',
    'season_summer', 'season_winter', 'floor_0', 'floor_1', 'floor_2',
    'floor_3'
    ]]

### Create Input Data

In [16]:
def create_sequences(data, features, sequence_length):
    sequences = []
    targets = []
    
    for room in data['room'].unique():
        df_room = data[data['room'] == room].reset_index()
        df_room.sort_values('date_time', inplace=True)
        for i in range(len(df_room) - sequence_length):
            # Prüfe, ob die Tage aufeinanderfolgend sind
            if (df_room.loc[i + sequence_length - 1, 'date_time'] - df_room.loc[i, 'date_time']).days == sequence_length - 1:
                sequences.append(df_room.loc[i:i+sequence_length-2, features].drop(columns=['room']).values)
                targets.append(df_room.loc[i + sequence_length -1, 'tmp'])
    
    return np.array(sequences), np.array(targets)

days_used_to_predict = 7 # e.g. when 7 is selected, 6 days are used to predict the 7th day
X, y = create_sequences(df_daily, df_daily.columns, days_used_to_predict)

### Train Test Split

In [17]:
n_bins = 5
kbins = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
y_binned = kbins.fit_transform(y.reshape(-1, 1)).astype(int).reshape(-1)

# Use stratified sampling to split the data into training and test sets with the same distribution of bins
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y_binned)

print("Train Mean:", np.mean(y_train), "Train Std Dev:", np.std(y_train))
print("Test Mean:", np.mean(y_test), "Test Std Dev:", np.std(y_test))

Train Mean: 22.35377949909938 Train Std Dev: 1.8675394707221238
Test Mean: 22.338341034378328 Test Std Dev: 1.866018212817115


### Scaling

In [18]:
x_scaler = MinMaxScaler()

# Forme die Daten um, damit sie vom Scaler verarbeitet werden können
n_samples, n_timesteps, n_features = X_train.shape
X_train_reshaped = X_train.reshape(-1, n_features)
X_test_reshaped = X_test.reshape(-1, n_features)

# Skalierung basierend auf dem Trainingsdatensatz
x_scaler.fit(X_train_reshaped)

# Wende die Skalierung auf Trainings- und Testdatensatz an
X_train_scaled = x_scaler.transform(X_train_reshaped).reshape(n_samples, n_timesteps, n_features)
X_test_scaled = x_scaler.transform(X_test_reshaped).reshape(X_test.shape[0], n_timesteps, n_features)

# Optional: Skalierung der Zielwerte (falls erforderlich)
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1)).reshape(-1)

# Überprüfe die Formen der skalierten Daten
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

X_train_scaled shape: (1562, 6, 22)
X_test_scaled shape: (391, 6, 22)
y_train_scaled shape: (1562,)
y_test_scaled shape: (391,)


['NeuralNetworks/models/2024-06-26_11-55-09/y_scaler.pkl']

In [19]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Convert data to float32
X_train_scaled = X_train_scaled.astype(np.float32)
X_test_scaled = X_test_scaled.astype(np.float32)
y_train_scaled = y_train_scaled.astype(np.float32)
y_test_scaled = y_test_scaled.astype(np.float32)

# Convert numpy arrays to tensors
X_train_scaled_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
X_test_scaled_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_train_scaled_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
y_test_scaled_tensor = torch.tensor(y_test_scaled, dtype=torch.float32).to(device)

In [20]:
batch_size = 32

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(X_train_scaled_tensor, y_train_scaled_tensor)
test_dataset = TensorDataset(X_test_scaled_tensor, y_test_scaled_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define simple neural network
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout_rate=0.2):
        super(SimpleLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)
    
    def forward(self, x):
        h0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        c0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

In [21]:
# Modellinstanziierung
input_size = X_train_scaled.shape[2]
hidden_size = 50
output_size = 1
num_layers = 1
dropout_rate = 0.2

model = SimpleLSTM(input_size, hidden_size, output_size, num_layers, dropout_rate).to(device)

# Loss und Optimizer definieren
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Listen zur Speicherung der Verluste
train_losses = []
test_losses = []

# Training des Modells
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        # Forward pass
        outputs = model(batch_x)
        loss = criterion(outputs.squeeze(), batch_y)
        
        # Backward pass und Optimierung
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_train_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    # Testverlust berechnen
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs.squeeze(), batch_y)
            test_loss += loss.item()
    avg_test_loss = test_loss / len(test_loader)
    test_losses.append(avg_test_loss)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

# Plotten der Verluste
epochs = list(range(1, num_epochs + 1))
fig = go.Figure()
fig.add_trace(go.Scatter(x=epochs, y=train_losses, mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(x=epochs, y=test_losses, mode='lines', name='Test Loss'))
fig.update_layout(title='Train and Test Loss over Epochs',
                   xaxis_title='Epoch',
                   yaxis_title='Loss')
fig.show()

# Rücktransformation der skalierten Vorhersagen und realen Werte (falls erforderlich)
scaler_y = MinMaxScaler()
scaler_y.fit(y_train_scaled.reshape(-1, 1))

all_predictions = []
model.eval()
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        all_predictions.append(outputs.cpu().numpy())

all_predictions = np.concatenate(all_predictions, axis=0)

y_pred = scaler_y.inverse_transform(all_predictions.reshape(-1, 1)).flatten()
y_test_real = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

# Plotten der realen Werte und Vorhersagen
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(y_test_real))), y=y_test_real, mode='lines', name='Real Values'))
fig.add_trace(go.Scatter(x=list(range(len(y_test_real))), y=y_pred, mode='lines', name='Predicted Values'))
fig.update_layout(title='Real vs Predicted Values',
                   xaxis_title='Sample Index',
                   yaxis_title='Value')
fig.show()



Epoch [10/100], Train Loss: 0.0024, Test Loss: 0.0015
Epoch [20/100], Train Loss: 0.0019, Test Loss: 0.0011
Epoch [30/100], Train Loss: 0.0015, Test Loss: 0.0010
Epoch [40/100], Train Loss: 0.0015, Test Loss: 0.0010
Epoch [50/100], Train Loss: 0.0013, Test Loss: 0.0009
Epoch [60/100], Train Loss: 0.0011, Test Loss: 0.0008
Epoch [70/100], Train Loss: 0.0010, Test Loss: 0.0008
Epoch [80/100], Train Loss: 0.0009, Test Loss: 0.0008
Epoch [90/100], Train Loss: 0.0010, Test Loss: 0.0007
Epoch [100/100], Train Loss: 0.0009, Test Loss: 0.0007


## Naive Approach
To get a baseline model, we will use a naive approach.
We will use the mean of the last 7 days as the prediction for the next day.

In [None]:
y_naive = []
for sequence in X_test:
    y_naive.append(sequence[:, 0].mean())

mse_naive = mean_squared_error(y_test, y_naive)
print(f'Mean Squared Error using last day values: {mse_naive:.2f}')

Mean Squared Error using last day values: 0.25


In [22]:
model.eval()
with torch.no_grad():
    X_test_tensor_flattened = X_test_scaled_tensor.view(X_test_scaled_tensor.size(0), -1)
    y_pred_scaled = model(X_test_scaled_tensor).squeeze().cpu().numpy()

# Rücktransformation der skalierten Vorhersagen
y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
y_test = y_scaler.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()

# mse berechnen mit scikit-learn
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')


Mean Squared Error: 0.10


In [24]:
df = pd.DataFrame({'Real': y_test, 'Predicted': y_pred, 'Naive': y_naive})
df.sort_values(by='Real', inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Real,Predicted,Naive
0,19.041290,19.505779,19.367550
1,19.081844,19.416338,18.965121
2,19.122499,21.100498,21.389051
3,19.262127,19.526485,19.538585
4,19.282127,19.616781,19.426512
...,...,...,...
386,26.613609,26.657362,26.678831
387,27.108358,25.635265,25.053971
388,27.258415,26.518957,26.662905
389,27.276836,27.121414,27.311575


In [28]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df['Real'], mode='lines', name='Real Values'))
fig.add_trace(go.Scatter(x=df.index, y=df['Predicted'], mode='lines', name='Predicted Values'))
fig.add_trace(go.Scatter(x=df.index, y=df['Naive'], mode='lines', name='Naive'))
fig.update_layout(title='Real vs Predicted Values',
                   xaxis_title='',
                   yaxis_title='Temerature')



In [26]:
# save the current date and time in the format YYYY-MM-DD_HH-MM-SS
time = str(pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S'))
# create a directory to save the model
os.makedirs(f'NeuralNetworks/models/{time}', exist_ok=True)
fpath_model = f'NeuralNetworks/models/{time}/model.pt'
fpath_x_scaler = f'NeuralNetworks/models/{time}/x_scaler.pkl'
fpath_y_scaler = f'NeuralNetworks/models/{time}/y_scaler.pkl'
fpath_encoder = f'NeuralNetworks/models/{time}/encoder.pkl'

# Save the model
torch.save(model, fpath_model)

# Save the scalers
joblib.dump(x_scaler, fpath_x_scaler)
joblib.dump(y_scaler, fpath_y_scaler)

# Save the encoder
joblib.dump(encoder, fpath_encoder)

['NeuralNetworks/models/2024-06-26_11-58-05/encoder.pkl']