In [1]:
import shutil
import os

if os.path.exists("logs"):
    shutil.rmtree("logs")
    print("Logs directory cleared")
else:
    print("No logs directory found")

Logs directory cleared


In [2]:
from utils import start_tensorboard

start_tensorboard()

2025-11-28 13:39:58.215276: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




# Introduction to Keras and Layer types
Previously you built a simple artificial neural net (ANN) with one hidden layer and sigmoid activation function. Today we will take a look at a few adaptations of such a basic net both in terms of NN architecture as well as layer types. We will still stay with Fully Connected Neural Networks for this first exercise!

things to go through:
- activation functions (issue with sigmoid -> ReLU, Leaky ReLU)
- drop out
- skip connections
- deeper neural nets
- potential issues of Fully Connected Neural Networks (Scaling?)
- Keras things like functional API

## Setup things
Make sure your environment has all the required packages available. Take care to have
- keras (Read [This short guide](https://keras.io/getting_started/))
- a backend of your choosing for Keras (I dont care which one you use but we will stay with Tensorflow for now)
- scikit-learn
- pandas
- tensorboard (optional)

Feel free to use a package manager of your choice (avoid conda) or a premade environment in Azure.
Also make sure you have the data files you need ready, I recommend putting them into a ./data/ subdir

## Loading and preparing the dataset


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("data/energydata_complete.csv")

# For timeseries such as this, are rows really independent?
# Is there a separate approach to this problem? What patterns do we lose by treating the dataset like this?
df['date_time'] = pd.to_datetime(df['date'])

df['dayinweek'] = df['date_time'].dt.day_of_week
df['month'] = df['date_time'].dt.month
df['hour'] = df["date_time"].dt.hour
df['minutes'] = df['date_time'].dt.minute
df.drop(columns=['date_time'], inplace=True)

# how do we keep it random but reproducible?

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["Appliances","date"]), df['Appliances'],random_state=42)
print(X_train.shape, X_test.shape)

sd = StandardScaler()
X_train_scaled = sd.fit_transform(X_train)
X_test_scaled = sd.transform(X_test)
X_train.head(3)

(14801, 31) (4934, 31)


Unnamed: 0,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,dayinweek,month,hour,minutes
8242,0,21.0,38.163333,18.5,40.326667,20.23,38.0,19.926667,34.526667,18.73,...,75.333333,4.666667,40.0,0.6,6.455464,6.455464,1,3,22,40
10603,0,21.0,40.45,18.39,44.09,22.0,38.29,19.856667,38.863333,19.323333,...,97.0,6.0,50.833333,5.916667,6.797277,6.797277,4,3,8,10
18910,0,24.69,49.476,24.32973,47.159009,26.834,43.84,23.997297,46.901351,22.927027,...,82.333333,2.0,40.0,14.433333,8.52416,8.52416,6,5,0,40


## Vanishing Gradient Problem with Sigmoid

Sigmoid's derivative ranges from 0 to 0.25 (max at the middle).

During backpropagation, gradients multiply layer-by-layer: 0.25 × 0.25 × 0.25...

**Result:** Gradients shrink exponentially in deeper networks → early layers barely learn. **Deeper Networks fail with sigmoid**

**Solution:** Use e.g. ReLU (derivative = 1 for positive inputs).

In [4]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
print(f"Using Keras {keras.__version__} with backend: {keras.backend.backend()}")

Using Keras 3.12.0 with backend: tensorflow


In [5]:
import keras
from utils import train_model,eval_model

model_sigmoid = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512, activation="sigmoid"),
    keras.layers.Dense(256, activation="sigmoid"),
    keras.layers.Dense(128, activation="sigmoid"),
    keras.layers.Dense(64, activation="sigmoid"),
    keras.layers.Dense(32, activation="sigmoid"),
    keras.layers.Dense(1, activation=None)
])

model_sigmoid.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_sigmoid.summary()

I0000 00:00:1764337355.284428   66369 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14793 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0001:00:00.0, compute capability: 7.5


In [6]:
model_relu = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512, activation="relu"),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1, activation=None)
])

model_relu.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_relu.summary()

In [7]:
history_sigmoid = train_model(model_sigmoid, X_train_scaled, y_train, "sigmoid_deep")

Epoch 1/50


2025-11-28 13:42:39.807833: I external/local_xla/xla/service/service.cc:163] XLA service 0x785448018040 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-11-28 13:42:39.807852: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-11-28 13:42:39.891155: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-11-28 13:42:40.086265: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
I0000 00:00:1764337361.206580   67001 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 19129.6387 - mae: 91.4346 - val_loss: 18055.2422 - val_mae: 87.0961
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 17827.2129 - mae: 84.0037 - val_loss: 16926.0000 - val_mae: 80.3521
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 16768.1855 - mae: 77.5476 - val_loss: 15938.7588 - val_mae: 74.0716
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 15834.2607 - mae: 71.4547 - val_loss: 15049.3076 - val_mae: 67.9905
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 14928.8896 - mae: 65.2705 - val_loss: 14180.2822 - val_mae: 61.8898
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 14148.4482 - mae: 59.7577 - val_loss: 13470.1846 - val_mae: 56.9659
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [8]:
history_relu = train_model(model_relu, X_train_scaled, y_train, "relu_deep")

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 10089.2734 - mae: 56.3056 - val_loss: 8735.1426 - val_mae: 47.9237
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8761.2227 - mae: 51.3653 - val_loss: 7947.0361 - val_mae: 50.6367
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8337.9375 - mae: 49.6191 - val_loss: 8001.3003 - val_mae: 55.1972
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8062.0005 - mae: 48.4892 - val_loss: 7839.4712 - val_mae: 50.9483
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7756.9282 - mae: 46.8395 - val_loss: 7707.4185 - val_mae: 51.4732
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7479.3350 - mae: 46.1830 - val_loss: 7382.9404 - val_mae: 45.9567
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [9]:
model_leaky_relu = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)), #
    keras.layers.Dense(512),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(256),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(128),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(64),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(32),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(1, activation=None)
])

model_leaky_relu.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_leaky_relu.summary()

In [10]:
history_sigmoid = train_model(model_sigmoid, X_train_scaled, y_train, "sigmoid_deep")
history_relu = train_model(model_relu, X_train_scaled, y_train, "relu_deep")
history_leaky_relu = train_model(model_leaky_relu, X_train_scaled, y_train, "leaky_relu_deep")

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9799.1221 - mae: 52.6296 - val_loss: 9590.9775 - val_mae: 51.4456
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9781.2451 - mae: 53.1472 - val_loss: 9592.0762 - val_mae: 52.0342
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9766.7373 - mae: 53.3801 - val_loss: 9519.6328 - val_mae: 53.5035
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9744.6172 - mae: 53.7025 - val_loss: 9598.3213 - val_mae: 52.4677
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9738.8545 - mae: 53.7047 - val_loss: 9566.7656 - val_mae: 52.9837
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9724.3428 - mae: 53.7566 - val_loss: 9552.9238 - val_mae: 53.3302
Epoch 7/50
[1m370/370[0m [32m━━━━━━━━

In [11]:
# eval on test set
from utils import eval_model

eval_model(model_sigmoid, X_test_scaled, y_test, "Sigmoid Deep Network")
eval_model(model_relu, X_test_scaled, y_test, "ReLU Deep Network")
eval_model(model_leaky_relu, X_test_scaled, y_test, "Leaky ReLU Deep Network")


  Sigmoid Deep Network - Test Results
  Test Loss (MSE): 6,995.87
  Test MAE:        41.24
  R² Score:        0.2958 (29.58% variance explained)


  ReLU Deep Network - Test Results
  Test Loss (MSE): 6,578.61
  Test MAE:        40.52
  R² Score:        0.3378 (33.78% variance explained)


  Leaky ReLU Deep Network - Test Results
  Test Loss (MSE): 6,340.78
  Test MAE:        37.84
  R² Score:        0.3617 (36.17% variance explained)



(6340.7841796875, 37.839847564697266, 0.36170005798339844)

## Dropout - Preventing Overfitting

Deep networks can **memorize trained-on data** instead of learning generalizations.

**Dropout** randomly sets a percentage of neurons to 0 during each training step (e.g., 30% dropout → 30% of neurons turned off).

This forces the network to learn **robust features** that don't rely on a specific small subset of neurons. We will see on the tensorboard curves, that the model is learning slower but also less likely to overfit.

In [12]:
## Dropout 
model_with_dropout = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation=None)
])

model_with_dropout.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_with_dropout.summary()

In [13]:
history_with_dropout = train_model(model_with_dropout, X_train_scaled, y_train, "with_dropout", epochs=50)

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 11192.0879 - mae: 59.2315 - val_loss: 8732.8154 - val_mae: 49.4987
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9604.0322 - mae: 53.8756 - val_loss: 8290.1084 - val_mae: 48.9641
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9320.7207 - mae: 52.5499 - val_loss: 8146.3379 - val_mae: 51.7225
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9087.9004 - mae: 51.8921 - val_loss: 7925.2163 - val_mae: 45.2360
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8894.4902 - mae: 50.8196 - val_loss: 7731.0127 - val_mae: 46.5281
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8709.3252 - mae: 50.4034 - val_loss: 7845.1509 - val_mae: 43.7179
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [14]:
eval_model(model_with_dropout, X_test_scaled, y_test, "ReLU with Dropout")


  ReLU with Dropout - Test Results
  Test Loss (MSE): 6,527.25
  Test MAE:        38.95
  R² Score:        0.3429 (34.29% variance explained)



(6527.25390625, 38.947418212890625, 0.3429288864135742)

# Batchnorm
**Problem:** During training, the distribution of inputs to each layer changes as previous layers update (internal covariate shift), slowing down training.

**Batch Normalization** normalizes layer inputs by computing mean and standard deviation across the batch, then scaling and shifting with learned parameters.

Batchnorm is quite optional, at the end the effect on performance should be tested empirically. 


In [15]:
model_with_batchnorm = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(256),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(128),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation=None)
])

model_with_batchnorm.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_with_batchnorm.summary()

In [16]:
history_with_batchnorm = train_model(model_with_batchnorm, X_train_scaled, y_train, "with_dropout_and_batchnorm", epochs=50)

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - loss: 19177.2949 - mae: 93.5353 - val_loss: 16597.6270 - val_mae: 84.8020
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 15473.8770 - mae: 77.9848 - val_loss: 12693.8477 - val_mae: 68.3124
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 11613.6943 - mae: 61.6725 - val_loss: 9332.9805 - val_mae: 54.2937
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9442.1152 - mae: 53.1804 - val_loss: 8033.9043 - val_mae: 48.2655
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8560.6826 - mae: 49.3759 - val_loss: 7746.2471 - val_mae: 46.4028
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8264.7305 - mae: 49.3899 - val_loss: 7456.4531 - val_mae: 44.2974
Epoch 7/50
[1m370/370[0m [32m━━━

In [17]:
eval_model(model_with_batchnorm, X_test_scaled, y_test, "ReLU with Dropout and Batchnorm")


  ReLU with Dropout and Batchnorm - Test Results
  Test Loss (MSE): 6,458.78
  Test MAE:        39.62
  R² Score:        0.3498 (34.98% variance explained)



(6458.7763671875, 39.62101745605469, 0.3498222827911377)


## Keras Functional API

The **Sequential API** is simple but limited - layers stack linearly, no branching or multiple inputs/outputs. It is quite rigid in how we can define and interact with the architecture.

The **Functional API** is more flexible (and follows typical functional programming styles - think lambda):
- Multiple inputs/outputs
- Layer sharing
- Branching and merging
- Skip connections (residual networks)
- also it allows us to dynamically construct different architectures e.g. for hyperparam search.


In [18]:
# Functional API version of dropout model
inputs = keras.layers.Input(shape=(X_train_scaled.shape[1],))

x = keras.layers.Dense(512, activation="relu")(inputs)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(64, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

x = keras.layers.Dense(32, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

outputs = keras.layers.Dense(1, activation=None)(x)

model_functional = keras.Model(inputs=inputs, outputs=outputs, name="functional_dropout")
model_functional.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_functional.summary()

In [None]:
# Your turn, implement a skip connection from the first dense+dropout layer block to the last (excluding output layer)
#Hint https://keras.io/api/layers/merging_layers/add/

In [None]:
inputs = keras.layers.Input(shape=(X_train_scaled.shape[1],))

x = keras.layers.Dense(256, activation="relu")(inputs)
x = keras.layers.Dropout(0.3)(x)
skip = x

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

x = keras.layers.Add()([x, skip])

x = keras.layers.Dense(64, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

outputs = keras.layers.Dense(1, activation=None)(x)

model_skip = keras.Model(inputs=inputs, outputs=outputs, name="skip_connection")
model_skip.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_skip.summary()


In [None]:
history_skip = train_model(model_skip, X_train_scaled, y_train, "with_skip_connection", epochs=50)


In [None]:
eval_model(model_skip, X_test_scaled, y_test, "Skip Connection Network")
