In [1]:
import shutil
import os

# Remove the "logs" directory left behind by an earlier run, if there is one.
log_dir = "logs"
if not os.path.exists(log_dir):
    print("No logs directory found")
else:
    shutil.rmtree(log_dir)
    print("Logs directory cleared")

Logs directory cleared


In [2]:
from utils import start_tensorboard

# Project helper from utils; presumably launches TensorBoard so the training
# curves can be followed while the models below are fitted — confirm in utils.
start_tensorboard()

2025-11-28 09:24:01.630902: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




# Introduction to Keras and Layer types
Previously you built a simple artificial neural net (ANN) with one hidden layer and sigmoid activation function. Today we will take a look at a few adaptations of such a basic net both in terms of NN architecture as well as layer types. We will still stay with Fully Connected Neural Networks for this first exercise!

things to go through:
- activation functions (issue with sigmoid -> ReLU, Leaky ReLU)
- dropout
- skip connections
- deeper neural nets
- potential issues of Fully Connected Neural Networks (Scaling?)
- Keras things like functional API

## Setup things
Make sure your environment has all the required packages available. Take care to have
- keras (Read [This short guide](https://keras.io/getting_started/))
- a backend of your choosing for Keras (I don't care which one you use, but we will stay with TensorFlow for now)
- scikit-learn
- pandas
- tensorboard (optional)

Feel free to use a package manager of your choice (avoid conda) or a premade environment in Azure.
Also make sure you have the data files you need ready; I recommend putting them into a `./data/` subdirectory.

## Loading and preparing the dataset


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the raw measurements from the ./data/ sub-directory.
df = pd.read_csv("data/energydata_complete.csv")

# For timeseries such as this, are rows really independent?
# Is there a separate approach to this problem? What patterns do we lose by treating the dataset like this?
timestamps = pd.to_datetime(df['date'])

# Expand the parsed timestamp into plain calendar features; the parsed
# datetime itself is not kept as a column.
df = df.assign(
    dayinweek=timestamps.dt.day_of_week,
    month=timestamps.dt.month,
    hour=timestamps.dt.hour,
    minutes=timestamps.dt.minute,
)

# how do we keep it random but reproducible?

# "Appliances" is the regression target; the raw "date" string is not a feature.
features = df.drop(columns=["Appliances", "date"])
target = df['Appliances']
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)
print(X_train.shape, X_test.shape)

# Fit the scaler on the training split only and reuse it for the test split.
sd = StandardScaler()
X_train_scaled = sd.fit_transform(X_train)
X_test_scaled = sd.transform(X_test)
X_train.head(3)

(14801, 31) (4934, 31)


Unnamed: 0,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,dayinweek,month,hour,minutes
8242,0,21.0,38.163333,18.5,40.326667,20.23,38.0,19.926667,34.526667,18.73,...,75.333333,4.666667,40.0,0.6,6.455464,6.455464,1,3,22,40
10603,0,21.0,40.45,18.39,44.09,22.0,38.29,19.856667,38.863333,19.323333,...,97.0,6.0,50.833333,5.916667,6.797277,6.797277,4,3,8,10
18910,0,24.69,49.476,24.32973,47.159009,26.834,43.84,23.997297,46.901351,22.927027,...,82.333333,2.0,40.0,14.433333,8.52416,8.52416,6,5,0,40


## Vanishing Gradient Problem with Sigmoid

Sigmoid's derivative ranges from 0 to 0.25 (the maximum is reached at input 0, the middle of the curve).

During backpropagation, gradients multiply layer-by-layer: 0.25 × 0.25 × 0.25...

**Result:** Gradients shrink exponentially in deeper networks → early layers barely learn. **Deeper Networks fail with sigmoid**

**Solution:** Use e.g. ReLU (derivative = 1 for positive inputs).

In [4]:
import os

# Keras reads KERAS_BACKEND when it is first imported in this kernel, so the
# variable has to be set before that import.
# NOTE(review): utils is imported in an earlier cell; if utils itself imports
# keras, this assignment comes too late to have any effect — confirm.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and dropout masks repeat on every "Restart & Run All".
# 42 matches the random_state used for the train/test split.
# Some GPU kernels may still introduce small run-to-run differences.
keras.utils.set_random_seed(42)

print(f"Using Keras {keras.__version__} with backend: {keras.backend.backend()}")

Using Keras 3.12.0 with backend: tensorflow


In [5]:
import keras
from utils import train_model,eval_model

# Deep fully connected regressor with a sigmoid activation in every hidden layer.
n_features = X_train_scaled.shape[1]

sigmoid_layers = [keras.layers.Input(shape=(n_features,))]
sigmoid_layers += [
    keras.layers.Dense(width, activation="sigmoid")
    for width in (512, 256, 128, 64, 32)
]
# Single linear unit: the network predicts one continuous value.
sigmoid_layers.append(keras.layers.Dense(1, activation=None))

model_sigmoid = keras.models.Sequential(sigmoid_layers)
model_sigmoid.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_sigmoid.summary()

I0000 00:00:1764322001.098192    6065 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14793 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0001:00:00.0, compute capability: 7.5


In [6]:
# Deep fully connected regressor with a ReLU activation in every hidden layer.
relu_widths = (512, 256, 128, 64, 32)

model_relu = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    *(keras.layers.Dense(width, activation="relu") for width in relu_widths),
    keras.layers.Dense(1, activation=None),  # linear output unit
])

model_relu.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_relu.summary()

In [7]:
# Deep fully connected regressor where each hidden Dense layer is followed by
# a separate LeakyReLU layer (slope 0.01 for negative inputs).
leaky_layers = [keras.layers.Input(shape=(X_train_scaled.shape[1],))]
for width in (512, 256, 128, 64, 32):
    leaky_layers.append(keras.layers.Dense(width))  # linear; activation is the next layer
    leaky_layers.append(keras.layers.LeakyReLU(negative_slope=0.01))
leaky_layers.append(keras.layers.Dense(1, activation=None))

model_leaky_relu = keras.models.Sequential(leaky_layers)
model_leaky_relu.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_leaky_relu.summary()

In [8]:
def _fit_on_train(net, run_name):
    """Train `net` on the scaled training split under the given run name."""
    # The run name is presumably used by utils.train_model to label the
    # TensorBoard log of this run — confirm in utils.
    return train_model(net, X_train_scaled, y_train, run_name)


# Fit the three activation variants back to back on the same data.
history_sigmoid = _fit_on_train(model_sigmoid, "sigmoid_deep")
history_relu = _fit_on_train(model_relu, "relu_deep")
history_leaky_relu = _fit_on_train(model_leaky_relu, "leaky_relu_deep")

Epoch 1/50


2025-11-28 09:26:46.034966: I external/local_xla/xla/service/service.cc:163] XLA service 0x78bbe8002880 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-11-28 09:26:46.034985: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-11-28 09:26:46.105259: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-11-28 09:26:46.376421: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
I0000 00:00:1764322007.927475    6689 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 19283.8359 - mae: 92.2565 - val_loss: 18211.6582 - val_mae: 87.9893
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 17966.8828 - mae: 84.8941 - val_loss: 17052.3359 - val_mae: 81.1344
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 16888.1621 - mae: 78.2817 - val_loss: 16050.7002 - val_mae: 74.8012
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15943.1680 - mae: 72.1568 - val_loss: 15169.9463 - val_mae: 68.8446
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15110.7812 - mae: 66.4718 - val_loss: 14392.2549 - val_mae: 63.4386
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 14375.8105 - mae: 61.3111 - val_loss: 13709.2451 - val_mae: 58.4513
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [9]:
# eval on test set
from utils import eval_model

# Score each of the three activation variants on the held-out, scaled test split.
# The string argument is the title shown in the printed report of each model.
# The last call is left as a bare expression, so its return value (shown in the
# recorded output as an (MSE, MAE, R²) tuple) is also echoed as the cell result.
eval_model(model_sigmoid, X_test_scaled, y_test, "Sigmoid Deep Network")
eval_model(model_relu, X_test_scaled, y_test, "ReLU Deep Network")
eval_model(model_leaky_relu, X_test_scaled, y_test, "Leaky ReLU Deep Network")


  Sigmoid Deep Network - Test Results
  Test Loss (MSE): 9,104.31
  Test MAE:        50.20
  R² Score:        0.0835 (8.35% variance explained)


  ReLU Deep Network - Test Results
  Test Loss (MSE): 6,479.95
  Test MAE:        40.17
  R² Score:        0.3477 (34.77% variance explained)


  Leaky ReLU Deep Network - Test Results
  Test Loss (MSE): 6,491.58
  Test MAE:        38.00
  R² Score:        0.3465 (34.65% variance explained)



(6491.57958984375, 37.995059967041016, 0.34652000665664673)

## Dropout - Preventing Overfitting

Deep networks can **memorize the training data** instead of learning patterns that generalize to unseen data.

**Dropout** randomly sets a percentage of neurons to 0 during each training step (e.g., 30% dropout → 30% of neurons turned off).

This forces the network to learn **robust features** that don't rely on a specific small subset of neurons. We will see in the TensorBoard curves that the model learns more slowly but is also less likely to overfit.

In [10]:
## Dropout
# ReLU network (512 -> 32 units) with a Dropout layer after every hidden Dense layer.
# Each entry is (units, dropout rate); the two narrowest layers use the lower rate.
dropout_plan = ((512, 0.3), (256, 0.3), (128, 0.3), (64, 0.2), (32, 0.2))

dropout_layers = [keras.layers.Input(shape=(X_train_scaled.shape[1],))]
for width, rate in dropout_plan:
    dropout_layers += [
        keras.layers.Dense(width, activation="relu"),
        keras.layers.Dropout(rate),
    ]
dropout_layers.append(keras.layers.Dense(1, activation=None))

model_with_dropout = keras.models.Sequential(dropout_layers)
model_with_dropout.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_with_dropout.summary()

In [11]:
# Train the dropout network for 50 epochs on the scaled training split.
# "with_dropout" is the run name (presumably the TensorBoard label — confirm in utils.train_model).
history_with_dropout = train_model(model_with_dropout, X_train_scaled, y_train, "with_dropout", epochs=50)

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 10987.4287 - mae: 58.4575 - val_loss: 8610.9893 - val_mae: 55.8835
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9557.0576 - mae: 53.7434 - val_loss: 8770.8076 - val_mae: 51.3907
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9251.0391 - mae: 52.5881 - val_loss: 8170.5386 - val_mae: 47.4784
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9108.9258 - mae: 52.3160 - val_loss: 8051.7231 - val_mae: 44.7002
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8895.9346 - mae: 51.0622 - val_loss: 7868.9009 - val_mae: 44.4100
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8682.9902 - mae: 50.1359 - val_loss: 8094.6699 - val_mae: 42.3243
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [12]:
# Score the dropout network on the held-out test split; as the only expression in
# the cell, the call's return value is also echoed as the cell result.
eval_model(model_with_dropout, X_test_scaled, y_test, "ReLU with Dropout")


  ReLU with Dropout - Test Results
  Test Loss (MSE): 6,366.24
  Test MAE:        38.32
  R² Score:        0.3591 (35.91% variance explained)



(6366.236328125, 38.32337188720703, 0.3591378331184387)

## Batch Normalization (Batchnorm)
**Problem:** During training, the distribution of inputs to each layer changes as previous layers update (internal covariate shift), slowing down training.

**Batch Normalization** normalizes layer inputs by computing mean and standard deviation across the batch, then scaling and shifting with learned parameters.

Batchnorm is optional; in the end, its effect on performance should be tested empirically.


In [13]:
# ReLU network in which every hidden block is Dense -> BatchNormalization -> ReLU -> Dropout.
# Each entry is (units, dropout rate).
batchnorm_plan = ((512, 0.3), (256, 0.3), (128, 0.3), (64, 0.2), (32, 0.2))

batchnorm_layers = [keras.layers.Input(shape=(X_train_scaled.shape[1],))]
for width, rate in batchnorm_plan:
    batchnorm_layers.extend([
        keras.layers.Dense(width),  # linear; the activation is applied after normalisation
        keras.layers.BatchNormalization(),
        keras.layers.Activation("relu"),
        keras.layers.Dropout(rate),
    ])
batchnorm_layers.append(keras.layers.Dense(1, activation=None))

model_with_batchnorm = keras.models.Sequential(batchnorm_layers)
model_with_batchnorm.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_with_batchnorm.summary()

In [14]:
# Train the batchnorm + dropout network for 50 epochs on the scaled training split.
# "with_dropout_and_batchnorm" is the run name (presumably the TensorBoard label — confirm in utils.train_model).
history_with_batchnorm = train_model(model_with_batchnorm, X_train_scaled, y_train, "with_dropout_and_batchnorm", epochs=50)

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - loss: 19226.9629 - mae: 93.8755 - val_loss: 17628.1152 - val_mae: 90.1470
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 15372.6152 - mae: 77.9366 - val_loss: 13211.6660 - val_mae: 72.1231
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 11405.7930 - mae: 61.2458 - val_loss: 9978.3887 - val_mae: 57.9611
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 9367.0654 - mae: 54.7200 - val_loss: 8333.6895 - val_mae: 49.9826
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8649.6504 - mae: 52.0581 - val_loss: 7633.7500 - val_mae: 46.7768
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8377.4834 - mae: 50.2309 - val_loss: 7531.3257 - val_mae: 49.5651
Epoch 7/50
[1m370/370[0m [32m━━━

In [15]:
# Score the batchnorm + dropout network on the held-out test split; as the only
# expression in the cell, the call's return value is also echoed as the cell result.
eval_model(model_with_batchnorm, X_test_scaled, y_test, "ReLU with Dropout and Batchnorm")


  ReLU with Dropout and Batchnorm - Test Results
  Test Loss (MSE): 6,268.93
  Test MAE:        39.04
  R² Score:        0.3689 (36.89% variance explained)



(6268.9306640625, 39.04313659667969, 0.36893320083618164)


## Keras Functional API

The **Sequential API** is simple but limited - layers stack linearly, no branching or multiple inputs/outputs. It is quite rigid in how we can define and interact with the architecture.

The **Functional API** is more flexible (and follows typical functional programming styles - think lambda):
- Multiple inputs/outputs
- Layer sharing
- Branching and merging
- Skip connections (residual networks)
- also it allows us to dynamically construct different architectures e.g. for hyperparam search.


In [16]:
# Functional API version of dropout model
inputs = keras.layers.Input(shape=(X_train_scaled.shape[1],))

x = keras.layers.Dense(512, activation="relu")(inputs)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(64, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

x = keras.layers.Dense(32, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

outputs = keras.layers.Dense(1, activation=None)(x)

model_functional = keras.Model(inputs=inputs, outputs=outputs, name="functional_dropout")
model_functional.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_functional.summary()

In [17]:
# Your turn, implement a skip connection from the first dense+dropout layer block to the last (excluding output layer)