In [5]:
import shutil
import os

if os.path.exists("logs"):
    shutil.rmtree("logs")
    print("Logs directory cleared")
else:
    print("No logs directory found")

Logs directory cleared


In [6]:
from utils import start_tensorboard

start_tensorboard()



# Introduction to Keras and Layer types
Previously you built a simple artificial neural net (ANN) with one hidden layer and sigmoid activation function. Today we will take a look at a few adaptations of such a basic net both in terms of NN architecture as well as layer types. We will still stay with Fully Connected Neural Networks for this first exercise!

things to go through:
- activation functions (issue with sigmoid -> ReLU, Leaky ReLU)
- drop out
- skip connections
- deeper neural nets
- potential issues of Fully Connected Neural Networks (Scaling?)
- Keras things like functional API

## Setup things
Make sure your environment has all the required packages available. Take care to have
- keras (Read [This short guide](https://keras.io/getting_started/))
- a backend of your choosing for Keras (I dont care which one you use but we will stay with Tensorflow for now)
- scikit-learn
- pandas
- tensorboard (optional)

Feel free to use a package manager of your choice (avoid conda) or a premade environment in Azure.
Also make sure you have the data files you need ready, I recommend putting them into a ./data/ subdir

## Loading and preparing the dataset


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("data/energydata_complete.csv")

# For timeseries such as this, are rows really independent?
# Is there a separate approach to this problem? What patterns do we lose by treating the dataset like this?
df['date_time'] = pd.to_datetime(df['date'])

df['dayinweek'] = df['date_time'].dt.day_of_week
df['month'] = df['date_time'].dt.month
df['hour'] = df["date_time"].dt.hour
df['minutes'] = df['date_time'].dt.minute
df.drop(columns=['date_time'], inplace=True)

# how do we keep it random but reproducible?

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["Appliances","date"]), df['Appliances'],random_state=42)
print(X_train.shape, X_test.shape)

sd = StandardScaler()
X_train_scaled = sd.fit_transform(X_train)
X_test_scaled = sd.transform(X_test)
X_train.head(3)

(14801, 31) (4934, 31)


Unnamed: 0,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,dayinweek,month,hour,minutes
8242,0,21.0,38.163333,18.5,40.326667,20.23,38.0,19.926667,34.526667,18.73,...,75.333333,4.666667,40.0,0.6,6.455464,6.455464,1,3,22,40
10603,0,21.0,40.45,18.39,44.09,22.0,38.29,19.856667,38.863333,19.323333,...,97.0,6.0,50.833333,5.916667,6.797277,6.797277,4,3,8,10
18910,0,24.69,49.476,24.32973,47.159009,26.834,43.84,23.997297,46.901351,22.927027,...,82.333333,2.0,40.0,14.433333,8.52416,8.52416,6,5,0,40


## Vanishing Gradient Problem with Sigmoid

Sigmoid's derivative ranges from 0 to 0.25 (max at the middle).

During backpropagation, gradients multiply layer-by-layer: 0.25 × 0.25 × 0.25...

**Result:** Gradients shrink exponentially in deeper networks → early layers barely learn. **Deeper Networks fail with sigmoid**

**Solution:** Use e.g. ReLU (derivative = 1 for positive inputs).

In [8]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
print(f"Using Keras {keras.__version__} with backend: {keras.backend.backend()}")

Using Keras 3.12.0 with backend: tensorflow


In [9]:
import keras
from utils import train_model,eval_model

model_sigmoid = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512, activation="sigmoid"),
    keras.layers.Dense(256, activation="sigmoid"),
    keras.layers.Dense(128, activation="sigmoid"),
    keras.layers.Dense(64, activation="sigmoid"),
    keras.layers.Dense(32, activation="sigmoid"),
    keras.layers.Dense(1, activation=None)
])

model_sigmoid.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_sigmoid.summary()

I0000 00:00:1767173568.474150   46190 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2131 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [10]:
model_relu = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512, activation="relu"),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1, activation=None)
])

model_relu.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_relu.summary()

In [11]:
history_sigmoid = train_model(model_sigmoid, X_train_scaled, y_train, "sigmoid_deep")

Epoch 1/50


2025-12-31 10:32:49.960218: I external/local_xla/xla/service/service.cc:163] XLA service 0x7c4c5c01e830 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-12-31 10:32:49.960231: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Ti Laptop GPU, Compute Capability 8.6
2025-12-31 10:32:49.981868: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-12-31 10:32:50.112944: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91700
2025-12-31 10:32:50.130886: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:32:

[1m120/370[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - loss: 21171.1797 - mae: 99.3070

I0000 00:00:1767173573.570831   50039 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m335/370[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step - loss: 20666.0096 - mae: 97.4355

2025-12-31 10:32:54.747659: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:32:54.747679: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.





[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 19589.9492 - mae: 93.9026 - val_loss: 18656.9551 - val_mae: 90.4844
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 18524.5762 - mae: 88.0806 - val_loss: 17673.3457 - val_mae: 84.8754
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 17574.7051 - mae: 82.5271 - val_loss: 16798.6738 - val_mae: 79.5558
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 16744.1172 - mae: 77.3870 - val_loss: 16014.8975 - val_mae: 74.5687
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15996.5029 - mae: 72.5270 - val_loss: 15308.3428 - val_mae: 69.8154
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 15319.0596 - mae: 67.9508 - val_loss: 14665.6143 - val_mae: 65.3755
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [12]:
history_relu = train_model(model_relu, X_train_scaled, y_train, "relu_deep")

Epoch 1/50


2025-12-31 10:33:28.146020: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:33:28.146038: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:33:28.146061: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:33:28.146068: I external/l

[1m361/370[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - loss: 12208.0030 - mae: 63.8212

2025-12-31 10:33:32.442910: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:33:32.442929: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 10251.7471 - mae: 56.7631 - val_loss: 8612.5908 - val_mae: 52.3839
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8925.1660 - mae: 52.1865 - val_loss: 8148.7998 - val_mae: 53.1879
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8554.2832 - mae: 50.6555 - val_loss: 7877.8965 - val_mae: 51.5344
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8216.3574 - mae: 49.2369 - val_loss: 7765.6665 - val_mae: 49.9071
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 7911.5933 - mae: 47.7091 - val_loss: 7992.9854 - val_mae: 43.6706
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 7654.1509 - mae: 46.7887 - val_loss: 7716.9971 - val_mae: 42.2088
Epoch 7/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━

In [13]:
model_leaky_relu = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)), #
    keras.layers.Dense(512),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(256),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(128),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(64),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(32),
    keras.layers.LeakyReLU(negative_slope=0.01),
    keras.layers.Dense(1, activation=None)
])

model_leaky_relu.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_leaky_relu.summary()

In [14]:
history_sigmoid = train_model(model_sigmoid, X_train_scaled, y_train, "sigmoid_deep")
history_relu = train_model(model_relu, X_train_scaled, y_train, "relu_deep")
history_leaky_relu = train_model(model_leaky_relu, X_train_scaled, y_train, "leaky_relu_deep")

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9862.8535 - mae: 50.9699 - val_loss: 9571.0566 - val_mae: 51.0550
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9824.3877 - mae: 51.4436 - val_loss: 9565.3936 - val_mae: 51.4369
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9789.7061 - mae: 51.9105 - val_loss: 9559.0078 - val_mae: 51.2827
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9772.4004 - mae: 52.2163 - val_loss: 9513.9814 - val_mae: 52.5325
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9737.7637 - mae: 52.5927 - val_loss: 9506.4258 - val_mae: 51.7898
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9727.4014 - mae: 52.3962 - val_loss: 9488.3643 - val_mae: 51.1578
Epoch 7/50
[1m370/370[0m [32m━━━━━━━━

2025-12-31 10:35:29.681628: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:35:29.681656: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:35:29.681694: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:35:29.681707: I external/l

[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 11886.0342 - mae: 63.4567

2025-12-31 10:35:41.841260: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:35:41.841289: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.





[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step - loss: 10147.9229 - mae: 56.4717 - val_loss: 8761.1621 - val_mae: 60.2613
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8876.9023 - mae: 52.1919 - val_loss: 8240.2441 - val_mae: 49.6351
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8446.1543 - mae: 50.4230 - val_loss: 8183.9844 - val_mae: 54.7292
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8203.0742 - mae: 49.1297 - val_loss: 7595.3936 - val_mae: 50.8336
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7852.0234 - mae: 47.6742 - val_loss: 7505.0483 - val_mae: 46.1712
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7696.9961 - mae: 47.0996 - val_loss: 7653.6685 - val_mae: 51.4325
Epoch 7/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━

In [15]:
# eval on test set
from utils import eval_model

eval_model(model_sigmoid, X_test_scaled, y_test, "Sigmoid Deep Network")
eval_model(model_relu, X_test_scaled, y_test, "ReLU Deep Network")
eval_model(model_leaky_relu, X_test_scaled, y_test, "Leaky ReLU Deep Network")


  Sigmoid Deep Network - Test Results
  Test Loss (MSE): 7,130.80
  Test MAE:        44.73
  R² Score:        0.2822 (28.22% variance explained)







  ReLU Deep Network - Test Results
  Test Loss (MSE): 6,543.41
  Test MAE:        38.29
  R² Score:        0.3413 (34.13% variance explained)


  Leaky ReLU Deep Network - Test Results
  Test Loss (MSE): 6,623.92
  Test MAE:        39.68
  R² Score:        0.3332 (33.32% variance explained)



(6623.919921875, 39.68260192871094, 0.3331978917121887)

## Dropout - Preventing Overfitting

Deep networks can **memorize trained-on data** instead of learning generalizations.

**Dropout** randomly sets a percentage of neurons to 0 during each training step (e.g., 30% dropout → 30% of neurons turned off).

This forces the network to learn **robust features** that don't rely on a specific small subset of neurons. We will see on the tensorboard curves, that the model is learning slower but also less likely to overfit.

In [16]:
## Dropout 
model_with_dropout = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation=None)
])

model_with_dropout.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_with_dropout.summary()

In [17]:
history_with_dropout = train_model(model_with_dropout, X_train_scaled, y_train, "with_dropout", epochs=50)

Epoch 1/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 10964.1367 - mae: 59.0064 - val_loss: 8600.1650 - val_mae: 51.4901
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9598.4199 - mae: 54.3583 - val_loss: 8409.8740 - val_mae: 50.2456
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9347.0293 - mae: 53.0982 - val_loss: 8013.3110 - val_mae: 48.6670
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9102.1582 - mae: 52.1837 - val_loss: 8153.0684 - val_mae: 43.8927
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8941.9707 - mae: 51.7640 - val_loss: 8105.0913 - val_mae: 43.9611
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8708.2979 - mae: 50.9166 - val_loss: 7722.6626 - val_mae: 47.6425
Epoch 7/50
[1m370/370[0m [32m━━━━━━━

In [18]:
eval_model(model_with_dropout, X_test_scaled, y_test, "ReLU with Dropout")


  ReLU with Dropout - Test Results
  Test Loss (MSE): 6,401.54
  Test MAE:        39.21
  R² Score:        0.3556 (35.56% variance explained)



(6401.5390625, 39.21120834350586, 0.35558390617370605)

# Batchnorm
**Problem:** During training, the distribution of inputs to each layer changes as previous layers update (internal covariate shift), slowing down training.

**Batch Normalization** normalizes layer inputs by computing mean and standard deviation across the batch, then scaling and shifting with learned parameters.

Batchnorm is quite optional, at the end the effect on performance should be tested empirically. 


In [19]:
model_with_batchnorm = keras.models.Sequential([
    keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(512),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(256),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(128),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation=None)
])

model_with_batchnorm.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_with_batchnorm.summary()

In [20]:
history_with_batchnorm = train_model(model_with_batchnorm, X_train_scaled, y_train, "with_dropout_and_batchnorm", epochs=50)

Epoch 1/50
[1m362/370[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 19973.1284 - mae: 95.8274

2025-12-31 10:37:20.396533: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:37:20.396551: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.


2025-12-31 10:37:21.847033: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:37:21.847052: I external

[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - loss: 18925.6406 - mae: 92.5392 - val_loss: 16930.7539 - val_mae: 86.7863
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15003.3477 - mae: 75.8577 - val_loss: 12775.1953 - val_mae: 67.7372
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 11116.8555 - mae: 60.0088 - val_loss: 9787.5674 - val_mae: 54.3522
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9151.6035 - mae: 52.9572 - val_loss: 8252.5127 - val_mae: 47.7353
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8566.0127 - mae: 50.3451 - val_loss: 8204.6035 - val_mae: 50.4331
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 8292.3984 - mae: 49.6837 - val_loss: 8047.0605 - val_mae: 47.8134
Epoch 7/50
[1m370/370[0m [32m━━━━━━━━━━━━━

In [21]:
eval_model(model_with_batchnorm, X_test_scaled, y_test, "ReLU with Dropout and Batchnorm")


  ReLU with Dropout and Batchnorm - Test Results
  Test Loss (MSE): 6,284.54
  Test MAE:        38.61
  R² Score:        0.3674 (36.74% variance explained)



(6284.53955078125, 38.61460876464844, 0.3673619031906128)


## Keras Functional API

The **Sequential API** is simple but limited - layers stack linearly, no branching or multiple inputs/outputs. It is quite rigid in how we can define and interact with the architecture.

The **Functional API** is more flexible (and follows typical functional programming styles - think lambda):
- Multiple inputs/outputs
- Layer sharing
- Branching and merging
- Skip connections (residual networks)
- also it allows us to dynamically construct different architectures e.g. for hyperparam search.


In [22]:
# Functional API version of dropout model
inputs = keras.layers.Input(shape=(X_train_scaled.shape[1],))

x = keras.layers.Dense(512, activation="relu")(inputs)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(64, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

x = keras.layers.Dense(32, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

outputs = keras.layers.Dense(1, activation=None)(x)

model_functional = keras.Model(inputs=inputs, outputs=outputs, name="functional_dropout")
model_functional.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_functional.summary()

In [23]:
# Your turn, implement a skip connection from the first dense+dropout layer block to the last (excluding output layer)
#Hint https://keras.io/api/layers/merging_layers/add/

In [24]:
inputs = keras.layers.Input(shape=(X_train_scaled.shape[1],))

x = keras.layers.Dense(256, activation="relu")(inputs)
x = keras.layers.Dropout(0.3)(x)
skip = x

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.3)(x)

x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

x = keras.layers.Add()([x, skip])

x = keras.layers.Dense(64, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)

outputs = keras.layers.Dense(1, activation=None)(x)

model_skip = keras.Model(inputs=inputs, outputs=outputs, name="skip_connection")
model_skip.compile(optimizer="adam", loss="mse", metrics=["mae"])
model_skip.summary()


In [25]:
history_skip = train_model(model_skip, X_train_scaled, y_train, "with_skip_connection", epochs=50)


Epoch 1/50


2025-12-31 10:38:10.321458: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:38:10.321479: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:38:10.321510: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
2025-12-31 10:38:10.321522: I external/l

[1m337/370[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step - loss: 11538.4559 - mae: 62.2654

2025-12-31 10:38:14.505058: I external/local_xla/xla/service/gpu/autotuning/dot_search_space.cc:208] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.



[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 10358.1445 - mae: 56.9446 - val_loss: 9336.3174 - val_mae: 46.4785
Epoch 2/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9439.2158 - mae: 53.0273 - val_loss: 8127.0645 - val_mae: 48.9255
Epoch 3/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9016.0615 - mae: 51.5619 - val_loss: 8159.6738 - val_mae: 44.5611
Epoch 4/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8815.8086 - mae: 50.6300 - val_loss: 7842.3423 - val_mae: 46.8623
Epoch 5/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8520.0918 - mae: 49.0998 - val_loss: 8081.6182 - val_mae: 42.3508
Epoch 6/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8348.3916 - mae: 48.3315 - val_loss: 7459.7256 - val_mae: 45.5031
Epoch 7/50
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━

In [26]:
eval_model(model_skip, X_test_scaled, y_test, "Skip Connection Network")



  Skip Connection Network - Test Results
  Test Loss (MSE): 6,377.97
  Test MAE:        38.43
  R² Score:        0.3580 (35.80% variance explained)



(6377.9677734375, 38.42842102050781, 0.35795682668685913)