In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

2025-04-26 16:28:16.061624: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-26 16:28:16.073995: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745681296.085691   66006 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745681296.089983   66006 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745681296.098867   66006 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Load dataset
df = pd.read_csv('dublin_connolly_clean_with_history.csv')

# Define previous station columns
prev_stations_num = 10
prev_stations = [f'prev_station_{i}' for i in range(1, prev_stations_num+1)]
prev_delays = [f'prev_delay_{i}' for i in range(1, prev_stations_num+1)]

# Label Encoding for stations and train origins/destinations
station_encoder = LabelEncoder()
origin_encoder = LabelEncoder()
destination_encoder = LabelEncoder()

# Fit station encoder across all prev_station columns
all_stations = pd.concat([df[col] for col in prev_stations]).dropna().unique()
station_encoder.fit(all_stations)

for col in prev_stations:
    df[col] = df[col].map(lambda x: station_encoder.transform([x])[0] if pd.notnull(x) else 0)

df['TrainOrigin'] = origin_encoder.fit_transform(df['TrainOrigin'])
df['TrainDestination'] = destination_encoder.fit_transform(df['TrainDestination'])

# Features and target
X = {
    "TrainOrigin": df['TrainOrigin'].values,
    "TrainDestination": df['TrainDestination'].values,
    "scheduled_hour": df['scheduled_hour'].values,
    "day_of_week": df['day_of_week'].values,
    "month_of_year": df['month_of_year'].values,
    "PrevStations": df[prev_stations].values,
    "PrevDelays": df[prev_delays].values,
}
y = df['delay_minutes'].values


In [4]:
# Train/Test split
split_indices = int(0.8 * len(y))  # 80% for train

X_train = {k: v[:split_indices] for k, v in X.items()}
X_test = {k: v[split_indices:] for k, v in X.items()}

y_train = y[:split_indices]
y_test = y[split_indices:]

In [5]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [8]:
from tensorflow.keras import layers, Model

# Inputs
prev_station_input = layers.Input(shape=(10,), dtype="int32", name="PrevStations")
prev_delay_input = layers.Input(shape=(10,), dtype="float32", name="PrevDelays")

# Embedding for station IDs
station_embedded = layers.Embedding(
    input_dim=len(station_encoder.classes_) + 1,  # +1 for unknown
    output_dim=16,
    name="StationEmbedding"
)(prev_station_input)

# Reshape delays
prev_delay_reshaped = layers.Reshape((10, 1), name="DelayReshape")(prev_delay_input)

# Concatenate station embeddings and delay features
station_delay_concat = layers.Concatenate(axis=-1, name="StationDelayConcat")([
    station_embedded, prev_delay_reshaped
])

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Transformer layer
transformer_block = TransformerBlock(embed_dim=17, num_heads=4, ff_dim=64)
x = transformer_block(station_delay_concat)

# Global average pooling
x = layers.GlobalAveragePooling1D(name="GlobalAvgPool")(x)

# Fully connected layers
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.3)(x)

# Final output: 1 number (delay in minutes)
output = layers.Dense(1, name="Output")(x)

# Model
model = Model(
    inputs=[prev_station_input, prev_delay_input],
    outputs=output,
    name="TrainDelayTransformer"
)

model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"]
)

model.summary()


In [None]:
history = model.fit(
    {"PrevStations": X_train_prev_stations, "PrevDelays": X_train_prev_delays},
    y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=50
)

In [9]:
# 📚 Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 📖 Load Data
df = pd.read_csv("dublin_connolly_clean_with_history.csv")

# 🔧 Basic Preprocessing
prev_stations_num = 10
prev_station_cols = [f'prev_station_{i}' for i in range(1, prev_stations_num+1)]
prev_delay_cols = [f'prev_delay_{i}' for i in range(1, prev_stations_num+1)]

# Encode station names
station_encoder = LabelEncoder()
all_station_names = pd.concat([df[col] for col in prev_station_cols]).dropna().unique()
station_encoder.fit(all_station_names)

for col in prev_station_cols:
    df[col] = df[col].map(lambda x: station_encoder.transform([x])[0] if pd.notna(x) else 0)

# Fill missing delays with 0
df[prev_delay_cols] = df[prev_delay_cols].fillna(0.0)

# Prepare features
X_prev_stations = df[prev_station_cols].astype(np.int32).values
X_prev_delays = df[prev_delay_cols].astype(np.float32).values
y = df['delay_minutes'].values

# 🔀 Train/Test Split
X_train_stations, X_test_stations, X_train_delays, X_test_delays, y_train, y_test = train_test_split(
    X_prev_stations, X_prev_delays, y, test_size=0.2, random_state=42
)

# 🧱 Build Transformer Block
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# 🏗️ Build the Model
# Inputs
prev_station_input = layers.Input(shape=(10,), dtype="int32", name="PrevStations")
prev_delay_input = layers.Input(shape=(10,), dtype="float32", name="PrevDelays")

# Station embedding
station_embedded = layers.Embedding(
    input_dim=len(station_encoder.classes_) + 1,
    output_dim=16,
    name="StationEmbedding"
)(prev_station_input)

# Reshape delay input
prev_delay_reshaped = layers.Reshape((10, 1), name="DelayReshape")(prev_delay_input)

# Concatenate embedding and delay
station_delay_concat = layers.Concatenate(axis=-1, name="StationDelayConcat")([
    station_embedded, prev_delay_reshaped
])

# Transformer encoder
transformer_block = TransformerBlock(embed_dim=17, num_heads=4, ff_dim=64)
x = transformer_block(station_delay_concat)

# Pool and Dense Layers
x = layers.GlobalAveragePooling1D(name="GlobalAvgPool")(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.3)(x)

# Output
output = layers.Dense(1, name="Output")(x)

# Final Model
model = Model(inputs=[prev_station_input, prev_delay_input], outputs=output)

# ⚙️ Compile
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"]
)

model.summary()


In [10]:
# Train the model
history = model.fit(
    {"PrevStations": X_train_stations, "PrevDelays": X_train_delays},
    y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=50,
    verbose=1
)


Epoch 1/50


I0000 00:00:1745681683.699173   66746 service.cc:152] XLA service 0x770b74019690 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745681683.699187   66746 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2025-04-26 16:34:43.746346: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1745681684.051410   66746 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 87/335[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - loss: 16.4515 - mae: 2.4031

I0000 00:00:1745681686.219031   66746 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 18.0848 - mae: 2.1342




[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - loss: 18.1068 - mae: 2.1337 - val_loss: 7.3839 - val_mae: 1.5296
Epoch 2/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 16.9365 - mae: 1.7286 - val_loss: 6.4271 - val_mae: 1.4015
Epoch 3/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13.2743 - mae: 1.6571 - val_loss: 6.6313 - val_mae: 1.4848
Epoch 4/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 34.7462 - mae: 1.7364 - val_loss: 7.5469 - val_mae: 1.4516
Epoch 5/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 33.2570 - mae: 1.7074 - val_loss: 6.4540 - val_mae: 1.4161
Epoch 6/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 31.7900 - mae: 1.6823 - val_loss: 6.6388 - val_mae: 1.4198
Epoch 7/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss

In [11]:
# Evaluate on test set
test_results = model.evaluate(
    {"PrevStations": X_test_stations, "PrevDelays": X_test_delays},
    y_test,
    verbose=1
)

print(f"Test MAE: {test_results[1]:.2f} minutes")


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 4.9770 - mae: 1.3541
Test MAE: 1.36 minutes


In [12]:
# 📚 Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 📖 Load Data
df = pd.read_csv("dublin_connolly_clean_with_history.csv")

# 🔧 Basic Preprocessing
prev_stations_num = 10
prev_station_cols = [f'prev_station_{i}' for i in range(1, prev_stations_num+1)]
prev_delay_cols = [f'prev_delay_{i}' for i in range(1, prev_stations_num+1)]

# Encode station names
station_encoder = LabelEncoder()
all_station_names = pd.concat([df[col] for col in prev_station_cols]).dropna().unique()
station_encoder.fit(all_station_names)

for col in prev_station_cols:
    df[col] = df[col].map(lambda x: station_encoder.transform([x])[0] if pd.notna(x) else 0)

# Fill missing delays with 0
df[prev_delay_cols] = df[prev_delay_cols].fillna(0.0)

# Prepare features
X_prev_stations = df[prev_station_cols].astype(np.int32).values
X_prev_delays = df[prev_delay_cols].astype(np.float32).values
y = df['delay_minutes'].values

# 🔀 Train/Test Split
X_train_stations, X_test_stations, X_train_delays, X_test_delays, y_train, y_test = train_test_split(
    X_prev_stations, X_prev_delays, y, test_size=0.2, random_state=42
)

# 🧱 Positional Encoding Layer
class PositionalEncoding(layers.Layer):
    def __init__(self, sequence_length, d_model):
        super().__init__()
        self.pos_encoding = self.positional_encoding(sequence_length, d_model)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'pos_encoding': self.pos_encoding,
        })
        return config

    def positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(
            np.arange(position)[:, np.newaxis],
            np.arange(d_model)[np.newaxis, :],
            d_model
        )
        # apply sin to even indices
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        # apply cos to odd indices
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
        return pos * angle_rates

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

# 🧱 Transformer Encoder Block
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# 🏗️ Build the Model
# Inputs
prev_station_input = layers.Input(shape=(10,), dtype="int32", name="PrevStations")
prev_delay_input = layers.Input(shape=(10,), dtype="float32", name="PrevDelays")

# Station embedding
station_embedded = layers.Embedding(
    input_dim=len(station_encoder.classes_) + 1,
    output_dim=32,
    name="StationEmbedding"
)(prev_station_input)

# Reshape delay input
prev_delay_reshaped = layers.Reshape((10, 1), name="DelayReshape")(prev_delay_input)

# Concatenate embeddings and delays
x = layers.Concatenate(axis=-1)([station_embedded, prev_delay_reshaped])

# Project to common dimension
x = layers.Dense(64, activation="relu")(x)

# Add Positional Encoding
x = PositionalEncoding(sequence_length=10, d_model=64)(x)

# Transformer Encoder Layers
transformer_block = TransformerBlock(embed_dim=64, num_heads=4, ff_dim=128)
x = transformer_block(x)

# Optional: stack another transformer block for deeper learning
x = transformer_block(x)

# Pooling
x = layers.GlobalAveragePooling1D()(x)

# Fully connected layers
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.3)(x)

# Output
output = layers.Dense(1, name="Output")(x)

# Final Model
model = Model(inputs=[prev_station_input, prev_delay_input], outputs=output)

# ⚙️ Compile
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=["mae"]
)

model.summary()


In [15]:
# Train
history = model.fit(
    {"PrevStations": X_train_stations, "PrevDelays": X_train_delays},
    y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=200,
    verbose=1
)


Epoch 1/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 31.9551 - mae: 2.1365 - val_loss: 10.8304 - val_mae: 1.9930
Epoch 2/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 12.1725 - mae: 2.0156 - val_loss: 10.0142 - val_mae: 1.8960
Epoch 3/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 9.5338 - mae: 1.8908 - val_loss: 10.0140 - val_mae: 1.9522
Epoch 4/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 10.6683 - mae: 1.9660 - val_loss: 10.0558 - val_mae: 1.8647
Epoch 5/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 13.1770 - mae: 1.9910 - val_loss: 10.5808 - val_mae: 1.9879
Epoch 6/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 83.2197 - mae: 2.2931 - val_loss: 10.9107 - val_mae: 1.9301
Epoch 7/200
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [16]:
# Evaluate
test_results = model.evaluate(
    {"PrevStations": X_test_stations, "PrevDelays": X_test_delays},
    y_test,
    verbose=1
)

print(f"Test MAE: {test_results[1]:.2f} minutes")


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9.6376 - mae: 1.8700
Test MAE: 1.87 minutes
