In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Load cleaned data
df = pd.read_csv('dublin_connolly_clean_with_history.csv')

prev_stations_num = 10
prev_stations = [f'prev_station_{i}' for i in range(1, prev_stations_num+1)]

# Encode categorical features
categorical_cols = ['TrainOrigin', 'TrainDestination'] + prev_stations
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[categorical_cols])

encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))
df_final = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Define features and target
X = df_final.drop(columns=['delay_minutes'])
y = df_final['delay_minutes']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation
def evaluate(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f"📊 {model_name} Results:")
    print(f"   MAE : {mae:.2f} minutes")
    print(f"   RMSE: {rmse:.2f} minutes\n")

evaluate(y_test, y_pred_lr, "Linear Regression")
evaluate(y_test, y_pred_rf, "Random Forest Regressor")


📊 Linear Regression Results:
   MAE : 1.31 minutes
   RMSE: 2.68 minutes

📊 Random Forest Regressor Results:
   MAE : 1.16 minutes
   RMSE: 2.15 minutes



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

# Load the cleaned dataset
df = pd.read_csv('dublin_connolly_clean_with_history.csv')

prev_stations_num = 10
prev_stations = [f'prev_station_{i}' for i in range(1, prev_stations_num+1)]

# Encode categorical features
categorical_cols = ['TrainOrigin', 'TrainDestination'] + prev_stations
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[categorical_cols])

encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))
df_final = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Define features and target
X = df_final.drop(columns=['delay_minutes'])
y = df_final['delay_minutes']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Reshape input for RNN: [samples, time steps, features]
# Here we assume each sample is one timestep (you could adapt for sequences if needed)
X_train_rnn = np.expand_dims(X_train, axis=1)  # Shape: (samples, 1, features)
X_test_rnn = np.expand_dims(X_test, axis=1)

# Build RNN model
model = Sequential([
    SimpleRNN(64, activation='tanh', input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit(X_train_rnn, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Predict on the test set
y_pred_rnn = model.predict(X_test_rnn).flatten()

# Evaluation function
def evaluate(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f"📊 {model_name} Results:")
    print(f"   MAE : {mae:.2f} minutes")
    print(f"   RMSE: {rmse:.2f} minutes\n")

# Evaluate RNN
evaluate(y_test, y_pred_rnn, "Recurrent Neural Network (RNN)")


Epoch 1/50


  super().__init__(**kwargs)


[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 52.7274 - val_loss: 8.1231
Epoch 2/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 21.6189 - val_loss: 6.6613
Epoch 3/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 19.3167 - val_loss: 6.6667
Epoch 4/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.1450 - val_loss: 6.2563
Epoch 5/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 12.8019 - val_loss: 5.7192
Epoch 6/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 31.5439 - val_loss: 5.5445
Epoch 7/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6.5268 - val_loss: 5.0341
Epoch 8/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.3440 - val_loss: 4.7758
Epoch 9/50
[1m335/335[0m [32m━━━━━━━━━━━━

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load the cleaned dataset
df = pd.read_csv('dublin_connolly_clean_with_history.csv')

prev_stations_num = 10
prev_stations = [f'prev_station_{i}' for i in range(1, prev_stations_num+1)]

# Encode categorical features
categorical_cols = ['TrainOrigin', 'TrainDestination'] + prev_stations
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[categorical_cols])

encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))
df_final = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Define features and target
X = df_final.drop(columns=['delay_minutes'])
y = df_final['delay_minutes']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Reshape input for LSTM: [samples, time steps, features]
X_train_lstm = np.expand_dims(X_train, axis=1)  # One time step per sample
X_test_lstm = np.expand_dims(X_test, axis=1)

# Build LSTM model
model = Sequential([
    LSTM(64, activation='tanh', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit(X_train_lstm, y_train, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Predict on the test set
y_pred_lstm = model.predict(X_test_lstm).flatten()

# Evaluation function
def evaluate(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f"📊 {model_name} Results:")
    print(f"   MAE : {mae:.2f} minutes")
    print(f"   RMSE: {rmse:.2f} minutes\n")

# Evaluate LSTM
evaluate(y_test, y_pred_lstm, "Long Short-Term Memory (LSTM)")


Epoch 1/50


  super().__init__(**kwargs)


[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 23.1736 - val_loss: 7.0057
Epoch 2/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 21.7590 - val_loss: 6.4092
Epoch 3/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 21.7703 - val_loss: 6.3203
Epoch 4/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 23.1293 - val_loss: 6.0988
Epoch 5/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 16.5447 - val_loss: 6.2140
Epoch 6/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.5547 - val_loss: 5.9176
Epoch 7/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.7983 - val_loss: 5.9144
Epoch 8/50
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 27.9917 - val_loss: 5.9328
Epoch 9/50
[1m335/335[0m [32m━━━━━━━━━━━

In [15]:
df

Unnamed: 0,TrainOrigin,TrainDestination,scheduled_hour,day_of_week,prev_station_1,prev_delay_1,prev_station_2,prev_delay_2,prev_station_3,prev_delay_3,...,prev_delay_6,prev_station_7,prev_delay_7,prev_station_8,prev_delay_8,prev_station_9,prev_delay_9,prev_station_10,prev_delay_10,delay_minutes
0,Dublin Connolly,Belfast,8,1,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,,0.0,,0.0,5.2
1,Dublin Connolly,Belfast,8,5,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,,0.0,,0.0,1.1
2,Dublin Connolly,Belfast,8,5,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,,0.0,,0.0,0.3
3,Dublin Connolly,Belfast,8,4,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,,0.0,,0.0,4.5
4,Dublin Connolly,Portadown,8,1,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14878,Greystones,Howth,9,1,Tara Street,1.8,Dublin Pearse,1.9,Grand Canal Dock,2.4,...,2.8,Booterstown,2.8,Blackrock,2.2,Seapoint,2.0,Salthill and Monkstown,2.0,2.6
14879,Bray,Howth,9,2,Tara Street,-0.4,Dublin Pearse,-0.1,Grand Canal Dock,0.2,...,0.5,Booterstown,0.5,Blackrock,0.3,Seapoint,0.3,Salthill and Monkstown,0.4,2.1
14880,Bray,Howth,9,4,Tara Street,0.1,Dublin Pearse,0.5,Grand Canal Dock,0.2,...,0.8,Booterstown,1.5,Blackrock,0.8,Seapoint,0.9,Salthill and Monkstown,0.9,0.5
14881,Bray,Howth,9,1,Tara Street,-0.1,Dublin Pearse,0.0,Grand Canal Dock,-0.1,...,0.6,Booterstown,0.9,Blackrock,0.4,Seapoint,0.5,Salthill and Monkstown,0.6,0.7
