## Neural Network for Bus ETA Calculation

This neural network is designed to calculate the Estimated Time of Arrival (ETA) for buses. It takes the following inputs:

- **Route ID**: The unique identifier for the bus route.
- **Starting Stop**: The bus stop where the journey begins.
- **Destination Stop**: The bus stop where the journey ends.
- **Weather Data**: Including precipitation, temperature, etc.
- **Time of Day**: The specific time when the journey starts.
- **Day of the Week**: The day on which the journey takes place.

Using these inputs, the neural network processes the data and outputs the duration of the journey in minutes and seconds. This helps in providing accurate and reliable ETA for bus passengers.

In [152]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [153]:
# Check if the model is running on GPU or CPU
print(torch.cuda.is_available()) 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

True


In [154]:
# Load data using pandas
df = pd.read_parquet("../data/mega_df.parquet")

# Display the first few rows
df.head()

Unnamed: 0,routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,...,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds
0,2L,3,Old_1501,329.0,41.93639,-88.7736,0,0,33,0,...,True,0 days 00:01:00,2024-09-01 00:05:21,465.0,Stadium Dr. at New Hall,0 days 00:00:40,0,4,7,40.0
1,2L,3,1103,330.0,41.93639,-88.77435,0,0,0,0,...,True,0 days 00:00:50,2024-09-01 00:35:16,465.0,Stadium Dr. at New Hall,0 days 00:00:41,0,34,7,41.0
2,2L,3,1103,331.0,41.93638,-88.77397,0,0,8,0,...,True,0 days 00:01:00,2024-09-01 01:04:25,465.0,Stadium Dr. at New Hall,0 days 00:00:51,1,3,7,51.0
3,2L,3,1103,332.0,41.93639,-88.77435,0,0,18,0,...,True,0 days 00:00:50,2024-09-01 01:34:37,465.0,Stadium Dr. at New Hall,0 days 00:00:51,1,33,7,51.0
4,2L,3,1103,338.0,41.9364,-88.77528,0,0,31,0,...,True,0 days 00:00:50,2024-09-01 02:04:35,465.0,Stadium Dr. at New Hall,0 days 00:00:30,2,4,7,30.0


In [155]:
# Drop any missing values
df = df.dropna()

#Check if there are any missing values after dropping them
print(df.isna().any().any()) 

False


In [156]:
# Encode categorical features using LabelEncoder
label_encoders = {}
categorical_features = ['routeID', 'lastStopID', 'nextStopID_actual', 'day_of_week', 'hour_of_day', 'minute_of_hour']

for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the encoders for future decoding

# Display transformed data
df.head()


Unnamed: 0,routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,...,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds
0,7,3,Old_1501,329.0,41.93639,-88.7736,0,0,33,0,...,True,0 days 00:01:00,2024-09-01 00:05:21,30,Stadium Dr. at New Hall,0 days 00:00:40,0,4,6,40.0
1,7,3,1103,330.0,41.93639,-88.77435,0,0,0,0,...,True,0 days 00:00:50,2024-09-01 00:35:16,30,Stadium Dr. at New Hall,0 days 00:00:41,0,34,6,41.0
2,7,3,1103,331.0,41.93638,-88.77397,0,0,8,0,...,True,0 days 00:01:00,2024-09-01 01:04:25,30,Stadium Dr. at New Hall,0 days 00:00:51,1,3,6,51.0
3,7,3,1103,332.0,41.93639,-88.77435,0,0,18,0,...,True,0 days 00:00:50,2024-09-01 01:34:37,30,Stadium Dr. at New Hall,0 days 00:00:51,1,33,6,51.0
4,7,3,1103,338.0,41.9364,-88.77528,0,0,31,0,...,True,0 days 00:00:50,2024-09-01 02:04:35,30,Stadium Dr. at New Hall,0 days 00:00:30,2,4,6,30.0


In [157]:
# Normalize numerical features using MixScalers
numerical_features = ['hour_of_day', 'minute_of_hour']
scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Display normalized data
df.head()

Unnamed: 0,routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,...,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,eta,hour_of_day,minute_of_hour,day_of_week,eta_seconds
0,7,3,Old_1501,329.0,41.93639,-88.7736,0,0,33,0,...,True,0 days 00:01:00,2024-09-01 00:05:21,30,Stadium Dr. at New Hall,0 days 00:00:40,0.0,0.067797,6,40.0
1,7,3,1103,330.0,41.93639,-88.77435,0,0,0,0,...,True,0 days 00:00:50,2024-09-01 00:35:16,30,Stadium Dr. at New Hall,0 days 00:00:41,0.0,0.576271,6,41.0
2,7,3,1103,331.0,41.93638,-88.77397,0,0,8,0,...,True,0 days 00:01:00,2024-09-01 01:04:25,30,Stadium Dr. at New Hall,0 days 00:00:51,0.043478,0.050847,6,51.0
3,7,3,1103,332.0,41.93639,-88.77435,0,0,18,0,...,True,0 days 00:00:50,2024-09-01 01:34:37,30,Stadium Dr. at New Hall,0 days 00:00:51,0.043478,0.559322,6,51.0
4,7,3,1103,338.0,41.9364,-88.77528,0,0,31,0,...,True,0 days 00:00:50,2024-09-01 02:04:35,30,Stadium Dr. at New Hall,0 days 00:00:30,0.086957,0.067797,6,30.0


In [158]:
# Normalize target variable
scaler_y = StandardScaler()  # Use MinMaxScaler() if you prefer [0,1] range
df['eta_seconds'] = scaler_y.fit_transform(df[['eta_seconds']])  # Normalize target

In [159]:
# Define feature matrix (X) and target vector (y)
df = df.dropna(subset=['eta_seconds'])  # Drop rows with NaN in target

# Convert to PyTorch tensors
X_tensor = torch.tensor(df[categorical_features + numerical_features].values, dtype=torch.float32)
y_tensor = torch.tensor(df['eta_seconds'].values, dtype=torch.float32).view(-1, 1)

# Print tensor shapes
print(f"Feature tensor shape: {X_tensor.shape}")
print(f"Target tensor shape: {y_tensor.shape}")

Feature tensor shape: torch.Size([643456, 8])
Target tensor shape: torch.Size([643456, 1])


In [160]:
# Move tensors to GPU if available
X_tensor = X_tensor.to(device)
y_tensor = y_tensor.to(device)

In [161]:
print(torch.isnan(X_tensor).sum())  # Should be 0
print(torch.isinf(X_tensor).sum())  # Should be 0
print(torch.isnan(y_tensor).sum())  # Should be 0
print(torch.isinf(y_tensor).sum())  # Should be 0

tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')


In [162]:
# Define a PyTorch dataset class
class ETADataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create dataset
dataset = ETADataset(X_tensor, y_tensor)

# Print dataset length
print(f"Dataset size: {len(dataset)}")

Dataset size: 643456


In [163]:
# Define train-validation split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Perform split
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

Training samples: 514764, Validation samples: 128692


In [164]:
# Define the ETA prediction neural network
class ETANetwork(nn.Module):
    def __init__(self, input_size):
        super(ETANetwork, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    
    def forward(self, x):
        return self.model(x)

# Initialize model
model = ETANetwork(input_size=X.shape[1])

model.to(device)

# Print model architecture
print(model)

ETANetwork(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [165]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    print("Training Started 🚀")
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)

             # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                predictions = model(X_batch)
                loss = criterion(predictions, y_batch)
                val_loss += loss.item()
        
        
        # Compute averages
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        #Print results for monitoring
        print(f"Epoch {epoch+1}/{epochs} ➝ "
            f"Train Loss: {avg_train_loss:.4f} | "
            f"Val Loss: {avg_val_loss:.4f}")

        # Overfitting warning
        if avg_train_loss < avg_val_loss * 0.9:
            print("⚠️ Warning: Possible overfitting detected!")
            
    print("Training complete ✅")
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5)

Training Started 🚀
Epoch 1/5 ➝ Train Loss: 0.9841 | Val Loss: 0.9640
Training complete ✅
Epoch 2/5 ➝ Train Loss: 0.9824 | Val Loss: 0.9625
Training complete ✅
Epoch 3/5 ➝ Train Loss: 0.9810 | Val Loss: 0.9675
Training complete ✅
Epoch 4/5 ➝ Train Loss: 0.9803 | Val Loss: 0.9614
Training complete ✅


In [167]:
# Example: Make predictions on validation set
model.eval()
with torch.no_grad():
    sample_X, sample_y = next(iter(val_loader))
    predictions = model(sample_X)

# Convert to NumPy for easy viewing
predictions = predictions.cpu().numpy()
sample_y = sample_y.cpu().numpy()

# Print first few predictions vs actual values
for i in range(5):
    print(f"Predicted ETA: {predictions[i][0]:.2f}, Actual ETA: {sample_y[i][0]:.2f}")

Predicted ETA: -0.09, Actual ETA: -0.10
Predicted ETA: -0.02, Actual ETA: -0.10
Predicted ETA: 0.02, Actual ETA: -0.10
Predicted ETA: -0.00, Actual ETA: -0.10
Predicted ETA: -0.02, Actual ETA: -0.10
