## Neural Network for Bus ETA Calculation

This neural network is designed to calculate the Estimated Time of Arrival (ETA) for buses. It takes the following inputs:

- **Route ID**: The unique identifier for the bus route.
- **Starting Stop**: The bus stop where the journey begins.
- **Destination Stop**: The bus stop where the journey ends.
- **Weather Data**: Including precipitation, temperature, etc.
- **Time of Day**: The specific time when the journey starts.
- **Day of the Week**: The day on which the journey takes place.

Using these inputs, the neural network processes the data and outputs the duration of the journey in minutes and seconds. This helps in providing accurate and reliable ETA for bus passengers.

In [28]:
# Install necessary packages if not already installed
#!pip install pandas torch scikit-learn

# Import required libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [29]:
# Load data using pandas
df = pd.read_parquet("../data/mega_df.parquet")

# Display the first few rows
df.head()

Unnamed: 0,routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,...,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,timeDiff_n_stops_ahead,hour_of_day,minute_of_hour,day_of_week
0,2L,3,Old_1501,329.0,41.93639,-88.7736,0,0,33,0,...,True,True,0 days 00:01:00,2024-09-01 00:05:21,465.0,Stadium Dr. at New Hall,0 days 00:00:40,0,4,7
1,2L,3,1103,330.0,41.93639,-88.77435,0,0,0,0,...,True,True,0 days 00:00:50,2024-09-01 00:35:16,465.0,Stadium Dr. at New Hall,0 days 00:00:41,0,34,7
2,2L,3,1103,331.0,41.93638,-88.77397,0,0,8,0,...,True,True,0 days 00:01:00,2024-09-01 01:04:25,465.0,Stadium Dr. at New Hall,0 days 00:00:51,1,3,7
3,2L,3,1103,332.0,41.93639,-88.77435,0,0,18,0,...,True,True,0 days 00:00:50,2024-09-01 01:34:37,465.0,Stadium Dr. at New Hall,0 days 00:00:51,1,33,7
4,2L,3,1103,338.0,41.9364,-88.77528,0,0,31,0,...,True,True,0 days 00:00:50,2024-09-01 02:04:35,465.0,Stadium Dr. at New Hall,0 days 00:00:30,2,4,7


In [30]:
# Encode categorical features using LabelEncoder
label_encoders = {}
categorical_features = ['routeID', 'lastStopID', 'nextStopID_actual', 'day_of_week', 'hour_of_day', 'minute_of_hour']

for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store the encoders for future decoding

# Display transformed data
df.head()


Unnamed: 0,routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,...,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,timeDiff_n_stops_ahead,hour_of_day,minute_of_hour,day_of_week
0,9,3,Old_1501,329.0,41.93639,-88.7736,0,0,33,0,...,True,True,0 days 00:01:00,2024-09-01 00:05:21,30,Stadium Dr. at New Hall,0 days 00:00:40,0,4,6
1,9,3,1103,330.0,41.93639,-88.77435,0,0,0,0,...,True,True,0 days 00:00:50,2024-09-01 00:35:16,30,Stadium Dr. at New Hall,0 days 00:00:41,0,34,6
2,9,3,1103,331.0,41.93638,-88.77397,0,0,8,0,...,True,True,0 days 00:01:00,2024-09-01 01:04:25,30,Stadium Dr. at New Hall,0 days 00:00:51,1,3,6
3,9,3,1103,332.0,41.93639,-88.77435,0,0,18,0,...,True,True,0 days 00:00:50,2024-09-01 01:34:37,30,Stadium Dr. at New Hall,0 days 00:00:51,1,33,6
4,9,3,1103,338.0,41.9364,-88.77528,0,0,31,0,...,True,True,0 days 00:00:50,2024-09-01 02:04:35,30,Stadium Dr. at New Hall,0 days 00:00:30,2,4,6


In [32]:
# Normalize numerical features using StandardScaler
numerical_features = ['day_of_week', 'hour_of_day', 'minute_of_hour']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Display normalized data
df.head()

Unnamed: 0,routeID,patternID,equipmentID,tripID,lat,lng,load,capacity,eLoad,blockID,...,stopChanged_right,nextToLast_right,timeDiff_right,receiveTime_right,nextStopID_actual,nextStopName_actual,timeDiff_n_stops_ahead,hour_of_day,minute_of_hour,day_of_week
0,9,3,Old_1501,329.0,41.93639,-88.7736,0,0,33,0,...,True,True,0 days 00:01:00,2024-09-01 00:05:21,30,Stadium Dr. at New Hall,0 days 00:00:40,-2.113328,-1.441737,1.731357
1,9,3,1103,330.0,41.93639,-88.77435,0,0,0,0,...,True,True,0 days 00:00:50,2024-09-01 00:35:16,30,Stadium Dr. at New Hall,0 days 00:00:41,-2.113328,0.364728,1.731357
2,9,3,1103,331.0,41.93638,-88.77397,0,0,8,0,...,True,True,0 days 00:01:00,2024-09-01 01:04:25,30,Stadium Dr. at New Hall,0 days 00:00:51,-1.963695,-1.501952,1.731357
3,9,3,1103,332.0,41.93639,-88.77435,0,0,18,0,...,True,True,0 days 00:00:50,2024-09-01 01:34:37,30,Stadium Dr. at New Hall,0 days 00:00:51,-1.963695,0.304512,1.731357
4,9,3,1103,338.0,41.9364,-88.77528,0,0,31,0,...,True,True,0 days 00:00:50,2024-09-01 02:04:35,30,Stadium Dr. at New Hall,0 days 00:00:30,-1.814061,-1.441737,1.731357


In [34]:
# Define feature matrix (X) and target vector (y)
X = df[categorical_features + numerical_features].values
y = df['timeDiff_n_stops_ahead'].values.reshape(-1, 1)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Print tensor shapes
print(f"Feature tensor shape: {X_tensor.shape}")
print(f"Target tensor shape: {y_tensor.shape}")

TypeError: can't convert np.ndarray of type numpy.timedelta64. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
# Define a PyTorch dataset class
class ETADataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create dataset
dataset = ETADataset(X_tensor, y_tensor)

# Print dataset length
print(f"Dataset size: {len(dataset)}")

In [None]:
# Define train-validation split
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Perform split
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

In [None]:
# Define the ETA prediction neural network
class ETANetwork(nn.Module):
    def __init__(self, input_size):
        super(ETANetwork, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    
    def forward(self, x):
        return self.model(x)

# Initialize model
model = ETANetwork(input_size=X.shape[1])

# Print model architecture
print(model)

In [None]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                predictions = model(X_batch)
                loss = criterion(predictions, y_batch)
                val_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)

In [None]:
# Example: Make predictions on validation set
model.eval()
with torch.no_grad():
    sample_X, sample_y = next(iter(val_loader))
    predictions = model(sample_X)

# Convert to NumPy for easy viewing
predictions = predictions.numpy()
sample_y = sample_y.numpy()

# Print first few predictions vs actual values
for i in range(5):
    print(f"Predicted ETA: {predictions[i][0]:.2f}, Actual ETA: {sample_y[i][0]:.2f}")