In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
# Load the training data
train_data = pd.read_csv('archive/train.csv')
print(len(train_data))

1710670


In [29]:
# Preprocess the data
# Drop irrelevant columns (TRIP_ID, TAXI_ID, etc.)
# Encode categorical variables (CALL_TYPE, DAY_TYPE)
# Handle missing values (ORIGIN_CALL, ORIGIN_STAND)
preprocessed_data = train_data.drop(['TRIP_ID', 'DAY_TYPE', 'MISSING_DATA'], axis=1)

preprocessed_data = pd.get_dummies(preprocessed_data, columns=['CALL_TYPE'])
preprocessed_data['CALL_TYPE'] = preprocessed_data['CALL_TYPE_A'].astype(int) + preprocessed_data['CALL_TYPE_B'].astype(int) * 2 + preprocessed_data['CALL_TYPE_C'].astype(int) * 3
preprocessed_data = preprocessed_data.drop(['CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C'], axis=1)

preprocessed_data['ORIGIN_CALL'].fillna(0, inplace=True)
preprocessed_data['ORIGIN_STAND'].fillna(0, inplace=True)

preprocessed_data['TAXI_ID'] = preprocessed_data['TAXI_ID'] - 20000000  # Normalize taxi IDs

preprocessed_data['TIMESTAMP'] = pd.to_datetime(preprocessed_data['TIMESTAMP'], unit='s')
preprocessed_data['HOUR'] = preprocessed_data['TIMESTAMP'].dt.hour
preprocessed_data['DAY_OF_WEEK'] = preprocessed_data['TIMESTAMP'].dt.dayofweek
preprocessed_data['MONTH'] = preprocessed_data['TIMESTAMP'].dt.month
#preprocessed_data['YEAR'] = preprocessed_data['TIMESTAMP'].dt.year
preprocessed_data = preprocessed_data.drop('TIMESTAMP', axis=1)

X = preprocessed_data.drop(['POLYLINE'], axis=1)
y = train_data['POLYLINE'].str.count(',') * 15  # Calculate travel time in seconds

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [76]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)


In [77]:
print(X_train)

         ORIGIN_CALL  ORIGIN_STAND  TAXI_ID  CALL_TYPE  HOUR  DAY_OF_WEEK  \
8149          6534.0           0.0      198          1    17            1   
825853           0.0           0.0      664          3    17            0   
896733           0.0          27.0      424          2    13            3   
29112            0.0           9.0      492          2    19            5   
767655           0.0           0.0      476          3     4            4   
...              ...           ...      ...        ...   ...          ...   
259178       24662.0           0.0      184          1    18            2   
1414414          0.0          37.0       83          2    10            6   
131932           0.0          25.0      600          2    19            6   
671155           0.0          25.0      451          2    16            4   
121958           0.0           0.0      342          2    14            4   

         MONTH  
8149         7  
825853      12  
896733       1  
29112  

In [73]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)

        out, hidden = self.rnn(x, hidden)
        out = self.fc(out[:, -1, :])

        return out

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)



In [78]:
input_size = X_train_tensor.size(1)
hidden_size = 64
output_size = 1

model = RNNModel(input_size, hidden_size, output_size)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [79]:
num_epochs = 10
batch_size = 64

for epoch in range(num_epochs):
    permutation = torch.randperm(X_train_tensor.size(0))
    for i in range(0, X_train_tensor.size(0), batch_size):
        indices = permutation[i:i+batch_size]
        batch_X, batch_y = X_train_tensor[indices], y_train_tensor[indices]
        batch_X = batch_X.unsqueeze(1)  # Add an extra dimension

        optimizer.zero_grad()

        output = model(batch_X)
        loss = criterion(output.squeeze(), batch_y)

        loss.backward()
        optimizer.step()

    # Calculate validation loss
    with torch.no_grad():
        val_output = model(X_val_tensor)
        val_loss = criterion(val_output.squeeze(), y_val_tensor)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")


RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor

In [65]:
# Convert the validation data to tensors
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1).to(device)

# Evaluate the model on the validation data
model.eval()
with torch.no_grad():
    val_predictions = model(X_val_tensor)
    val_loss = criterion(val_predictions, y_val_tensor)
    print(f"Validation Loss: {val_loss.item()}")

# Convert the predictions back to numpy arrays
val_predictions = val_predictions.cpu().numpy()
print(val_predictions)

# Calculate the root mean squared error (RMSE) on the validation data
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE: {val_rmse}")

Validation Loss: 1868750.0
[[1302.2462]
 [1440.9738]
 [1412.7057]
 ...
 [1265.7263]
 [1395.9257]
 [1355.259 ]]
Validation RMSE: 1367.0224058973524
