In [1]:
import torch
import torch.nn as nn
from sklearn.cluster import KMeans
import pandas as pd

import ast
import matplotlib.pyplot as plt
from metrics import haversine


In [2]:
class TaxiMLP(nn.Module):
    """Perceptron with one hidden layer that emits one score per cluster.

    Args:
        input_size: Size of the last dimension of the input tensor.
        hidden_size: Width of the hidden layer.
        num_clusters: Number of output scores.
    """

    def __init__(self, input_size, hidden_size=500, num_clusters=3392):
        super().__init__()
        # Layer attribute names are part of the state_dict keys; keep them.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_clusters)

    def forward(self, x):
        """Return the raw (un-normalised) cluster scores for ``x``."""
        return self.fc2(self.relu(self.fc1(x)))


In [3]:
def equirectangular_distance(pred, true, R = 6371, degrees = False):
    """Equirectangular approximation of the distance between coordinate pairs.

    Args:
        pred: Tensor whose last dimension holds ``(longitude, latitude)``.
        true: Tensor with the same layout, broadcastable against ``pred``.
        R: Sphere radius; the result is expressed in the same unit
            (6371 corresponds to Earth's mean radius in kilometres).
        degrees: When True the coordinates are converted from degrees to
            radians first. The default (False) uses the values as given,
            i.e. they must already be in radians for the cosine to be right.

    Returns:
        Tensor of distances shaped like the inputs without their last dimension.
    """
    if degrees:
        pred, true = torch.deg2rad(pred), torch.deg2rad(true)
    # Index the last dimension with an ellipsis so that a single point (1-D),
    # a batch (2-D) or a batch of sequences (3-D) are all handled; `[:, k]`
    # would pick along dimension 1 and give wrong results beyond 2-D.
    lambda_pred, phi_pred = pred[..., 0], pred[..., 1]
    lambda_true, phi_true = true[..., 0], true[..., 1]
    # East-west separation is scaled by the cosine of the mean latitude.
    x = (lambda_true - lambda_pred) * torch.cos((phi_true + phi_pred) / 2)
    y = phi_true - phi_pred
    return R * torch.sqrt(x ** 2 + y ** 2)

In [4]:
def compute_destination(output, cluster_centroids):
    """Combine centroids using ``output`` as per-cluster weights.

    Returns the matrix product of ``output`` (last dimension: one weight per
    cluster) with ``cluster_centroids`` (one row per cluster).
    """
    destination = output @ cluster_centroids
    return destination

In [5]:
def evaluate_predictions(pred_clusters, true_clusters, cluster_centroids):
    """Mean haversine distance between predicted and true cluster centroids.

    Both index arguments are used to look up rows of ``cluster_centroids``;
    the resulting coordinate pairs are passed to ``haversine`` and the
    returned distances are averaged.
    """
    return haversine(
        cluster_centroids[pred_clusters],
        cluster_centroids[true_clusters],
    ).mean()

In [None]:
# History of the per-epoch training metric, filled by the training loop.
train_losses = []
# Cleaned trip table that the clustering cells below work on.
df = pd.read_csv('train_clean.csv')

In [None]:
# K-means model for grouping trip destinations; the seed is fixed so the
# clustering is repeatable.
kmeans = KMeans(
    n_clusters=3392,
    random_state=42,
)

In [None]:
# Parse the END strings into Python objects. The isinstance guard (same form
# as the POLYLINE parsing further down) makes the cell safe to re-run: values
# that are already parsed are passed through instead of being handed to
# ast.literal_eval, which rejects non-string input.
df['END'] = df['END'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
# Two-column frame of destination coordinates, one row per trip, used as the
# clustering input.
destination_coordinates = pd.DataFrame(
    df['END'].tolist(),
    columns=['longitude', 'latitude'],
)

In [None]:
# Fit K-means on the destination coordinates and store each trip's cluster
# index in a new CLUSTER column.
df['CLUSTER'] = kmeans.fit_predict(destination_coordinates)

In [None]:
# Fitted centroids as a float tensor (one row per cluster), in the column
# order of destination_coordinates.
cluster_centers = torch.tensor(
    kmeans.cluster_centers_,
    dtype=torch.float32,
)

In [None]:

# Number of clusters configured on the K-means model.
num_clusters = kmeans.n_clusters

In [None]:
# Encode the category letters as integers. The result is assigned back rather
# than calling replace(..., inplace=True) on df[...]: the in-place form acts
# on an intermediate Series and, with pandas Copy-on-Write, leaves df as is.
df['CALL_TYPE'] = df['CALL_TYPE'].replace({'A': 1, 'B': 2, 'C': 3})
df['DAY_TYPE'] = df['DAY_TYPE'].replace({'A': 1, 'B': 2, 'C': 3})

In [None]:
# Drop the columns that the later cells shown here do not read.
df = df.drop(columns=['END', 'N_POINTS', 'START', 'TRIP_ID'])
# Persist the clustered table: a later cell reloads 'train_clean_clustered.csv'
# and no other visible cell writes that file, so a fresh run would fail there.
df.to_csv('train_clean_clustered.csv', index=False)

In [6]:

# Reload the clustered trip table; the cells below read its POLYLINE, CLUSTER
# and trip-level feature columns.
df = pd.read_csv('train_clean_clustered.csv')

In [7]:
# Turn the POLYLINE strings into Python objects; anything that is not a
# string (e.g. already parsed) is kept unchanged.
df['POLYLINE'] = df['POLYLINE'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [8]:
# Remove trips with an empty polyline from df itself, not only from the list:
# the extra-feature and target tensors are built later from all rows of df, so
# filtering just the list would leave them misaligned with the trajectories.
# The index is reset so that positions and labels agree after the filter.
df = df[[bool(pol) for pol in df['POLYLINE']]].reset_index(drop=True)
# One float tensor per remaining trip, built from its list of points.
polylines_tensors = [torch.tensor(pol, dtype=torch.float) for pol in df['POLYLINE']]

In [9]:
from torch.nn.utils.rnn import pad_sequence

# Number of points in the longest trajectory.
max_length = max(pol.shape[0] for pol in polylines_tensors)
# Stack all trajectories into one batch-first tensor; shorter ones are padded.
inputs = pad_sequence(
    polylines_tensors,
    batch_first=True,
)

In [10]:
# Trip-level features as a float tensor; missing values are replaced by 0.
additional_features = torch.tensor(
    df[['CALL_TYPE', 'DAY_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND']].fillna(0).values,
    dtype=torch.float,
)

In [11]:
# Insert a length-1 dimension at position 1 so the features can be repeated
# along the sequence axis in the next step.
additional_features = torch.unsqueeze(additional_features, 1)

In [None]:
# Copy the trip-level features to every position of the padded sequence
# (dimension 1 of `inputs`).
additional_features = additional_features.repeat(1, inputs.size(1), 1)

In [None]:
# Save the repeated trip-level feature tensor to disk.
torch.save(additional_features, 'nonseq.pt')

In [None]:
# Append the trip-level features to each point along the feature dimension.
inputs = torch.cat(
    [inputs, additional_features],
    dim=2,
)

In [None]:
# Cluster indices as a LongTensor. The underlying NumPy array is passed (as is
# done for the additional features) instead of the Series itself: handing a
# Series to torch.tensor converts it element by element, which is slow and
# looks values up by index label rather than by position.
targets = torch.tensor(df['CLUSTER'].values, dtype=torch.long)

In [None]:
# Save the target cluster indices to disk.
torch.save(targets, 'targets.pt')

In [None]:
# Save the assembled input tensor to disk.
torch.save(inputs, 'inputs.pt')

In [None]:
from torch.utils.data import TensorDataset
# Pair every row of `inputs` with the matching entry of `targets`.
dataset = TensorDataset(inputs, targets)

In [None]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 64 trips.
data_loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
)

In [None]:
# 6 input features per trajectory point: 2 coordinates plus the 4 trip-level
# columns concatenated above.
model = TaxiMLP(input_size=6, num_clusters=3392)
# Plain SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
)


In [None]:
# Train for 5 epochs. The optimised loss is the equirectangular distance; the
# value that is logged and plotted is the haversine distance from `metrics`.
for epoch in range(5):
    epoch_loss_hav = 0
    # Distinct loop names: reusing `inputs`/`targets` here would overwrite the
    # full notebook-level tensors built in the earlier cells.
    for batch_inputs, batch_targets in data_loader:
        outputs = model(batch_inputs)  # one score vector per trajectory point
        # The model returns raw scores; softmax turns them into weights that
        # sum to 1, so every predicted point is a weighted average of centroids.
        weights = torch.softmax(outputs, dim=-1)
        step_destinations = compute_destination(weights, cluster_centers)

        # pad_sequence filled missing points with zeros, so a point whose two
        # coordinates are both 0 is treated as padding and left out of the mean.
        # NOTE(review): averaging the per-point predictions into one destination
        # per trip is an assumption about the intended pooling - confirm.
        valid = (batch_inputs[..., :2] != 0).any(dim=-1, keepdim=True)
        valid = valid.to(step_destinations.dtype)
        destinations = (step_destinations * valid).sum(dim=1) / valid.sum(dim=1).clamp(min=1)

        # Targets are cluster indices; the distance functions need coordinates,
        # so look up the centroid of each target cluster.
        target_coords = cluster_centers[batch_targets]
        # NOTE(review): equirectangular_distance feeds the latitudes to cos()
        # directly, i.e. it expects radians - confirm the unit of the centroids.
        # backward() requires a scalar, hence the mean over the batch.
        loss = equirectangular_distance(destinations, target_coords).mean()

        loss_haversine = haversine(
            destinations.detach().cpu().numpy(),
            target_coords.detach().cpu().numpy(),
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # haversine may return one value per trip (evaluate_predictions calls
        # .mean() on its result), so reduce before converting to a float.
        epoch_loss_hav += loss_haversine.mean().item()

    print('epoch: ', epoch)
    print('loss: ', epoch_loss_hav)
    train_losses.append(epoch_loss_hav / len(data_loader))

# Plot the recorded per-epoch training metric.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Train Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training Loss Over Time (Haversine Distance)')
ax.legend()
plt.show()