In [None]:
from math import radians, sin, cos, sqrt, asin
import json
import math
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim

# Create Input Dataset

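Example of the input dataset layout: one row per city pair, with the distance between the two cities in kilometres.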

| City 1      | City 2      | Distance (km) |
|-------------|-------------|---------------|
| Rome        | Milan       |     485       |
| Naples      | Florence    |     410       |
| Venice      | Turin       |     360       |
| Bologna     | Genoa       |     210       |
| Palermo     | Catania     |     190       |

In [None]:
# Load the city table from disk. The encoding is pinned to UTF-8 so that city
# names containing accented characters decode the same way on every platform
# instead of depending on the locale's default encoding.
# NOTE(review): later cells unpack each value as a (lat, lon) pair and convert
# it from degrees -- confirm against the contents of cities.json.
with open("cities.json", "r", encoding="utf-8") as f:
    cities = json.load(f)

In [None]:
# Number of entries loaded from cities.json.
len(cities)

In [None]:
def haversine(coords1, coords2):
    """Great-circle distance in kilometres between two points on Earth.

    Args:
        coords1: ``(latitude, longitude)`` of the first point, in degrees.
        coords2: ``(latitude, longitude)`` of the second point, in degrees.

    Returns:
        The haversine distance in km, rounded to one decimal place.
    """
    R = 6371.0  # mean Earth radius in km

    lat1, lon1 = coords1
    lat2, lon2 = coords2

    lat1, lon1 = radians(lat1), radians(lon1)
    lat2, lon2 = radians(lat2), radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))

    return round(R * c, 1)

In [None]:
# Haversine distance for every ordered pair of distinct cities, keyed by
# (city1, city2); both directions of a pair are stored as separate entries.
distances = {
    (city1, city2): haversine(coord1, coord2)
    for city1, coord1 in cities.items()
    for city2, coord2 in cities.items()
    if city1 != city2
}

In [None]:
# Spot check: look up one city pair in both directions and display both stored values.
distances[('Milano', 'Roma')], distances[('Roma', 'Milano')]

In [None]:
# Sanity check: the number of stored pairs next to n * (n - 1), the number of
# ordered pairs of distinct cities.
len(distances), len(cities) * (len(cities) - 1)

# Standardize distances (min-max scaling to [0, 1])

In [None]:
# Histogram of all pairwise distances. The dict view is materialised into a
# list so `hist` receives a real sequence, matching how the values are passed
# to NumPy elsewhere in this notebook.
plt.hist(list(distances.values()), bins=100)
plt.xlabel('Distance (km)')
plt.ylabel('Number of ordered city pairs')
plt.show()

In [None]:
def standardize(x, minimum, maximum):
    """Min-max scale ``x`` so that ``minimum`` maps to 0 and ``maximum`` to 1."""
    value_range = maximum - minimum
    offset = x - minimum
    return offset / value_range
    
def inverse(x, minimum, maximum):
    """Map a min-max scaled value back onto the ``[minimum, maximum]`` range."""
    value_range = maximum - minimum
    return minimum + x * value_range

In [None]:
# Extremes of the distance table, kept as plain Python floats because they are
# reused as the scaling parameters below.
distance_array = np.asarray(list(distances.values()))
minimum, maximum = float(distance_array.min()), float(distance_array.max())

# Same keys as `distances`, with every value min-max scaled.
normalized = dict(
    (pair, standardize(km, minimum=minimum, maximum=maximum))
    for pair, km in distances.items()
)

In [None]:
# Histogram of the scaled distances. The dict view is materialised into a list
# so `hist` receives a real sequence.
plt.hist(list(normalized.values()), bins=100)
plt.xlabel('Normalized Distance')
plt.ylabel('Number of ordered city pairs')
plt.show()

In [None]:
# City tokenizer: give each city name a consecutive integer id, following the
# key order of `cities`.
num_cities = len(cities)
city_to_idx = dict(zip(cities, range(num_cities)))

In [None]:
# Display the tokenizer size next to the city count so a mismatch is visible.
len(city_to_idx), num_cities

In [None]:
class DistanceDataset(Dataset):
    """Dataset of ``(city1 index, city2 index, distance)`` samples."""

    def __init__(self, distances, city_to_idx):
        """Translate city names to indices and store one row per pair.

        Args:
            distances: Mapping ``(city1, city2) -> distance``.
            city_to_idx: Mapping from city name to integer index.
        """
        rows = []
        for (origin, destination), value in distances.items():
            rows.append((city_to_idx[origin], city_to_idx[destination], value))
        self.data = rows

    def __len__(self):
        """Number of stored city pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as ``(long, long, float32)`` scalar tensors."""
        first_idx, second_idx, target = self.data[idx]
        first = torch.tensor(first_idx, dtype=torch.long)
        second = torch.tensor(second_idx, dtype=torch.long)
        target = torch.tensor(target, dtype=torch.float32)
        return first, second, target

# Wrap the normalized distances for batched iteration in insertion order.
# NOTE(review): the batch size of 106 is hard-coded -- presumably tied to the
# number of cities; confirm.
dataset = DistanceDataset(distances=normalized, city_to_idx=city_to_idx)
dataloader = DataLoader(dataset=dataset, batch_size=106, shuffle=False)

In [None]:
# Pull only the first batch: the loop binds `batch` and exits on the first iteration.
for batch in dataloader:
    break

In [None]:
# Display the sampled batch (city1 indices, city2 indices, distances).
batch

# Model

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU so
# the notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class GeoEmbedding(nn.Module):
    """Learnable point per city; the model output is the distance between points.

    Args:
        num_cities: Number of rows in the embedding table (one per city id).
        dim: Number of coordinates learned per city (default 2).
    """

    def __init__(self, num_cities, dim=2):
        super().__init__()
        self.coords = nn.Embedding(num_cities, dim)
        # Explicit standard-normal start for the coordinates.
        nn.init.normal_(self.coords.weight, mean=0, std=1)

    def forward(self, city1_id, city2_id):
        """Euclidean distance between the embedded coordinates of two cities.

        Args:
            city1_id: Integer tensor of city indices, any shape.
            city2_id: Integer tensor of city indices, same shape as ``city1_id``.

        Returns:
            Tensor with the same shape as the inputs, holding one distance per
            index pair.
        """
        coords1 = self.coords(city1_id)
        coords2 = self.coords(city2_id)
        # Reduce over the trailing coordinate axis so scalar and multi-dimensional
        # id tensors work as well as the usual 1-D batch.
        return torch.norm(coords1 - coords2, dim=-1)

In [None]:
# Build the embedding model (default dim=2) and move its parameters to `device`.
# NOTE(review): no RNG seed is set in the visible cells, so the random initial
# embedding -- and therefore the learned layout -- will differ between runs.
model = GeoEmbedding(num_cities).to(device)

In [None]:
# Mean-squared error between predicted and target distances, minimised with
# Adam over the model's parameters at a learning rate of 0.01.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Number of full passes over the dataloader in the training loop.
NUM_EPOCHS = 100

In [None]:
# Train the embedding: one optimizer step per batch, with the mean batch loss
# reported on the first epoch and then every tenth epoch.
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0

    for batch_tensors in dataloader:
        # Move the three tensors of the batch onto the training device.
        city1_batch, city2_batch, distance_batch = (
            tensor.to(device) for tensor in batch_tensors
        )

        optimizer.zero_grad()
        predicted_distances = model(city1_batch, city2_batch)
        loss = criterion(predicted_distances, distance_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    is_report_epoch = epoch == 0 or (epoch + 1) % 10 == 0
    if is_report_epoch:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss / len(dataloader):.8f}")

# Results

In [None]:
# Copy the learned embedding table (one row per city) off the device into a NumPy array.
embeddings = model.coords.weight.detach().cpu().numpy()

In [None]:
# Split the embedding into its two columns and rescale each with the same
# min-max inverse that is applied to the distances (km scale).
# NOTE(review): `inverse` also adds `minimum` to every coordinate. That is a
# uniform shift of the whole point cloud, so relative positions are unchanged,
# but it has no geometric meaning for coordinates -- confirm it is intended.
x, y = (
    inverse(embeddings[:, axis], minimum=minimum, maximum=maximum)
    for axis in (0, 1)
)

In [None]:
# Scatter plot of the learned city layout. An equal aspect ratio keeps one unit
# the same length on both axes, so distances between points are not visually
# distorted; labels and a title let the figure stand on its own.
plt.figure(figsize=(10, 8))
plt.scatter(x, y)
plt.xlabel('Embedding dimension 1 (km)')
plt.ylabel('Embedding dimension 2 (km)')
plt.title('Learned 2-D city embedding')
plt.gca().set_aspect('equal', adjustable='datalim')
plt.show()