# Il giro del mondo in 80 giorni

Questo progetto utilizza un dataset contenente le principali città del mondo, con la relativa posizione geografica e altre informazioni, per calcolare il tempo minimo necessario per viaggiare tra due città, il percorso migliore e il percorso più turistico, oltre a offrire altre funzionalità.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geopy import distance as geopy
from dijkstra import DijkstraSPF, Graph
from scipy.spatial import KDTree
from sklearn.neighbors import BallTree
import math
import csv
import time

In [2]:
# Read the raw world-cities CSV into a list of rows (header row first).
data_file = './data/worldcities_ascii.csv'
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; otherwise newlines embedded in quoted fields can be misread.
with open(data_file, 'r', newline='') as f:
    all_lines = list(csv.reader(f))

## Preprocessing

Preprocesso i dati pulendo le virgole in eccesso, correggendo qualche nome e aggiungendo i campi che mi servono.

>TODO: Aggiungere dei campi tipo cose da visitare, prezzo medio ecc...

In [3]:
# Name corrections applied to every field, in this order (old text -> new text).
NAME_FIXES = (
    ("Korea, South", "South Korea"),
    ("Korea, North", "North Korea"),
    ("Gambia, The", "The Gambia"),
    ("Micronesia, Federated States Of", "Federated States Of Micronesia"),
    ("Bahamas, The", "The Bahamas"),
    ("Saint Helena, Ascension, And Tristan Da Cunha", "Saint Helena, Ascension and Tristan da Cunha"),
    ("Islamorada, Village of Islands", "Village of Islands Islamorada"),
)

newLines = []
for line in all_lines:
    newLine = []
    for index, field in enumerate(line):
        if index == 5:
            # Strip the '.' characters from the population values.
            field = field.replace(".", "")
        for old_name, new_name in NAME_FIXES:
            field = field.replace(old_name, new_name)
        if index == 6:
            # Drop any newline left inside the field at index 6.
            field = field.replace("\n", "")
        newLine.append(field)
    newLines.append(newLine)

# Export the cleaned table in case it is useful in other projects.
df = pd.DataFrame(newLines[1:], columns=newLines[0])
# Rows without a population get 0 (null/None would arguably be more correct).
# Assign the result back: an in-place replace on the `df["population"]` selection
# does not reliably modify `df` (it is a no-op under pandas Copy-on-Write).
df["population"] = df["population"].replace({"": 0})
df.to_csv("./data/worldcities_preprocessed.csv", index=False)

In [4]:
# There are too many cities to work with at once, so start from 1000 picked at random.
# NOTE(review): the sample is unseeded, so row numbers differ from run to run.
lineIndexes = np.random.choice(range(1, len(newLines)), size=1000, replace=False)
df = pd.DataFrame([newLines[i] for i in lineIndexes], columns=newLines[0])
# Assign the result back: an in-place replace on the `df["population"]` selection
# does not reliably modify `df` (it is a no-op under pandas Copy-on-Write).
df["population"] = df["population"].replace({"": 0})
df

Unnamed: 0,city,lat,lng,country,iso3,population,id
0,Hindupur,13.83,77.49,India,IND,151677,1356805611
1,Forst,51.7333,14.6333,Germany,DEU,18164,1276307453
2,Nanchong,30.7991,106.0784,China,CHN,6183000,1156762337
3,Ishim,56.1167,69.5,Russia,RUS,65142,1643798395
4,Vrchlabi,50.627,15.6095,Czechia,CZE,12340,1203953730
...,...,...,...,...,...,...,...
995,Leczyca,52.0583,19.2,Poland,POL,15593,1616649790
996,Lewisburg,40.9642,-76.8901,United States,USA,5708,1840003531
997,Dent,39.1922,-84.6593,United States,USA,11378,1840005880
998,Ward,35.0117,-91.9577,United States,USA,5358,1840015504


In [5]:
class CityNode:
    """A city plus the indexes of up to three neighbouring cities.

    The neighbour slots start as ``None`` and are filled in later through
    ``updateNeighbour``.
    """

    def __init__(self, cityID: int, cityName: str, lat: float, lng: float, population: int, countryISO3: str):
        """Store the city's attributes and leave all neighbour slots empty."""
        self.cityID, self.cityName = cityID, cityName
        self.lat, self.lng = lat, lng
        # Same latitude/longitude pair, kept as an array for distance calls.
        self.coordinates = np.array([lat, lng])
        self.population, self.countryISO3 = population, countryISO3
        self.neighbour1 = self.neighbour2 = self.neighbour3 = None

    def updateNeighbour(self, cityIndex: int, position: int):
        """Store ``cityIndex`` in neighbour slot ``position`` (1, 2 or 3).

        Any other ``position`` leaves the node unchanged.
        """
        slot_names = {1: "neighbour1", 2: "neighbour2", 3: "neighbour3"}
        attribute = slot_names.get(position)
        if attribute is not None:
            setattr(self, attribute, cityIndex)

    def __repr__(self):
        """Return the city name, its coordinates and its three neighbour slots."""
        return f"{self.cityName} at {self.lat}, {self.lng} with neighbours: {self.neighbour1}, {self.neighbour2}, {self.neighbour3}"

In [6]:
# One CityNode per row of the sampled frame; id, coordinates and population
# are converted to numbers here.
nodes = [
    CityNode(
        cityID=int(row.id),
        cityName=row.city,
        lat=float(row.lat),
        lng=float(row.lng),
        population=int(row.population),
        countryISO3=row.iso3,
    )
    for _, row in df.iterrows()
]

In [7]:
def updateMin(min1, min2, min3, dist: float, index: int):
    """Insert a candidate into the running list of the three closest entries.

    Each of ``min1``, ``min2`` and ``min3`` is either ``None`` (empty slot) or a
    dict ``{"index": ..., "dist": ...}``; together they hold the closest
    candidates seen so far in ascending distance order.

    The previous implementation copied the first candidate into all three
    slots, so a first candidate that was among the closest was reported two or
    three times and real neighbours were lost. Empty slots are now filled one
    at a time, so each candidate appears at most once.

    Args:
        min1, min2, min3: current best three entries (``None`` when empty).
        dist: distance of the new candidate; only needs to support ``<=``.
        index: identifier of the new candidate.

    Returns:
        The updated ``(min1, min2, min3)`` tuple. Trailing slots stay ``None``
        until at least three candidates have been offered.
    """
    candidate = {"index": index, "dist": dist}
    ranked = [slot for slot in (min1, min2, min3) if slot is not None]

    # Insert before the first entry that is not strictly closer; on ties the
    # newest candidate goes first, matching the original `<=` comparisons.
    insert_at = len(ranked)
    for pos, slot in enumerate(ranked):
        if dist <= slot["dist"]:
            insert_at = pos
            break
    ranked.insert(insert_at, candidate)

    # Keep the best three and pad with None so the result always has 3 items.
    ranked = ranked[:3]
    ranked.extend([None] * (3 - len(ranked)))
    return tuple(ranked)

def calculateTime(start: "CityNode", end: "CityNode", position: int):
    """Return the travel cost of the edge from ``start`` to ``end``.

    The base cost depends on the neighbour rank of ``end`` (1 -> 2, 2 -> 4,
    3 -> 8). 2 is added when the two cities are in different countries, and
    another 2 when the destination has more than 200000 inhabitants.

    Args:
        start: departure city; only ``countryISO3`` is read.
        end: destination city; ``countryISO3`` and ``population`` are read.
        position: neighbour rank of ``end``, one of 1, 2 or 3.

    Returns:
        The edge cost as an int.

    Raises:
        ValueError: if ``position`` is not 1, 2 or 3 (previously this surfaced
            as an ``UnboundLocalError``).
    """
    base_cost_by_position = {1: 2, 2: 4, 3: 8}
    if position not in base_cost_by_position:
        raise ValueError(f"position must be 1, 2 or 3, got {position!r}")

    # Not named `time`, so the imported `time` module is never shadowed.
    travel_time = base_cost_by_position[position]
    if start.countryISO3 != end.countryISO3:
        travel_time += 2
    if end.population > 200000:
        travel_time += 2
    return travel_time

In [23]:
# Brute-force neighbour search: every city is compared with every other city
# (quadratic in the number of cities) and the three best candidates returned
# by updateMin are stored on the node.
start = time.time()
node_count = len(nodes)
for i, origin in enumerate(nodes):
    min1 = min2 = min3 = None
    for j in range(node_count):
        if j != i:
            dist = geopy.distance(origin.coordinates, nodes[j].coordinates)
            min1, min2, min3 = updateMin(min1, min2, min3, dist, j)
    # Slot 1 receives the closest candidate, slot 3 the farthest of the three.
    for rank, best in enumerate((min1, min2, min3), 1):
        origin.updateNeighbour(best["index"], rank)
    # Progress report every 100 cities.
    if (i + 1) % 100 == 0:
        print(f"Execution: {i+1}\t\tTime elapsed (s): {time.time() - start}")

Execution: 100		Time elapsed (s): 47.35646438598633


KeyboardInterrupt: 

In [9]:
# Directed graph with one edge from each city to each of its three stored
# neighbours, weighted by calculateTime with the neighbour's rank (1-3).
graph = Graph()
for index, city in enumerate(nodes):
    ranked_neighbours = (city.neighbour1, city.neighbour2, city.neighbour3)
    for position, neighbour in enumerate(ranked_neighbours, 1):
        graph.add_edge(index, neighbour, calculateTime(city, nodes[neighbour], position))

In [11]:
# Shortest-path costs from one departure city over the brute-force graph.
startNode = 284
dijkstra = DijkstraSPF(graph, startNode)
origin = nodes[startNode]
print(origin.neighbour1, origin.neighbour2, origin.neighbour3)

print("%-5s %-5s" % ("label", "distance"))
# Cities whose distance is infinite are skipped.
reachable = [u for u in range(len(nodes)) if dijkstra.get_distance(u) != float('inf')]
for u in reachable:
    print("%-5s %8s" % (u, dijkstra.get_distance(u)))
# Displayed as the cell output: the path from startNode to node 167.
dijkstra.get_path(167)

967 855 873
label distance
167         18
284          0
331         16
396         14
588         10
707         14
751         16
790         12
855          4
873          6
967          2


[284, 967, 873, 588, 790, 396, 167]

## Mi costruisco il grafo con un balltree

>TODO: Valutare l'uso di mappe interattive per mostrare il risultato, possibilmente integrandole direttamente nella presentazione con RISE o strumenti simili.

In [28]:
# (lat, lng) of every city converted from degrees to radians.
coords = [
    np.array([float(lat), float(lng)]) * math.pi / 180
    for lat, lng in zip(df["lat"], df["lng"])
]

In [29]:
# Spatial index over the radian coordinates, using the haversine metric.
tree = BallTree(coords, leaf_size=3, metric="haversine")
# Example query: the 4 nearest entries to city 284 (distances, then indexes).
query_point = coords[284:285]
tree.query(query_point, k=4)

(array([[0.        , 0.00351384, 0.00353995, 0.0073098 ]]),
 array([[284, 967, 855, 873]], dtype=int64))

In [53]:
def distanceTime(start, end, neighbour_pos: int):
    """Return the travel cost of the edge from ``start`` to ``end``.

    The base cost depends on the 0-based neighbour rank of ``end`` (0 -> 2,
    1 -> 4, 2 -> 8). 2 is added when the two cities have different ``iso3``
    codes, and another 2 when the destination population exceeds 200000.

    Args:
        start: departure city record; only ``iso3`` is read.
        end: destination city record; ``iso3`` and ``population`` are read.
            ``population`` is passed through ``float()`` before comparing.
        neighbour_pos: neighbour rank of ``end``, one of 0, 1 or 2.

    Returns:
        The edge cost as an int.

    Raises:
        ValueError: if ``neighbour_pos`` is not 0, 1 or 2 (previously this
            surfaced as an ``UnboundLocalError``).
    """
    base_cost_by_rank = {0: 2, 1: 4, 2: 8}
    if neighbour_pos not in base_cost_by_rank:
        raise ValueError(f"neighbour_pos must be 0, 1 or 2, got {neighbour_pos!r}")

    # Not named `time`, so the imported `time` module is never shadowed.
    travel_time = base_cost_by_rank[neighbour_pos]
    if start.iso3 != end.iso3:
        travel_time += 2
    if float(end.population) > 200000:
        travel_time += 2
    return travel_time

In [54]:
# Build the graph from the BallTree: each city gets an edge to each of its
# three nearest cities, weighted by distanceTime with the neighbour rank (0-2).
graph = Graph()
# One batched query for all cities; [1] selects the index array
# (row i holds the indexes of the 4 entries nearest to city i).
nearest = tree.query(coords, k=4)[1]
for i in df.index:
    # Exclude the city itself explicitly rather than assuming it is always the
    # first hit, then keep at most three neighbours as plain ints.
    neighbours = [int(j) for j in nearest[i] if j != i][:3]
    for rank, j in enumerate(neighbours):
        graph.add_edge(i, j, distanceTime(df.iloc[i], df.iloc[j], rank))

In [56]:
# Shortest-path costs and routes from the departure city over the BallTree graph.
startNode = 189  # departure city
dijkstra = DijkstraSPF(graph, startNode)

print("%-5s %-5s %-10s" % ("label", "distance", "path"))
for u in df.index:
    cost = dijkstra.get_distance(u)
    # Cities whose distance is infinite are skipped.
    if cost == float('inf'):
        continue
    print("%-5s %-8d %-10s" % (u, cost, dijkstra.get_path(u)))

label distance path      
34    2        [189, 34] 
189   0        [189]     
796   4        [189, 796]
887   10       [189, 887]
