In [23]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import random

In [30]:
from auxiliaries import *

random.seed(20)

In [5]:
berlin, berlin_nodes = read_in_network("berlin", "combined")
G = convert_to_graph(berlin)
pos = add_positions(G, berlin_nodes)

## Connected component

The Berlin network is not a connected component, but as we see below, the largest connected component is very large (only 8 nodes are not connected to it). Therefore we can just drop those few nodes and work only with the largest connected component.

In [9]:
# length of each connected component
for component in nx.connected_components(G):
    print(len(component))

4593
4
2
2


In [14]:
connected_G = G.subgraph(max(nx.connected_components(G), key=len))

## Functions for experiments

In [19]:
def travel_time(a, b):
    """Rough estimate of travel time between two nodes."""
    return nx.shortest_path_length(G, a, b, weight="duration_avg")

In [32]:
def average_travel_time(G):
    """Average travel time between all pairs of nodes in G."""
    return nx.average_shortest_path_length(G, weight="duration_avg")


def full_average_travel_time(G):
    """
    calculate average travel time for each component of G
    and weight it by the number of nodes in the component
    """
    components = nx.connected_components(G)
    total = 0
    for component in components:
        total += len(component) * average_travel_time(G.subgraph(component))
    return total / len(G)


def random_sample(nodes, size):
    """
    Randomly sample a subset of nodes from the graph.
    """
    # take sample of nodes
    sample = random.sample(list(nodes), size)
    return sample


def sample_average_travel_time(sample, G):
    """
    Average travel time between all pairs of nodes in the sample.
    """
    # for each pair in sample, calculate shortest path length in G
    # and average over all pairs
    total = 0
    for a in sample:
        for b in sample:
            total += travel_time(a, b)
    return total / (len(sample) ** 2)

In [36]:
# only take largest connected component
sample_average_travel_time(random_sample(connected_G.nodes, 100), connected_G)

2263.021694259138