In [1]:
from tools import *
import numpy as np, pandas as pd, networkx as nx
import matplotlib.pyplot as plt

In this example, we will construct a concise model of transit flows through airports in the United States.

In [2]:
# Load the trigram counts from 'airport_trigrams.txt' (space-separated, no header row).
# The raw data was pre-processed beforehand to drop return trigrams of the form i -> j -> i.

data = (
    pd.read_csv('airport_trigrams.txt', sep=' ', header=None)
    .to_numpy()
    .tolist()
)

In [3]:
# For example, the first two records: each holds three airport codes followed by the trigram count.

data[:2]

[['CHS', 'FLL', 'ORD', 4.0], ['CHS', 'MCO', 'ATL', 1.0]]

In [4]:
# Build the Network from a list of paths, where each path is a list of the physical nodes visited.
# flag_n_paths=True indicates that the last element of every path is the number of times
# that path was observed, rather than another node.

network = Network.from_paths(
    data,
    flag_n_paths=True,
)

In [5]:
# Pick the 10 largest transit hubs: sum the trigram counts ('num') per airport in the 'j'
# column, i.e. the trigrams passing through each airport, and keep the ten largest totals.
# The trigrams are held by the Network object as a pandas DataFrame (network.df_trigrams).

srs_top10 = (
    network.df_trigrams
    .groupby('j')['num']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
srs_top10

j
ATL    529581.0
DFW    369169.0
DEN    345731.0
CLT    343534.0
ORD    235590.0
PHX    159261.0
LAS    145290.0
SEA    140914.0
IAH    116158.0
MSP     98626.0
Name: num, dtype: float64

In [6]:
# Index labels of srs_top10 (the airport codes of the top-10 hubs), in descending order of trigram count.
nodes_to_process = srs_top10.index.tolist()

In [7]:
network.process_nodes(
    # Physical nodes to create states for (default 'all').
    nodes_to_process=nodes_to_process,
    # Parallelise with joblib (default False).
    flag_parallelise=True,
    # Strength of the prior; when None it is estimated with leave-one-out cross-validation.
    mu=None,
    # Regularise every physical node that has trigrams through it, even those for which
    # no states are created. This alters their first-order model.
    all_trigram_nodes=True,
    # Threshold for flow overlap.
    overlap_threshold=0.7,
    # Maximum rank up to which state nodes are created.
    max_rank=10,
    # Number of candidate solutions for each rank.
    n_candidates=50,
    # Initialisation scheme for Convex-NMF.
    initialise='kmeans',
)

In [8]:
# Number of state nodes (the optimal rank) assigned to each processed airport.

{
    airport: network.dict_physical_nodes[airport].optimal_rank
    for airport in nodes_to_process
}

{'ATL': 2,
 'DFW': 2,
 'DEN': 2,
 'CLT': 2,
 'ORD': 2,
 'PHX': 4,
 'LAS': 5,
 'SEA': 2,
 'IAH': 2,
 'MSP': 3}

In [9]:
# Optional step: trim the neighbourhoods of the state nodes of every processed airport.

for node_idx in nodes_to_process:
    physical_node = network.dict_physical_nodes[node_idx]
    physical_node.trim(multiplier_ratio=0.05)

In [10]:
# Put the network together.
# NOTE(review): presumably this is what populates network.edges and network.dict_state_nodes
# read by the NetworkX export below — confirm against the Network class in tools.

network.stitch()

In [11]:
# Export the state-node network as a weighted NetworkX DiGraph.
# Nodes are added first, so state nodes without any edge still appear in the graph.

G = nx.DiGraph()
G.add_nodes_from(network.dict_state_nodes.keys())
G.add_weighted_edges_from(
    (source, target, weight)
    for source, targets in network.edges.items()
    for target, weight in targets.items()
)

In [12]:
# Size of the resulting graph as (number of nodes, number of edges).
G.number_of_nodes(), G.number_of_edges()

(451, 13746)