In [None]:
import os
# Select the DGL backend through the DGLBACKEND environment variable.
# This cell runs before the cell that imports dgl.
backend = 'pytorch'
os.environ.update(DGLBACKEND=backend)

In [None]:
import torch
# import tensorflow as tf
import dgl
import networkx as nx
import tqdm.auto as tqdm
import numpy as np
import itertools
import pickle
import pathlib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

# Compute features

In [None]:
def greedy_tour(g, depot, weight='weight'):
    """Build a nearest-neighbour tour that starts and ends at ``depot``.

    From the current node the tour always moves to the not-yet-visited
    neighbour with the smallest edge ``weight`` attribute; ties go to the
    neighbour that ``g.neighbors`` yields first.

    Args:
        g: graph exposing ``nodes``, ``edges[(u, v)]`` and ``neighbors(u)``.
        depot: node at which the tour starts and ends.
        weight: name of the edge attribute used as the distance.

    Returns:
        List of nodes, beginning and ending with ``depot``.

    Raises:
        ValueError: if the current node has no unvisited neighbour while
            nodes remain unvisited.
    """
    tour = [depot]
    # set mirror of `tour` so the membership test does not scan the list
    visited = {depot}
    n_nodes = len(g.nodes)

    while len(tour) < n_nodes:
        current = tour[-1]
        candidates = [
            (j, g.edges[(current, j)][weight])
            for j in g.neighbors(current)
            if j not in visited
        ]
        if not candidates:
            raise ValueError(
                f'greedy tour is stuck at node {current!r}: no unvisited neighbours left'
            )
        nearest, _ = min(candidates, key=lambda c: c[1])
        tour.append(nearest)
        visited.add(nearest)

    tour.append(depot)
    return tour

def set_greedy_tour(g, depot):
    """Flag the edges used by the greedy tour from ``depot``.

    Every edge of ``g`` gets the boolean attribute ``'in_greedy_solution'``:
    True for each pair of consecutive nodes in the tour returned by
    ``greedy_tour(g, depot)``, False for all other edges.
    """
    visit_order = greedy_tour(g, depot)

    # default every edge to False, then switch on the ones the tour walks
    nx.set_edge_attributes(g, False, 'in_greedy_solution')
    for step in range(len(visit_order) - 1):
        u, v = visit_order[step], visit_order[step + 1]
        g.edges[u, v]['in_greedy_solution'] = True
        

In [None]:
def set_neighbour_features(g, nn_levels, min_degree):
    """Annotate every edge of ``g`` with nearest-neighbour based features.

    Edge attributes written:

    * ``'neighbour'``: dict mapping each endpoint to the 0-based rank of
      this edge among that endpoint's edges sorted by increasing
      ``'weight'`` (0 = that endpoint's lightest edge).
    * ``'nn_clique'``: True when both endpoints give the edge the same rank.
    * ``'<k>_nn'`` for the k-th entry ``level`` of ``nn_levels``: True when
      the rank seen from either endpoint is ``<= level``.
    * ``'md_nn'``: True for the edges that survive erosion. Edges are
      removed heaviest-first for as long as the smallest degree among the
      nodes that still have an edge is greater than ``min_degree``.

    Args:
        g: undirected graph whose edges carry a ``'weight'`` attribute.
        nn_levels: sequence of rank thresholds.
        min_degree: degree at which erosion stops.
    """
    # calculate knn rank for each edge, as seen from each endpoint
    for e in g.edges:
        g.edges[e]['neighbour'] = {}

    for i in g.nodes:
        neighbours = [(j, g.edges[(i, j)]['weight']) for j in g.neighbors(i)]
        nearest_neighbours = sorted(neighbours, key=lambda e: e[1])
        for k, (j, _) in enumerate(nearest_neighbours):
            g.edges[(i, j)]['neighbour'][i] = k

    # knn graphs and nn clique
    nx.set_edge_attributes(g, False, 'nn_clique')
    for level_i in range(len(nn_levels)):
        nx.set_edge_attributes(g, False, f'{level_i}_nn')

    for e in g.edges:
        i, j = e
        ranks = g.edges[e]['neighbour']
        if ranks[i] == ranks[j]:
            g.edges[e]['nn_clique'] = True

        for level_i, level in enumerate(nn_levels):
            g.edges[e][f'{level_i}_nn'] = (ranks[i] <= level) or (ranks[j] <= level)

    # erode longest edges until min degree reached
    # (stable descending sort: equally heavy edges keep their g.edges order)
    ranked = sorted(g.edges, key=lambda e: g.edges[e]['weight'], reverse=True)

    # Degree of every node within the edges that are still kept. Nodes whose
    # last edge has been removed are dropped from the dict, so they no
    # longer take part in the minimum.
    degree = {}
    for u, v in ranked:
        degree[u] = degree.get(u, 0) + 1
        degree[v] = degree.get(v, 0) + 1

    # ranked[:cut] are the eroded edges. The bound on `cut` stops the loop
    # cleanly once every edge is gone (and on an edgeless graph) instead of
    # calling min() on an empty collection.
    cut = 0
    while cut < len(ranked) and min(degree.values()) > min_degree:
        for n in ranked[cut]:
            degree[n] -= 1
            if degree[n] == 0:
                del degree[n]
        cut += 1

    nx.set_edge_attributes(g, False, 'md_nn')
    for e in ranked[cut:]:
        g.edges[e]['md_nn'] = True


In [None]:
def set_depot_weight(g, depot):
    """Store on every node the weight of its edge to ``depot``.

    Writes the node attribute ``'depot_weight'``: 0 for the depot itself,
    otherwise the ``'weight'`` of the edge ``(depot, node)``. Every other
    node is therefore expected to share an edge with the depot.
    """
    for node in g.nodes:
        g.nodes[node]['depot_weight'] = (
            0 if node == depot else g.edges[(depot, node)]['weight']
        )
            

In [None]:
def set_fancy_graph_features(g):
    """Attach networkx centrality and clustering measures to ``g``.

    All measures are computed with the ``'weight'`` edge attribute and stored
    in place:

    * edge attributes ``'sp_betweenness'`` (``edge_betweenness_centrality``)
      and ``'cf_betweenness'``
      (``edge_current_flow_betweenness_centrality``);
    * node attributes ``'sp_closeness'`` (``closeness_centrality``),
      ``'cf_closeness'`` (``current_flow_closeness_centrality``) and
      ``'clustering'`` (``clustering``).
    """
    # compute everything first, then write the attributes
    edge_cf = nx.edge_current_flow_betweenness_centrality(g, weight='weight')
    edge_sp = nx.edge_betweenness_centrality(g, weight='weight')

    node_sp = nx.closeness_centrality(g, distance='weight')
    node_cf = nx.current_flow_closeness_centrality(g, weight='weight')
    node_cl = nx.clustering(g, weight='weight')

    edge_measures = ((edge_sp, 'sp_betweenness'), (edge_cf, 'cf_betweenness'))
    for values, attr_name in edge_measures:
        nx.set_edge_attributes(g, values, attr_name)

    node_measures = (
        (node_sp, 'sp_closeness'),
        (node_cf, 'cf_closeness'),
        (node_cl, 'clustering'),
    )
    for values, attr_name in node_measures:
        nx.set_node_attributes(g, values, attr_name)
    

# Prepare dataset

In [None]:
# Raw instance file: one problem and its solution per line (see parse_line).
data_dir = pathlib.Path('/local/scratch/bh511/data')
data_file = data_dir.joinpath('tsp10_concorde_1000000.txt')

# Number of instances to read from data_file. Everything derived from them is
# written to a sub-directory of data_dir named after this number.
n_instances = 150000
out_dir = data_dir.joinpath(f'{n_instances}_instances')
out_dir.mkdir(exist_ok=True)

In [None]:
def parse_line(line):
    """Parse one instance line into a complete weighted graph.

    Expected layout: ``x0 y0 x1 y1 ... output t0 t1 ...``. The part before
    ``output`` lists node coordinates; the part after it lists the solution
    tour as 1-based node indices. Tokens may be separated by any run of
    whitespace.

    Only consecutive tour entries are treated as solution edges, so the edge
    closing the tour is labelled only if the tour repeats its first node at
    the end.

    Args:
        line: one text line of the instance file.

    Returns:
        ``nx.Graph`` with nodes ``0..n-1`` (attribute ``pos``: numpy array
        ``[x, y]``) and an edge between every pair of nodes carrying
        ``weight`` (Euclidean distance) and ``in_solution`` (bool).

    Raises:
        ValueError: if the line does not contain exactly one ``output``
            marker, a token is not numeric, or the number of coordinates
            is odd.
    """
    problem_str, solution_str = line.split('output')

    # split() with no argument tolerates repeated spaces, tabs and the newline
    coords = [float(token) for token in problem_str.split()]
    if len(coords) % 2:
        raise ValueError(f'odd number of coordinates ({len(coords)}) in instance line')
    nodes = {
        k: np.array(coords[2 * k:2 * k + 2])
        for k in range(len(coords) // 2)
    }

    solution = [int(token) - 1 for token in solution_str.split()]
    # unordered pairs in a set: constant-time lookup, orientation-independent
    solution_edges = {frozenset(pair) for pair in zip(solution[:-1], solution[1:])}

    G = nx.Graph()
    for i in nodes:
        G.add_node(i, pos=nodes[i])
    for i, j in itertools.combinations(nodes, 2):
        w = np.linalg.norm(nodes[i] - nodes[j])
        in_solution = frozenset((i, j)) in solution_edges
        G.add_edge(i, j, weight=w, in_solution=in_solution)

    return G

In [None]:
# Read the first n_instances lines of data_file, compute all edge/node
# features for each instance and store one pickled graph per instance in
# out_dir. Each stored graph keeps only node positions plus, per edge, the
# feature vector 'x' and the label 'y'.
with open(data_file) as data:
    for instance_i in tqdm.trange(n_instances):
        # one instance per line; next() raises StopIteration if the file
        # holds fewer than n_instances lines
        g = parse_line(next(data))

        # the first node of the instance acts as the depot
        depot = next(iter(g.nodes))
        set_greedy_tour(g, depot)
        set_depot_weight(g, depot)

        # rank thresholds at 10%, 20% and 30% of the node count
        nn_levels = [int(x*len(g.nodes)) for x in [0.1, 0.2, 0.3]]
        min_degree = 2
        set_neighbour_features(g, nn_levels, min_degree)

        set_fancy_graph_features(g)

        # create new graph with just features/labels (and pos for plotting)
        h = g.__class__()
        h.add_nodes_from(g)
        h.add_edges_from(g.edges)
        nx.set_node_attributes(h, nx.get_node_attributes(g, 'pos'), 'pos')

        for e in h.edges:
            i, j = e
            edge = g.edges[e]
            features = np.array([
                edge['weight'],
                edge['in_greedy_solution'],
                edge['neighbour'][i],
                edge['neighbour'][j],
                edge['nn_clique'],
                # one '<k>_nn' flag per entry of nn_levels, in order
                *(edge[f'{level_i}_nn'] for level_i in range(len(nn_levels))),
                edge['md_nn'],
                # constant across edges here: the graph is complete with
                # Euclidean weights, so the direct edge is a shortest path
                # between its endpoints
                edge['sp_betweenness'],
                edge['cf_betweenness'],
                g.nodes[i]['depot_weight'],
                g.nodes[j]['depot_weight'],
                g.nodes[i]['sp_closeness'],
                g.nodes[j]['sp_closeness'],
                g.nodes[i]['cf_closeness'],
                g.nodes[j]['cf_closeness'],
                g.nodes[i]['clustering'],
                g.nodes[j]['clustering'],
            ], dtype=np.float32)
            label = np.array([
                edge['in_solution'],
            ], dtype=np.int64)

            h.edges[e]['x'] = features
            h.edges[e]['y'] = label

        # nx.write_gpickle was removed in NetworkX 3.0 in favour of the
        # pickle module; this writes the same kind of file on every version.
        with open(out_dir / f'i{instance_i}.pkl', 'wb') as instance_file:
            pickle.dump(h, instance_file, protocol=pickle.HIGHEST_PROTOCOL)

# Split into train, validation, test sets

In [None]:
# glob() yields paths in arbitrary order; sorting them and fixing the seed
# makes the split reproducible across kernel restarts.
data_set = sorted(out_dir.glob('i[0-9]*.pkl'))
split_seed = 0

# 80/20 train/test, then 80/20 train/validation within the training part
train_set, test_set = train_test_split(data_set, train_size=0.8, shuffle=True, random_state=split_seed)
train_set, val_set = train_test_split(train_set, train_size=0.8, shuffle=True, random_state=split_seed)

# Write one path per line for each split. The loop uses its own names so the
# module-level `data_set` list and `data_file` input path are not overwritten
# (re-running the extraction cell needs `data_file` to still be a path).
for split_paths, split_name in zip([train_set, val_set, test_set], ['train.txt', 'val.txt', 'test.txt']):
    with open(out_dir / split_name, 'w') as split_file:
        split_file.writelines(f'{path}\n' for path in split_paths)

In [None]:
# Fit the min-max scaler on the training instances only, one graph at a time
# so the whole training set never has to be held in memory.
scaler = MinMaxScaler()

for instance_path in tqdm.tqdm(train_set):
    # nx.read_gpickle was removed in NetworkX 3.0 in favour of the pickle
    # module. The files are the ones written by this notebook; do not point
    # this at pickles from an untrusted source.
    with open(instance_path, 'rb') as instance_file:
        g = pickle.load(instance_file)
    # stack the per-edge feature vectors 'x' into one row per edge
    features = np.vstack([g.edges[e]['x'] for e in g.edges])
    scaler.partial_fit(features)

# context manager so the file is flushed and closed once the dump finishes
with open(out_dir / 'scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
def prepare_graphs(instances, scaler):
    """Load pickled instance graphs and convert them to DGL line graphs.

    For every path in ``instances`` the stored graph is unpickled and turned
    into its line graph, so each original edge becomes a node. The line-graph
    nodes receive ``'x'`` (the edge feature vector passed through
    ``scaler.transform``) and ``'y'`` (the edge label), and the result is
    converted with ``dgl.from_networkx``.

    Args:
        instances: iterable of paths to pickled graphs whose edges carry
            ``'x'`` and ``'y'`` attributes.
        scaler: fitted scaler exposing ``transform``.

    Returns:
        List of DGL graphs, one per instance, in input order.
    """
    graphs = []
    for instance in tqdm.tqdm(instances):
        # nx.read_gpickle was removed in NetworkX 3.0 in favour of the pickle
        # module. Only load files produced by this notebook.
        with open(instance, 'rb') as instance_file:
            g = pickle.load(instance_file)
        lg = nx.line_graph(g)

        # Scale all edge features of the graph in a single transform call
        # instead of one call per edge; row k belongs to edge_keys[k].
        edge_keys = list(lg.nodes)
        features = {}
        if edge_keys:
            raw = np.vstack([g.edges[e]['x'] for e in edge_keys])
            features = dict(zip(edge_keys, scaler.transform(raw)))
        labels = {e: g.edges[e]['y'] for e in edge_keys}
        nx.set_node_attributes(lg, features, 'x')
        nx.set_node_attributes(lg, labels, 'y')

        h = dgl.from_networkx(lg, node_attrs=['x', 'y'])
        graphs.append(h)

    return graphs

# Convert the training split to DGL graphs and persist them in out_dir.
train_graphs = prepare_graphs(train_set, scaler)
dgl.save_graphs(os.fspath(out_dir / 'train_graphs.bin'), train_graphs)

# Same for the validation split.
val_graphs = prepare_graphs(val_set, scaler)
dgl.save_graphs(os.fspath(out_dir / 'val_graphs.bin'), val_graphs)

# Visualise dataset

In [None]:
# cmap_colors = np.zeros((100, 4))
# cmap_colors[:, 0] = 1
# cmap_colors[:, 3] = np.linspace(0, 1, 100)
# cmap = ListedColormap(cmap_colors)

In [None]:
# fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# for i, (ax, g) in enumerate(zip(axes, nx_graphs[:3])):
#     pos = {n: (g.nodes[n]['x'], g.nodes[n]['y']) for n in g.nodes}
#     in_solution = nx.get_edge_attributes(g, 'in_solution')

#     nx.draw(g, pos, edge_color=in_solution.values(), ax=ax, edge_cmap=cmap, edge_vmax=1, edge_vmin=0)
#     ax.set_title(f'Instance {i}')

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# g = nx_graphs[0]

# feature_name = 'in_solution'
# pos = {n: (g.nodes[n]['x'], g.nodes[n]['y']) for n in g.nodes}
# feature = nx.get_edge_attributes(g, feature_name)

# nx.draw(g, pos, edge_color=feature.values(), ax=ax, edge_cmap=cmap)
# ax.set_title(feature_name)