# Convert LN JSON graph data to Pandas DataFrame

In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize

def convert_ln_json_to_df(json_file_path):
    
    graph_path = open(json_file_path)
    graph_json = json.load(graph_path)
    
    df_nodes = json_normalize(graph_json['nodes'])
    
    df_channels = json_normalize(graph_json['edges'])
    df_channels.channel_id = df_channels.capacity.astype(int)
    df_channels.capacity = df_channels.capacity.astype(int)
    
    return df_nodes, df_channels

lngraph_path = 'lngraph.json'

df_nodes, df_channels = convert_ln_json_to_df(lngraph_path)

In [2]:
df_nodes.head()

Unnamed: 0,last_update,pub_key,alias,addresses,color
0,1573030306,0200072fd301cb4a680f26d87c28b705ccd6a1d5b00f1b...,OutaSpace 🚀,"[{'network': 'tcp', 'addr': '46.163.78.93:9760...",#123456
1,1569159977,02002f0d3dd8caf7271e17a815c862826dfe98aa33c261...,Iron_Mountain_Exchange,"[{'network': 'tcp', 'addr': 'okbwzrbgxiplzcbj6...",#000000
2,0,02004111b64ca4c268811f116112bf930b6f0fac452e28...,,[],#000000
3,1572938717,0200424bd89b5282c310e10a52fd783070556f947b54d9...,WHENBTC,"[{'network': 'tcp', 'addr': '67.166.1.116:9735'}]",#3399ff
4,1575112618,02004c625d622245606a1ea2c1c69cfb4516b703b47945...,WalletOfSatoshi.com,"[{'network': 'tcp', 'addr': '172.81.178.151:97...",#3399ff


In [3]:
df_channels.head()

Unnamed: 0,channel_id,chan_point,last_update,node1_pub,node2_pub,capacity,node1_policy,node2_policy,node2_policy.time_lock_delta,node2_policy.min_htlc,...,node2_policy.disabled,node2_policy.max_htlc_msat,node2_policy.last_update,node1_policy.time_lock_delta,node1_policy.min_htlc,node1_policy.fee_base_msat,node1_policy.fee_rate_milli_msat,node1_policy.disabled,node1_policy.max_htlc_msat,node1_policy.last_update
0,37200,ede04f9cfc1bb5373fd07d8af9c9b8b5a85cfe5e323b77...,0,03bd3466efd4a7306b539e2314e69efc6b1eaee29734fc...,03c3d14714b78f03fd6ea4997c2b540a4139258249ea1d...,37200,,,,,...,,,,,,,,,,
1,1000000,cfd0ae79fc150c2c3c4068ceca74bc26652bb269162437...,0,02eccebd9ed98f6d267080a58194dbe554a2b33d976eb9...,02ee4469f2b686d5d02422917ac199602ce4c366a7bfaa...,1000000,,,,,...,,,,,,,,,,
2,1000000,c0a8d3428f562c232d86be399eb4497934e7e0390fa79e...,0,02eccebd9ed98f6d267080a58194dbe554a2b33d976eb9...,02ee4469f2b686d5d02422917ac199602ce4c366a7bfaa...,1000000,,,,,...,,,,,,,,,,
3,200000,06bbac25ed610feb1d07316d1be8b8ba6850ee1dd96cc1...,0,03bd3466efd4a7306b539e2314e69efc6b1eaee29734fc...,03cbf298b068300be33f06c947b9d3f00a0f0e8089da32...,200000,,,,,...,,,,,,,,,,
4,2000000,2392c45431c064269e4eaeccb0476ac32e56485d84e104...,0,022e74ed3ddd3f590fd6492e60b20dcad7303f17e1ffd8...,02ee4469f2b686d5d02422917ac199602ce4c366a7bfaa...,2000000,,,,,...,,,,,,,,,,


# Convert LN JSON graph data to NetworkX graph

In [4]:
import networkx as nx
import json

def convert_ln_json_to_nx_graph(json_file_path):

    # Read JSON data
    graph_path = open(json_file_path)
    graph_json = json.load(graph_path)
    
    # Create an empty graph
    G = nx.Graph()
    
    # Parse and add nodes
    for node in graph_json['nodes']:
        G.add_node(
            node['pub_key'], 
            alias=node['alias'], 
            addresses=node['addresses'], 
            color=node['color'], 
            last_update=['last_update']
        )
        
    # Parse and add edges
    for edge in graph_json['edges']:
        G.add_edge(
            edge['node1_pub'],
            edge['node2_pub'],
            channel_id=edge['channel_id'],
            chan_point=edge['chan_point'],
            last_update=edge['last_update'],
            capacity=edge['capacity'],
            node1_policy=edge['node1_policy'],
            node2_policy=edge['node2_policy']
        )
        
    return G

lngraph_path = 'lngraph.json'
nxgraph = convert_ln_json_to_nx_graph(lngraph_path)

In [5]:
nx.number_of_nodes(nxgraph)

6251

In [6]:
nx.number_of_edges(nxgraph)

32079

# Convert LN JSON graph data to Graph-Tool graph

In [7]:
from graph_tool.all import *
import graph_tool as gt

def convert_ln_json_to_gt_graph(json_file_path, internal_properties=True, directed=False):

    # Read JSON data
    graph_path = open(json_file_path)
    graph_json = json.load(graph_path)
    
    # We start with an empty, directed graph
    g = gt.Graph(directed=directed)

    # Adding the node properties
    v_pub_key = g.new_vertex_property("string")
    v_last_update = g.new_vertex_property("int")
    v_alias = g.new_vertex_property("string")
    v_addresses = g.new_vertex_property("string")
    v_color = g.new_vertex_property("string")

    # Adding the edge properties
    e_channel_id = g.new_edge_property("object")
    e_chan_point = g.new_edge_property("object")
    e_last_update = g.new_edge_property("int")
    e_capacity = g.new_edge_property("object")
    e_node1_pub = g.new_edge_property("object")
    e_node2_pub = g.new_edge_property("object")
    e_node1_policy = g.new_edge_property("object")
    e_node2_policy = g.new_edge_property("object")

    # Create dictionary of pub_key:index pairs to keep track of vertices to generate edges
    v_indeces = {}
    
    # Let's now add the new vertices and edges
    for node in graph_json['nodes']:
        v = g.add_vertex()
        v_pub_key[v] = node['pub_key']
        v_alias[v] = node['alias']
        v_addresses[v] = node['addresses']
        v_color[v] = node['color']
        v_last_update[v] = node['last_update']
        v_indeces[node['pub_key']] = g.vertex_index[v]

    for edge in graph_json['edges']:
        v_index = v_indeces[edge['node1_pub']]
        target_index = v_indeces[edge['node2_pub']]
        
        v = g.vertex(v_index)
        target = g.vertex(target_index)
        e = g.add_edge(v, target)

        e_channel_id[e] = edge['channel_id']
        e_chan_point[e] = edge['chan_point']
        e_last_update[e] = edge['last_update']
        e_capacity[e] = edge['capacity']
        e_node1_pub[e] = edge['node1_pub']
        e_node2_pub[e] = edge['node2_pub']
        e_node1_policy[e] = edge['node1_policy']
        e_node2_policy[e] = edge['node2_policy']
                             
    # Making the vertex and edge properties internal (to be able to save them with the graph)
    if internal_properties:
        g.vertex_properties['pub_key'] = v_pub_key
        g.vertex_properties['alias'] = v_alias
        g.vertex_properties['addresses'] = v_addresses
        g.vertex_properties['color'] = v_color
        g.vertex_properties['last_update'] = v_last_update
                             
        g.edge_properties['channel_id'] = e_channel_id
        g.edge_properties['chan_point'] = e_chan_point
        g.edge_properties['last_update'] = e_last_update
        g.edge_properties['node1_pub'] = e_node1_pub
        g.edge_properties['node2_pub'] = e_node2_pub
        g.edge_properties['node1_policy'] = e_node1_policy
        g.edge_properties['node2_policy'] = e_node2_policy
                             
    return g

lngraph_path = 'lngraph.json'
gtgraph = convert_ln_json_to_gt_graph(lngraph_path)

In [8]:
gtgraph

<Graph object, undirected, with 6251 vertices and 35457 edges at 0x163762a90>

# Function to compute the average and quartile values of a metric

In [9]:
import numpy as np

def get_basic_stats(values, column_label):
    
    average = values.mean()
    percentiles = np.percentile(values, [90, 50, 10])

    print('Statistics for {}: '.format(column_label))
    print('Average: {} \nPercentiles: \n 90th Percentile: {} \n 50th Percentile: {} \n 10th Percentile: {} \n'.format(
        average, percentiles[0], percentiles[1], percentiles[2]
    ))
    
    return average, percentiles

# Nodes (dataframe)

* Number of nodes, with/without channels
* Channels per node
* Capacity per node

In [10]:
def add_node_chan_info(df_nodes, df_channels):
    df_nodes = pd.concat([
        df_nodes,
        pd.DataFrame(columns=[
                'num_enabled_channels',
                'num_channels',
                'percent_enabled_chan',
                'total_node_capacity'
        ])
    ], sort=False)

    for index, node in df_nodes.iterrows():

        pub_key = node['pub_key']
        node_channels = df_channels[df_channels.node1_pub == pub_key]

        enabled_channels = 0
        total_capacity = 0

        for _, channel in node_channels.iterrows():
            total_capacity += channel.capacity
            disabled = channel.loc['node1_policy.disabled']

            if disabled is not None and not disabled:
                enabled_channels += 1

        df_nodes.loc[index, 'num_enabled_channels'] = enabled_channels
        df_nodes.loc[index, 'num_channels'] = node_channels.shape[0]
        if node_channels.shape[0] > 0: df_nodes.loc[index, 'percent_enabled_chan'] = enabled_channels/node_channels.shape[0]
        df_nodes.loc[index, 'total_node_capacity'] = total_capacity

    return df_nodes

In [11]:
df_nodes = add_node_chan_info(df_nodes, df_channels)
df_nodes.head()

Unnamed: 0,last_update,pub_key,alias,addresses,color,num_enabled_channels,num_channels,percent_enabled_chan,total_node_capacity
0,1573030000.0,0200072fd301cb4a680f26d87c28b705ccd6a1d5b00f1b...,OutaSpace 🚀,"[{'network': 'tcp', 'addr': '46.163.78.93:9760...",#123456,0,9,0.0,6600000
1,1569160000.0,02002f0d3dd8caf7271e17a815c862826dfe98aa33c261...,Iron_Mountain_Exchange,"[{'network': 'tcp', 'addr': 'okbwzrbgxiplzcbj6...",#000000,1,1,1.0,8000000
2,0.0,02004111b64ca4c268811f116112bf930b6f0fac452e28...,,[],#000000,0,1,0.0,50000
3,1572939000.0,0200424bd89b5282c310e10a52fd783070556f947b54d9...,WHENBTC,"[{'network': 'tcp', 'addr': '67.166.1.116:9735'}]",#3399ff,18,20,0.9,40678240
4,1575113000.0,02004c625d622245606a1ea2c1c69cfb4516b703b47945...,WalletOfSatoshi.com,"[{'network': 'tcp', 'addr': '172.81.178.151:97...",#3399ff,164,214,0.766355,954797898


In [12]:
# Number of nodes with/without channels
cnt_nodes_with_channels = df_nodes[df_nodes.num_channels != 0].shape[0]
cnt_nodes_without_channels = df_nodes[df_nodes.num_channels == 0].shape[0]
total_cnt_nodes = cnt_nodes_with_channels+cnt_nodes_without_channels

print('Number of nodes {} \n with channels: {} \n without channels {} \n'
      .format(cnt_nodes_with_channels, cnt_nodes_without_channels, total_cnt_nodes)
    )

# Statistics for channels per node
# Including inactive nodes (nodes with no channels)
values = df_nodes.num_channels.values
average, percentiles = get_basic_stats(values, 'node channels (with inactive)')

# Without inactive nodes
values = df_nodes.loc[df_nodes.num_enabled_channels != 0, 'num_channels'].values
average, percentiles = get_basic_stats(values, 'node channels (without inactive)')

# Statistics for capacity per node
# Including inactive nodes (nodes with no channels)
values = df_nodes.total_node_capacity.values
average, percentiles = get_basic_stats(values, 'node capacities (with inactive)')

# Without inactive nodes
values = df_nodes.loc[df_nodes.num_enabled_channels != 0, 'total_node_capacity'].values
average, percentiles = get_basic_stats(values, 'node capacities (without inactive)')

Number of nodes 4500 
 with channels: 1751 
 without channels 6251 

Statistics for node channels (with inactive): 
Average: 5.672212446008639 
Percentiles: 
 90th Percentile: 10.0 
 50th Percentile: 1.0 
 10th Percentile: 0.0 

Statistics for node channels (without inactive): 
Average: 10.12246835443038 
Percentiles: 
 90th Percentile: 18.0 
 50th Percentile: 3.0 
 10th Percentile: 1.0 

Statistics for node capacities (with inactive): 
Average: 13359987.889137737 
Percentiles: 
 90th Percentile: 15329369.0 
 50th Percentile: 200000.0 
 10th Percentile: 0.0 

Statistics for node capacities (without inactive): 
Average: 26064984.527848102 
Percentiles: 
 90th Percentile: 34688527.3 
 50th Percentile: 1926551.0 
 10th Percentile: 100000.0 



# Channels

In [13]:
total_num_channels = df_channels.shape[0]

unique_channels = df_channels.drop_duplicates(subset=['node1_pub', 'node2_pub'])
num_unique_channels = unique_channels.shape[0]
num_duplicate_channels = total_num_channels - unique_channels.shape[0]

print('Total number of channels: {} \n unique: {} \n duplicate: {}'.format(total_num_channels, num_unique_channels, num_duplicate_channels))

Total number of channels: 35457 
 unique: 32079 
 duplicate: 3378


# Network Capacity

In [14]:
capacity = df_channels.capacity.sum()
print(capacity/100000000)

835.13284295


# Capacity Per Channel

In [15]:
values = df_channels.capacity.values
average, percentiles = get_basic_stats(values, 'capacity per channel')

Statistics for capacity per channel: 
Average: 2355339.828383676 
Percentiles: 
 90th Percentile: 8000000.0 
 50th Percentile: 500000.0 
 10th Percentile: 27207.4 



# Distance Measures

In [16]:
# Keep largest connected component
l = gt.topology.label_largest_component(gtgraph, directed=False)
gtgraph.set_vertex_filter(l)
print('Number of vertices: {} \n Number of edges: {} \n'.format(gtgraph.num_vertices(), gtgraph.num_edges()))

# Remove self-loops (ie: duplicate edges)
gt.stats.remove_parallel_edges(gtgraph)
print('Number of vertices: {} \n Number of edges: {} \n'.format(gtgraph.num_vertices(), gtgraph.num_edges()))

Number of vertices: 6176 
 Number of edges: 35415 

Number of vertices: 6176 
 Number of edges: 32038 



In [17]:
def get_distance_measures(graph, directed=False, pseudo_diameter=False, return_dist=False):

    # Compute shortest paths from all vertices to all vertices and save results in a dataframe
    dist_map = shortest_distance(graph, directed)
    shortest_paths = pd.DataFrame(dist_map)

    # Average distance
    average = shortest_paths.replace(0, np.nan).mean(skipna=True).mean()
    print('Average shortest distance: {}'.format(round(average, 2)))

    # Diameter of the graph: length of the longest shortest path in the graph
    if pseudo_diameter:
        diameter, _ = gt.topology.pseudo_diameter(graph)
        print('Diameter: {}'.format(pseudo_diameter))
    else: 
        diameter = shortest_paths.values.max()
        print('Pseudo-diameter: {}'.format(diameter))

    # Radius of the graph: smallest of largest shortest path of
    radius = shortest_paths.replace(0, np.nan).max(skipna=True).min()
    print('Radius: {}'.format(radius))

    if return_dist:
        return shortest_paths, average, diameter, radius

    return average, diameter, radius

In [18]:
get_distance_measures(gtgraph)

Average shortest distance: 3.22
Pseudo-diameter: 8
Radius: 8.0


(3.223157894736842, 8, 8.0)

# Completeness Measures

In [19]:
# Completeness measure: density of the graph
# The max number of channels (or edges) in the network is n*(n-1)/2, where n is the number of nodes (or vertices).
max_num_channels = (total_cnt_nodes*(total_cnt_nodes-1))/2
completeness = num_unique_channels/max_num_channels
print(completeness)

0.0016421820508718606


# Clustering Measures

In [20]:
# Transitivity is the ratio of potential triangles present.
# A value of 1 means every path of length 2 loops back into a triangle.
transitivity, sd = gt.clustering.global_clustering(gtgraph)
print('Graph transitivity: {}\n'.format(round(transitivity,3)))

# Clustering coefficient is the ratio of interconnections between a node's peers. 
# A value of 0 means the node is a hub, and none of its peers are connected. A value of 1 means the node forms a clique with its peers.
transitivities = gt.clustering.local_clustering(gtgraph).a
get_basic_stats(transitivities, 'node transitivities')

Graph transitivity: 0.067

Statistics for node transitivities: 
Average: 0.22932388757423353 
Percentiles: 
 90th Percentile: 0.6666666666666666 
 50th Percentile: 0.1 
 10th Percentile: 0.0 



(PropertyArray(0.22932389), array([0.66666667, 0.1       , 0.        ]))

# Connectivity Measures

In [30]:
# Percent of cut vertices
_, art, _ = gt.topology.label_biconnected_components(gtgraph)
cnt_cut_vertices = art.a.sum()
percent_cut_vertices = art.a.sum()/gtgraph.num_edges()
print('Percent of cut vertices: {}'.format(round(float(percent_cut_vertices),3)))

# Percent of cut edges
bridges = list(nx.bridges(nxgraph))
perc_cut_edges = len(bridges)/gtgraph.num_edges()
print('Percent of cut edges: {}'.format(round(perc_cut_edges,2)))

Percent of cut vertices: 0.018
Percent of cut edges: 0.06
