- **Node Analysis**: The code calculates key metrics for nodes, including:
  - Total and average capacity of nodes.
  - Percentiles and ranks based on node capacity to understand distribution.
  - Node channel count with percentiles and ranking to evaluate connectivity.

- **Edge Analysis**: For edges, the code facilitates the extraction of:
  - Connectivity patterns between nodes, identifying key connection points.
  - Edge capacity utilization metrics to gauge traffic distribution.
  - Ranking of edges based on their capacity, highlighting critical connections in the network.

- **Network Optimization**: Insights from node and edge metrics are crucial for optimizing network routes and load balancing, enhancing overall network efficiency.

- **Strategic Planning**: Analyzing percentile and rank for node and edge capacities assists in strategic network planning and scaling decisions.

- **Structural Insights**: The code helps in understanding the structural dynamics of the network, such as centrality measures, which can indicate the most influential or vulnerable points in the network.


In [168]:
import json
import csv
import datetime
import os
import pandas as pd
import hashlib
import hvplot.pandas
import holoviews as hv
hv.extension('bokeh')

import requests
import time

import numpy as np

import pickle

current_date = datetime.datetime.today().strftime('%Y%m%d')
current_datehr = datetime.datetime.today().strftime('%Y%m%d%H')

In [6]:
nodes = pd.read_parquet('nodes.parquet')
edges = pd.read_parquet('edges.parquet')

In [72]:
edges.tail(1)


In [10]:
nodes.columns

Index(['last_update', 'pub_key', 'alias', 'addresses', 'color', 'features',
       'custom_records', 'node_key', 'source', 'channel_count',
       'total_capacity'],
      dtype='object')

In [12]:
nodes.drop(['channel_count', 'total_capacity'], axis=1, inplace=True)
nodes.columns

Index(['last_update', 'pub_key', 'alias', 'addresses', 'color', 'features',
       'custom_records', 'node_key', 'source'],
      dtype='object')

In [20]:

def process_channel_capacity(capacity):
    """
    Process and categorize the channel capacity into different segments for display
    and analysis. The capacity is formatted based on its size and categorized into
    predefined ranges.

    Args:
    capacity (int): The channel capacity in satoshis.

    Returns:
    tuple: A tuple containing formatted capacity and its category.
    """
    # Define bins and labels for general categorization
    general_bins = [0, 5000001, 100000001, 10000000000]
    general_labels = ['My Way', 'Highway', 'Freeway']
    Channel_Size_Tier = pd.cut([capacity], bins=general_bins, labels=general_labels, right=False)[0]

    # Detailed capacity categories
    detailed_bins = [0, 50000, 100000, 500000, 1000000, 3000000, 5000000, 10000000, 
                     25000000, 50000000, 100000000, 1000000000, 10000000000]
    detailed_labels = ['0-50k', '50k-100k', '100k-500k', '500k-1M', '1M-3M', '3M-5M', 
                       '5M-10M', '10M-25M', '25M-50M', '0.5BTC-1BTC', '1BTC-10BTC', '10BTC-100BTC']
    Channel_Size_Range = pd.cut([capacity], bins=detailed_bins, labels=detailed_labels, right=False)[0]

    # Format capacity for display
    formatted_capacity = format_capacity(capacity)

    return formatted_capacity, Channel_Size_Tier, Channel_Size_Range



def process_node_data(source, edges):
    """
    Returns detailed statistics for network nodes including channel counts, 
    capacity metrics, and capacity categories.
    
    Args:
    source (str): Node's public key identifier.
    edges (DataFrame): DataFrame containing edge data.
    
    Returns:
    tuple: Node statistics including total channel count, capacity categories, 
           formatted total capacity, and capacity metrics (average, median, mode, minimum, maximum).
    """
    # Apply the consistent filtering condition across all operations
    filtered_edges = edges[
        ((edges["source"] == source) | (edges["target"] == source)) & 
        (edges["capacity"] > 0) & 
        (edges["last_update"] > 0)
    ]
    
    if filtered_edges.empty:
        return (0, {}, "0 sats", 0, 0, 0, 0, 0)  # Return default values if no edges match

    # Calculate total channels and categorize them
        
    total_channels = len(filtered_edges['channel_id'].unique())
    category_counts = filtered_edges['Channel_Size_Tier'].value_counts().reindex(['My Way', 'Highway', 'Freeway'], fill_value=0).to_dict()

    # Extract and compute capacity related metrics
    total_capacity = filtered_edges["capacity"].sum()
    avg_chnl_size = filtered_edges["capacity"].mean()
    med_chnl_size = filtered_edges["capacity"].median()
    mode_chnl_size = filtered_edges["capacity"].mode().iloc[0] if not filtered_edges["capacity"].mode().empty else 0
    min_chnl_size = filtered_edges["capacity"].min()
    max_chnl_size = filtered_edges["capacity"].max()

    # Format total capacity
    formatted_total_capacity = format_capacity(total_capacity)

    return (total_channels, category_counts, total_capacity, formatted_total_capacity, avg_chnl_size, med_chnl_size, mode_chnl_size, min_chnl_size, max_chnl_size)

def format_capacity(capacity):
    """Format capacity into a readable string."""
    if capacity < 1000000:
        return f"{capacity/1000:,.0f}k sats"
    elif capacity < 100000000:
        return f"{capacity/1000000:,.1f}m sats"
    else:
        return f"{capacity/100000000:,.1f} bitcoin"


def assign_percentile(rank, total):
    """Assigns a custom percentile label based on the rank and total count."""
    percentile = rank / total * 100  # Calculate percentile based directly on rank
    if percentile <= 0.5:
        return 'Top 0.5%'
    elif percentile <= 5:
        return 'Top 5%'
    elif percentile <= 20:
        return 'Top 20%'
    elif percentile <= 40:
        return 'Top 40%'
    else:
        return 'Bottom 60%'

def rank_and_categorize_nodes(nodes_df):
    # Filter nodes where both capacity and channel count are greater than zero
    active_nodes = nodes_df[(nodes_df['Total_Capacity'] > 0) & (nodes_df['Total_Channels'] > 0)].copy()

    # Rank nodes based on total capacity and channel count within the filtered data
    active_nodes['Capacity_Rank'] = active_nodes['Total_Capacity'].rank(method='max', ascending=False)
    active_nodes['Channel_Count_Rank'] = active_nodes['Total_Channels'].rank(method='max', ascending=False)

    # Calculate total nodes for percentile calculations
    total_nodes = len(active_nodes)

    # Apply custom percentile function
    active_nodes['Capacity_Percentile'] = active_nodes['Capacity_Rank'].apply(lambda x: assign_percentile(x, total_nodes))

    # Merge the calculated ranks and percentiles back into the original DataFrame
    nodes_df = nodes_df.merge(active_nodes[['Capacity_Rank', 'Channel_Count_Rank', 'Capacity_Percentile']],
                              left_index=True, right_index=True, how='left')

    return nodes_df




def clearnet_tor_categorizer(row):
    clearnet_status = 'Y' if pd.notna(row['address_1']) else 'N'
    tor_status = 'Y' if pd.notna(row['address_2']) else 'N'

    if clearnet_status == 'Y' and tor_status == 'Y':
        return 'Clearnet and Tor'
    elif clearnet_status == 'Y':
        return 'Only Clearnet'
    else:
        return 'Only Tor'




In [40]:
print(nodes.columns)
print(edges.columns)
# print(nodes.dtypes)

Index(['last_update', 'pub_key', 'alias', 'addresses', 'color', 'features',
       'custom_records', 'node_key', 'source', 'Total_Channels',
       'Category_Counts', 'Total_Capacity', 'Formatted_Total_Capacity',
       'Avg_Channel_Size', 'Median_Channel_Size', 'Mode_Channel_Size',
       'Min_Channel_Size', 'Max_Channel_Size', 'Capacity_Rank',
       'Channel_Count_Rank', 'Capacity_Percentile'],
      dtype='object')
Index(['channel_id', 'chan_point', 'last_update', 'node1_pub', 'node2_pub',
       'capacity', 'node1_policy', 'node2_policy', 'custom_records',
       'source_key', 'target_key', 'source', 'target', 'Formatted_Capacity',
       'Channel_Size_Tier', 'Channel_Size_Range'],
      dtype='object')


In [22]:
nodes['Total_Capacity'] = pd.to_numeric(nodes['Total_Capacity'], errors='coerce')
nodes['Total_Channels'] = pd.to_numeric(nodes['Total_Channels'], errors='coerce')


# Apply the function to the 'capacity' column and split the results into three new columns
edges[['Formatted_Capacity', 'Channel_Size_Tier', 'Channel_Size_Range']] = edges['capacity'].apply(
    lambda x: pd.Series(process_channel_capacity(x)))

# Apply the function to 'nodes' and expand the results into multiple new columns
results = nodes['node_key'].apply(lambda x: process_node_data(x, edges))
nodes[['Total_Channels', 'Category_Counts','Total_Capacity', 'Formatted_Total_Capacity', 'Avg_Channel_Size', 'Median_Channel_Size', 'Mode_Channel_Size', 'Min_Channel_Size', 'Max_Channel_Size']] = pd.DataFrame(results.tolist(), index=nodes.index)

nodes = rank_and_categorize_nodes(nodes)


In [114]:
# Now let's view the results to ensure it worked as expected
# nodes.drop(['Capacity_Rank', 'Channel_Count_Rank', 'Capacity_Percentile'], axis=1, inplace=True)
# nodes[(nodes['Total_Capacity'] > 0) & (nodes['Total_Channels'] > 0) & (nodes['last_update'] > 0) & (nodes['Capacity_Rank'] < 5)].head(1)

print(edges[['Formatted_Capacity', 'Channel_Size_Tier', 'Channel_Size_Range']].tail(3))

print(nodes[['Total_Capacity', 'Capacity_Rank', 'Capacity_Percentile']].tail(3))

      Formatted_Capacity Channel_Size_Tier Channel_Size_Range
70898          500k sats            My Way            500k-1M
70899         16.8m sats           Highway            10M-25M
70900         16.8m sats           Highway            10M-25M
       Total_Capacity  Capacity_Rank Capacity_Percentile
21810       1020000.0         6504.0          Bottom 60%
21811             NaN            NaN                 NaN
21812             NaN            NaN                 NaN


#we will go with the 2nd defintion. Last update column  for nodes are derived from channel messages that nodes send to each other, so we w'll rely more on channel data than node. Any way, in our definition of capacity and channel, we made sure than channel last update is non zero.



In [124]:
print(len(nodes[nodes['last_update'] > 0]))
print(len(nodes[(nodes['Total_Capacity'] > 0) & (nodes['Total_Channels'] > 0)]))
print(len(nodes[(nodes['Total_Capacity'] > 0) & (nodes['Total_Channels'] > 0) & (nodes['last_update'] > 0)]))
print(len(edges))
print(len(edges[(edges["capacity"] > 0) & (edges["last_update"] > 0)]))

13472
13030
12581
70901
50340


In [126]:
# Filter the DataFrame
filtered_nodes = nodes[(nodes['Total_Capacity'] > 0) & (nodes['Total_Channels'] > 0)]
filtered_edges = edges[(edges["capacity"] > 0) & (edges["last_update"] > 0)]

# Calculate total sums for normalization
total_capacity = filtered_nodes['Total_Capacity'].sum()
total_channels = filtered_nodes['Total_Channels'].sum()
total_nodes = len(filtered_nodes)

capacity_edges = filtered_edges['capacity'].sum()
total_channel_edges = len(filtered_edges)


In [134]:
print(f"Total Nodes: {total_nodes}")
print(f"Total Channels (Node File): {total_channels}")
print(f"Total Channels (node file /2): {total_channels/2}")
print(f"Total Channel Edges: {total_channel_edges}")
print(f"Total Capacity (node file): {total_capacity}")
print(f"Total Capacity (node file /2): {total_capacity/2}")
print(f"Capacity Edges: {capacity_edges}")


Total Nodes: 13030
Total Channels (Node File): 100682
Total Channels (node file /2): 50341.0
Total Channel Edges: 50340
Total Capacity (node file): 1060790227840.0
Total Capacity (node file /2): 530395113920.0
Capacity Edges: 530393413920


In [136]:
filtered_edges.columns

Index(['channel_id', 'chan_point', 'last_update', 'node1_pub', 'node2_pub',
       'capacity', 'node1_policy', 'node2_policy', 'custom_records',
       'source_key', 'target_key', 'source', 'target', 'Formatted_Capacity',
       'Channel_Size_Tier', 'Channel_Size_Range'],
      dtype='object')

In [162]:
from holoviews import opts
# Aggregate edge data by 'Channel_Size_Tier'
aggregated_data = filtered_edges.groupby('Channel_Size_Tier').agg(
    Total_Capacity=('capacity', 'sum'),  # Summing up the capacities
    Edge_Count=('channel_id', 'nunique')  # Counting unique channel IDs
).reset_index()


# Calculate total sums for percentages
total_capacity = aggregated_data['Total_Capacity'].sum()
total_edges = aggregated_data['Edge_Count'].sum()


# Calculate percentages
aggregated_data['% Total Capacity'] = (aggregated_data['Total_Capacity'] / total_capacity) * 100
aggregated_data['% Total Channels'] = (aggregated_data['Edge_Count'] / total_edges) * 100


aggregated_data.rename(columns={
    '% Total Capacity': 'Capacity',
    '% Total Channels': 'Channels'
}, inplace=True)
aggregated_data.reset_index()

In [226]:
## import warnings
warnings.filterwarnings('ignore', category=Warning)


# Create the plot
plot_bychnltier = aggregated_data.hvplot.bar(
    x='Channel_Size_Tier',
    y=['Capacity', 'Channels'],
    stacked=False,
    height=400,
    width=600,
    xlabel='Channel Size Tier',
    ylabel="Percentage",
    title="1.2% of Channels( freeways) contribute to 32% of capacity of LN",
    cmap="bmy",
    hover=False,
    grid=True
   )

# Display the plot
plot_bychnltier

In [186]:
aggregated_data.dtypes

Channel_Size_Tier     object
Total_Capacity         int64
Edge_Count             int64
Capacity             float64
Channels             float64
dtype: object

In [228]:
filtered_nodes.to_parquet('nodes_chap2.parquet')
filtered_edges.to_parquet('edges_chap2.parquet')
