In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
from ln_utils import *

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Parameters

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../", "LNGraph", sys.argv)

In [None]:
# Root data directory, read from the notebook parameters held by the ParamHelper instance.
data_dir = ph.get("data_dir")
# Sub-folder passed to load_centrality_scores below; note the trailing slash in the path.
stat_dir = "%s/centrality_scores/" % data_dir
print(stat_dir)

# Load data

## a.) Graph data

In [None]:
# Edge list read from directed_temporal_edges.csv under data_dir; it is split into snapshots further down.
# NOTE(review): `pd` is never imported explicitly in this notebook — presumably it is
# re-exported by `from ln_utils import *`; confirm, or add `import pandas as pd`.
edges = pd.read_csv("%s/directed_temporal_edges.csv" % data_dir)

## b.) Precomputed centrality scores

In [None]:
# Snapshot positions used throughout the notebook. range(9) is the commented-out
# alternative; the notebook later drops the last (much smaller) snapshot.
snapshot_ids = range(8)  # range(9)

In [None]:
# Keys of the score/rank tables. The leading None is presumably the unweighted
# case and the rest are edge attributes used as weights — TODO confirm in ln_utils.
weight_cols = [None, "capacity", "fee_base_msat", "fee_rate_milli_msat"]

In [None]:
# Precomputed scores, indexed below as centrality_scores[weight][snapshot_id].
# drop_cols=["deg"] is forwarded to the loader — presumably it removes the "deg" column (TODO confirm).
centrality_scores = load_centrality_scores(stat_dir, snapshot_ids, weight_cols, drop_cols=["deg"])

In [None]:
def calculate_ranks(scores, snapshot_ids, weight_cols):
    """Convert centrality score tables into rank tables.

    Parameters
    ----------
    scores : mapping
        ``scores[weight][snapshot_id]`` must be a DataFrame with an ``"index"``
        column (the node identifier) and one column per centrality measure.
    snapshot_ids : iterable
        Positions/keys of the snapshots to process for every weight.
    weight_cols : iterable
        Keys of ``scores`` to process.

    Returns
    -------
    dict
        ``{weight: [rank_table, ...]}`` with one table per entry of
        ``snapshot_ids`` (in the same order). Each table has the node
        identifier in a ``"node_pub"`` column and, for every measure, the
        descending rank of the score (rank 1 = highest score; ties get the
        average rank, which is the pandas default).
    """

    def _to_rank_table(score_table):
        # Rank only the measure columns: the node identifier is moved into the
        # index first and restored as a regular column afterwards.
        by_node = score_table.set_index("index")
        ranked = by_node.rank(ascending=False)
        return ranked.reset_index().rename(columns={"index": "node_pub"})

    return {
        weight: [_to_rank_table(scores[weight][snap_id]) for snap_id in snapshot_ids]
        for weight in weight_cols
    }

In [None]:
# Rank tables per weight and snapshot, derived from the scores (rank 1 = highest score).
centrality_ranks = calculate_ranks(centrality_scores, snapshot_ids, weight_cols)

# Calculate centrality ranks of popular nodes

   * **popular nodes** are those (already seen nodes) that new nodes connect to most frequently

In [None]:
edges.head()

In [None]:
# weight_cols[1:] skips the leading None entry, so only the named edge attributes are passed on.
snapshot_graphs, snapshot_edges = get_snapshots(edges, weight_cols[1:])

### last snapshot is much smaller!!!

In [None]:
# Print the length of every snapshot's edge collection to show the size difference.
for snap in snapshot_edges:
    print(len(snap))

# Drop the last snapshot from both lists.
# NOTE(review): the same names are rebound, so this cell is not idempotent —
# every re-run removes one more snapshot. Restart & Run All to get a clean state.
snapshot_graphs = snapshot_graphs[:-1]
snapshot_edges = snapshot_edges[:-1]

In [None]:
# Result is used below as a sequence with one table-like entry per snapshot
# (attachments[0].head(), len(att)); presumably the new-node attachments — see ln_utils.
attachments = observe_node_attachements_over_time(snapshot_ids, snapshot_graphs, snapshot_edges, centrality_ranks, weight_cols)

# Analyse ranks of popular nodes

In [None]:
attachments[0].head()

## a.) New attachments per week

   * there is a huge peak on week 5!!!

In [None]:
# Size of each snapshot's attachment table, plotted against the snapshot position.
x = range(len(attachments))
y = [len(att) for att in attachments]

# Explicit figure/axes plus labels so the figure can be read on its own;
# the trailing semicolon hides the repr of the last call.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_title("New attachments per snapshot")
ax.set_xlabel("snapshot index (0-based)")
ax.set_ylabel("number of new attachments");

## b.) Weekly correlations of popular nodes

In [None]:
# Popularity table built from the attachments; it feeds the correlation matrices and the
# node ranking below (presumably one row per node and one column per snapshot — TODO confirm).
pop_df = get_attachement_popularity(attachments)

In [None]:
# Correlation matrix of the popularity columns; titled so the three heatmaps can be told apart.
ax = sns.heatmap(corr_mx(pop_df, "spearman"), annot=True)
ax.set_title("Popularity correlation between snapshots (spearman)");

In [None]:
# Correlation matrix of the popularity columns; titled so the three heatmaps can be told apart.
ax = sns.heatmap(corr_mx(pop_df, "wkendall"), annot=True)
ax.set_title("Popularity correlation between snapshots (wkendall)");

In [None]:
# Correlation matrix of the popularity columns; titled so the three heatmaps can be told apart.
ax = sns.heatmap(corr_mx(pop_df, "kendall"), annot=True)
ax.set_title("Popularity correlation between snapshots (kendall)");

## c.) Weekly correlations of popular and most central nodes

In [None]:
# One correlation table per method, computed with identical arguments and
# evaluated in the order spearman -> kendall -> wkendall.
sp, ke, wk = (
    pop_corr_with_centralities(pop_df, centrality_scores, weight_cols, method=corr_method)
    for corr_method in ("spearman", "kendall", "wkendall")
)

### i.) mean correlation for the observed weeks

#### Observations

   - popular nodes correlate the most with capacity-weighted betweenness nodes (and high-degree nodes)
   - our suggestion would be to connect to nodes ranking high in 'betw_fee_base_msat' and 'betw_fee_rate_milli_msat' - **but could these nodes raise their fees later?**
   - **'betw_fee_base_msat' is in top2 for kendall!!!** - interesting

In [None]:
sp.mean(axis=0).sort_values(ascending=False)

In [None]:
wk.mean(axis=0).sort_values(ascending=False)

In [None]:
ke.mean(axis=0).sort_values(ascending=False)

### ii.) weekly correlation timeseries

In [None]:
# Column names are built as <measure><suffix>; the empty suffix selects the plain column.
weight_suffixes = ["", "_capacity", "_fee_base_msat", "_fee_rate_milli_msat"]

# One call per centrality measure, covering all four weight variants of that measure.
for c in ["betw", "in_deg", "out_deg", "pr"]:
    measure_columns = [c + suffix for suffix in weight_suffixes]
    plot_corr_time_series_with_pop(sp, ke, wk, measure_columns)

### Most popular nodes:

In [None]:
pop_df.head()

In [None]:
# Mean descending rank across the popularity columns (rank 1 = most popular in a
# column); the 50 nodes with the smallest mean rank are kept.
# The "most_pop" column is excluded on purpose: a later cell adds it to pop_df, and
# without this guard re-running the cell would feed the previous result back in.
most_pop = (
    pop_df.drop(columns=["most_pop"], errors="ignore")
    .rank(ascending=False)
    .mean(axis=1)
    .sort_values()
    .iloc[:50]
)

In [None]:
most_pop.head(10)

In [None]:
# Adds the mean rank as a new column, aligned on the index: most_pop only holds the
# 50 selected nodes, so every other row gets NaN.
# NOTE(review): this mutates pop_df in place, so cells that read pop_df see a different
# frame depending on whether this cell has already run.
pop_df["most_pop"] = most_pop

In [None]:
# Export the table ordered by the "most_pop" column; reset_index() turns the index into
# a regular column and index=False keeps the new positional index out of the file.
# NOTE(review): the output path is absolute and machine-specific — consider deriving it
# from data_dir (or another notebook parameter) so the notebook runs elsewhere.
pop_df.sort_values("most_pop").reset_index().to_csv("/mnt/idms/fberes/data/bitcoin_ln_research/most_pop_nodes.csv", index=False)

In [None]:
def get_cent_rank(node, cent, weight, idx, ranks=None):
    """Return the rank of ``node`` for one centrality measure in one snapshot.

    Parameters
    ----------
    node : hashable
        Node identifier, looked up in the ``"node_pub"`` column of the rank table.
    cent : str
        Name of the centrality column to read (e.g. ``"betw"``).
    weight : hashable
        Key selecting the list of rank tables (``None`` is a valid key).
    idx : int
        Position of the snapshot inside that list.
    ranks : mapping, optional
        ``{weight: [rank_table, ...]}``. Defaults to the notebook-level
        ``centrality_ranks`` so existing four-argument calls keep working.

    Returns
    -------
    The stored rank value, or ``-1`` when the weight, snapshot, node or
    column cannot be found.

    Notes
    -----
    Only lookup failures (``KeyError`` / ``IndexError``) are mapped to ``-1``.
    The previous ``return`` inside ``finally`` silently discarded *every*
    exception, including ``KeyboardInterrupt`` and genuine programming errors.
    """
    if ranks is None:
        ranks = centrality_ranks
    try:
        return ranks[weight][idx].set_index("node_pub").loc[node, cent]
    except (KeyError, IndexError):
        # Unknown weight / node / column, or snapshot position out of range.
        return -1

### Betweenness ranks of most popular nodes on weeks 1-5-8:

#### Huge gain in betweenness:  (ALREADY CHANGED!!!)

- 27 - 8 - 8 (5th)
- 157 - 6 - 9 (11th)
- 25 - 16 - 14 (16th)
- 51 - 30 - 22 (21st)

etc.

#### Ideas:

   - Use node2vec to classify the nodes whose betweenness improves a lot?

In [None]:
c_key = "betw"

# For every selected node print: node id, its 1-based position in most_pop, and its
# rank at snapshot positions 0, 4 and 7. First pass uses the None weight, second
# pass the fee-rate weight; "capacity" and "fee_base_msat" are the other available keys.
for weight in (None, "fee_rate_milli_msat"):
    for idx, node in enumerate(most_pop.index):
        ranks_at = tuple(get_cent_rank(node, c_key, weight, snap_pos) for snap_pos in (0, 4, 7))
        print(node, idx + 1, "# %i - %i - %i" % ranks_at)