In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
from scipy.stats import spearmanr, kendalltau, weightedtau
import seaborn as sns

In [None]:
from ln_utils import *

In [None]:
%matplotlib inline

# Parameters

# NOTE(review): neither parameter is read by the cells visible in this notebook -- confirm whether they are still needed.
is_directed = True
time_window = 86400*7  # 604800, i.e. seven days if the unit is seconds

# Load temporal data

In [None]:
graph_files = []

In [None]:
# Directory holding the 2019 graph captures; the trailing slash is part of the value.
data_dir = "../LNdata/lncaptures/lngraph/2019/"
# Append every directory entry whose name contains ".json", in sorted filename order.
graph_files.extend(
    os.path.join(data_dir, fname)
    for fname in sorted(os.listdir(data_dir))
    if ".json" in fname
)
# 1549065601 is Saturday, February 2, 2019 12:00:01 AM (UTC); the lower bound is one day (86400 s) before it.
MIN_TIME = 1549065601 - 86400
#MAX_TIME = 1552867201 #Monday, March 18, 2019 12:00:01 AM

In [None]:
data_dir = "../LNdata/"
#graph_files = [data_dir + f for f in sorted(os.listdir(data_dir)) if ".json" in f]
# Append this directory's ".json" entries in sorted filename order, leaving out the first five matches.
graph_files.extend(
    [
        os.path.join(data_dir, fname)
        for fname in sorted(os.listdir(data_dir))
        if ".json" in fname
    ][5:]
)
#MIN_TIME = 1552478399 # Wednesday, March 13, 2019 11:59:59 AM
# Upper bound of the analysed period: Saturday, March 30, 2019 11:59:59 AM.
MAX_TIME = 1553947199

In [None]:
# Keep only the first 10 snapshot paths collected so far.
graph_files = graph_files[:10]
#graph_files

In [None]:
# Edge attributes requested from every snapshot.
EDGE_KEYS = [
    "node1_pub",
    "node2_pub",
    "last_update",
    "capacity",
    "channel_id",
    "node1_policy",
    "node2_policy",
]
# The last entry of graph_files is deliberately left out of the load.
nodes, edges = load_temp_data(
    graph_files[:-1],
    edge_keys=EDGE_KEYS,
)
print(len(nodes), len(edges))

In [None]:
edges.head(3)

In [None]:
# Keep only records whose last_update lies strictly between MIN_TIME and MAX_TIME.
nodes = nodes.loc[
    lambda frame: frame["last_update"].gt(MIN_TIME) & frame["last_update"].lt(MAX_TIME)
]
edges = edges.loc[
    lambda frame: frame["last_update"].gt(MIN_TIME) & frame["last_update"].lt(MAX_TIME)
]
len(nodes), len(edges)

In [None]:
# Order updates chronologically: discover_changes walks the rows in order and
# diffs each capacity against the previous value seen for the same channel.
edges = edges.sort_values("last_update")

In [None]:
edges.isnull().sum()

In [None]:
edges.iloc[0]["node1_policy"]

In [None]:
def discover_changes(edge_updates_df):
    """Replay edge updates in row order and extract channel and policy events.

    Parameters
    ----------
    edge_updates_df : pandas.DataFrame
        One row per observed edge update, with the columns ``node1_pub``,
        ``node2_pub``, ``channel_id``, ``last_update``, ``capacity``,
        ``node1_policy`` and ``node2_policy``. Rows are processed in their
        existing order, so the caller should sort them by time beforehand.
        Neither the frame nor the policy dicts stored in it are modified.

    Returns
    -------
    channel_events_df : pandas.DataFrame
        One row per input row with columns ``time``, ``channel_id``,
        ``is_new`` (first time the channel id is seen), ``capacity`` (float)
        and ``cap_diff`` (capacity minus the previous capacity recorded for
        the same channel; 0.0 on first appearance).
    channel_nodes : dict
        ``channel_id -> (node1_pub, node2_pub)`` taken from the row in which
        the channel first appears.
    policy_events_df : pandas.DataFrame
        One row per policy dict found in ``node1_policy`` / ``node2_policy``,
        holding the policy's own keys plus ``node``, ``channel_id``,
        ``new_channel`` and ``time``.
    """
    # Cast into a separate Series so the caller's frame keeps its original column.
    capacities = edge_updates_df["capacity"].astype("float64")
    channel_state = {}   # channel_id -> most recent capacity seen
    channel_nodes = {}   # channel_id -> (node1_pub, node2_pub)
    channel_events = []
    policy_events = []
    # Column-wise zip avoids building one Series per row as iterrows() does.
    rows = zip(
        edge_updates_df["node1_pub"],
        edge_updates_df["node2_pub"],
        edge_updates_df["channel_id"],
        edge_updates_df["last_update"],
        capacities,
        edge_updates_df["node1_policy"],
        edge_updates_df["node2_policy"],
    )
    for n1p, n2p, chan_id, last_update, cap, n1_pol, n2_pol in rows:
        # channel events
        is_new_channel = chan_id not in channel_state
        if is_new_channel:
            channel_nodes[chan_id] = (n1p, n2p)
            cap_change = 0.0
        else:
            cap_change = cap - channel_state[chan_id]
        channel_state[chan_id] = cap
        channel_events.append([last_update, chan_id, is_new_channel, cap, cap_change])
        # policy events
        for node, policy in ((n1p, n1_pol), (n2p, n2_pol)):
            # A missing policy may be None or a float NaN; only real dicts carry data.
            if not isinstance(policy, dict):
                continue
            # Work on a copy so the dict held by the input frame is not altered.
            event = dict(policy)
            event["node"] = node
            event["channel_id"] = chan_id
            event["new_channel"] = is_new_channel
            event["time"] = last_update
            policy_events.append(event)
    channel_events_df = pd.DataFrame(
        channel_events,
        columns=["time", "channel_id", "is_new", "capacity", "cap_diff"],
    )
    return channel_events_df, channel_nodes, pd.DataFrame(policy_events)

In [None]:
events, channel_nodes, policy_events_df = discover_changes(edges)

In [None]:
events.head()

In [None]:
# Undirected simple graph with one edge per pair of channel endpoints.
# NOTE(review): the is_directed parameter defined earlier is not consulted here.
G = nx.Graph()
G.add_edges_from(channel_nodes.values())

In [None]:
G.number_of_nodes(), G.number_of_edges()

In [None]:
# Per-node measures on the channel graph, each stored as a {node: value} mapping.
degs = {node: degree for node, degree in G.degree()}
pr = nx.pagerank(G)
betw = nx.betweenness_centrality(G)

# Policy Changes

# Drop policy events flagged as disabled.
# NOTE(review): `~` assumes "disabled" is a boolean column with no missing values -- confirm.
policy_events_df = policy_events_df[~policy_events_df["disabled"]]

In [None]:
print(policy_events_df.shape)
policy_events_df.head()

### Number of times the policy was changed for a channel node

In [None]:
#only_pol_updates = policy_events_df
only_pol_updates = policy_events_df[~policy_events_df["new_channel"]]

In [None]:
len(policy_events_df), len(only_pol_updates)

In [None]:
#{"fee_base_msat":1000,"fee_rate_milli_msat":1,"min_htlc":1000}
pricing_cols = ["fee_base_msat","fee_rate_milli_msat","min_htlc"]
# Cast the pricing fields to float on BOTH frames. only_pol_updates was copied
# out of policy_events_df before this cell, so casting only the parent would
# leave the per-node means below running on the un-cast values.
# astype() returns new frames, which also avoids assigning columns on a
# filtered slice and makes the cell safe to re-run.
pricing_dtypes = dict.fromkeys(pricing_cols, "float64")
policy_events_df = policy_events_df.astype(pricing_dtypes)
only_pol_updates = only_pol_updates.astype(pricing_dtypes)

In [None]:
# Average each pricing field over all policy updates issued by a node.
node_mean_pricing = (
    only_pol_updates
    .groupby("node")[pricing_cols]
    .agg("mean")
    .reset_index()
)

node_mean_pricing["min_htlc"].value_counts()

In [None]:
# Count policy updates per (node, channel) pair; the count lands in the "time" column.
node_chan_changes = (
    only_pol_updates
    .groupby(["node", "channel_id"])[["time"]]
    .count()
    .reset_index()
)

In [None]:
node_chan_changes["time"].value_counts()[:10]

In [None]:
# Mean number of policy updates per channel for every node ...
node_stats = (
    node_chan_changes
    .groupby("node")["time"]
    .mean()
    .rename("mean_num_policy_changes")
    .reset_index()
)
# ... joined with graph centralities (0.0 for nodes absent from the graph)
# and the node's mean pricing, indexed by node.
node_stats = (
    node_stats
    .assign(
        degree=node_stats["node"].map(lambda node: degs.get(node, 0.0)),
        pr=node_stats["node"].map(lambda node: pr.get(node, 0.0)),
        betw=node_stats["node"].map(lambda node: betw.get(node, 0.0)),
    )
    .merge(node_mean_pricing, on="node", how="left")
    .set_index("node")
)
node_stats.head()

In [None]:
node_stats.corr(method="spearman")

In [None]:
sns.jointplot(data=node_stats, x="mean_num_policy_changes", y="degree")

# Channel changes

In [None]:
events["is_new"].value_counts()

In [None]:
(events["cap_diff"] != 0).value_counts()

In [None]:
events.head()

events.to_csv("channel_events.csv", index=False)

# Link prediction

# TODO: predict the change in channel capacity!!!

In [None]:
import alpenglow

In [None]:
# Rows of the event log in which a channel id appears for the first time.
new_channels = events[events["is_new"]]
new_channels.shape

# Earliest new-channel timestamp and the point 86400 s (one day) after it.
# NOTE(review): neither value is read by the cells visible in this notebook --
# confirm whether a time-based split was intended.
start = new_channels["time"].min()
split = start + 86400

In [None]:
# Each new channel yields two timestamped records, one per direction.
link_pred_edges = []
for chan_id, t in zip(new_channels["channel_id"], new_channels["time"]):
    n1, n2 = channel_nodes[chan_id]
    link_pred_edges.extend([(n1, n2, t), (n2, n1, t)])

In [None]:
links_df = pd.DataFrame(link_pred_edges, columns=["user","item","time"])

In [None]:
# Map node keys to dense integer ids in sorted-key order. Enumerating a raw set
# of strings follows hash order, which changes between kernel restarts, so the
# seeded experiment below would otherwise see different ids on every run.
# A separate name is used so the `nodes` DataFrame loaded earlier is not replaced.
link_nodes = sorted(set(links_df["user"]).union(links_df["item"]))
recoder = {node: code for code, node in enumerate(link_nodes)}
links_df["user"] = links_df["user"].map(recoder)
links_df["item"] = links_df["item"].map(recoder)

In [None]:
from alpenglow.experiments import FactorExperiment
from alpenglow.evaluation import DcgScore

# Run alpenglow's FactorExperiment on the (user, item, time) channel records.
# NOTE(review): the parameter meanings noted below are inferred from their
# names -- confirm against the alpenglow documentation.
factor_model_experiment = FactorExperiment(
    top_k=100,  # presumably the length of the ranked list per evaluated record
    seed=254938879,
    dimension=10,  # presumably the number of latent factors
    learning_rate=0.14,
    negative_rate=100  # presumably negative samples drawn per positive record
)

rankings = factor_model_experiment.run(links_df, exclude_known=False, verbose=True)
# Attach the DCG score computed from the ranking output as a new column.
rankings['dcg'] = DcgScore(rankings)

### The mean DCG is a bit high, which is no surprise. :)

In [None]:
rankings['dcg'].mean()

In [None]:
day = 86400  # bucket width: one day when rankings['time'] holds Unix timestamps in seconds
# Bucket records by whole days elapsed since the first one and average DCG per bucket.
averages = rankings['dcg'].groupby((rankings['time']-rankings['time'].min())//day).mean()
plt.plot(averages)
# Label the axes so the figure can be read on its own.
plt.xlabel("days since first evaluated record")
plt.ylabel("mean DCG")
plt.title("Daily mean DCG");

In [None]:
# Number of evaluated records per day bucket (same bucketing as the mean-DCG plot; `day` comes from that cell).
cnt = rankings['dcg'].groupby((rankings['time']-rankings['time'].min())//day).count()
plt.plot(cnt)
# Label the axes so the figure can be read on its own.
plt.xlabel("days since first evaluated record")
plt.ylabel("number of evaluated records")
plt.title("Daily number of evaluated records");