In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
from scipy.stats import spearmanr, kendalltau, weightedtau
import seaborn as sns

In [None]:
from ln_utils import *

In [None]:
%matplotlib inline

# Parameters

# Global notebook parameters.
time_window = 7 * 86400  # seven days expressed in seconds
is_directed = True

# Load temporal data

In [None]:
# Accumulator for the snapshot file paths collected by the cells below.
graph_files = list()

In [None]:
# Queue every file of the 2019 capture directory whose name contains ".json",
# in sorted file-name order.
data_dir = "../LNdata/lncaptures/lngraph/2019/"
graph_files.extend(
    data_dir + file_name
    for file_name in sorted(os.listdir(data_dir))
    if ".json" in file_name
)
# Lower time bound: one day before Saturday, February 2, 2019 12:00:01 AM
MIN_TIME = 1549065601 - 86400
#MAX_TIME = 1552867201 #Monday, March 18, 2019 12:00:01 AM

In [None]:
# Queue the ".json" files stored directly under ../LNdata/, skipping the first
# five in sorted file-name order.
data_dir = "../LNdata/"
#graph_files = [data_dir + f for f in sorted(os.listdir(data_dir)) if ".json" in f]
graph_files.extend(
    [
        data_dir + file_name
        for file_name in sorted(os.listdir(data_dir))
        if ".json" in file_name
    ][5:]
)
#MIN_TIME = 1552478399 # Wednesday, March 13, 2019 11:59:59 AM
# Upper time bound: Saturday, March 30, 2019 11:59:59 AM
MAX_TIME = 1553947199

In [None]:
# Optional cap on the number of snapshot files to load (None keeps all of them);
# small values such as 10 or 20 are handy for quick trial runs.
K = None#20#10
# Identity test: `is not None` is the idiomatic None check and cannot be
# affected by a custom __eq__/__ne__.
if K is not None:
    graph_files = graph_files[:K]
#graph_files

In [None]:
# Edge attributes requested from every snapshot.
EDGE_KEYS = [
    "node1_pub",
    "node2_pub",
    "last_update",
    "capacity",
    "channel_id",
    "node1_policy",
    "node2_policy",
]
# Every queued file except the last one is loaded.
nodes, edges = load_temp_data(graph_files[:-1], edge_keys=EDGE_KEYS)
print(len(nodes), len(edges))

In [None]:
# Peek at the first three loaded edge records.
edges.head(3)

In [None]:
# Keep only records whose last_update lies strictly between MIN_TIME and MAX_TIME.
nodes = nodes.loc[lambda frame: (frame["last_update"] > MIN_TIME) & (frame["last_update"] < MAX_TIME)]
edges = edges.loc[lambda frame: (frame["last_update"] > MIN_TIME) & (frame["last_update"] < MAX_TIME)]
len(nodes), len(edges)

In [None]:
# Order the edge updates chronologically and renumber the rows 0..n-1.
edges = edges.sort_values(by="last_update")
edges = edges.reset_index(drop=True)

In [None]:
# Number of missing values per edge column.
edges.isnull().sum()

In [None]:
# Inspect the node1_policy entry of the first row (the earliest update after the sort above).
edges.iloc[0]["node1_policy"]

# Extract homophily and new channels

- the time of a channel (edge) event is its `last_update` timestamp
- assumption: the first occurrence of a channel marks its creation time, i.e. the **first `last_update` value**

In [None]:
# Number of edge update records inside the time window.
len(edges)

### Pre-filtering for duplicated channel ids

- we keep only the first occurrence of each channel

In [None]:
# Report the row count before and after keeping only the first record of each channel_id.
print(len(edges))
is_repeated_channel = edges.duplicated(subset=["channel_id"], keep="first")
edges = edges[~is_repeated_channel]
print(len(edges))

In [None]:
from tqdm import tqdm

def _annotate_policy(policy, node, chan_id, is_new_channel, last_update):
    """Return a copy of ``policy`` tagged with event metadata, or None when it is missing."""
    # A missing policy can show up as None or, after pandas type inference, as a float NaN.
    if policy is None or (isinstance(policy, float) and policy != policy):
        return None
    event = dict(policy)  # copy, so the dict stored in the input frame is left untouched
    event["node"] = node
    event["channel_id"] = chan_id
    event["new_channel"] = is_new_channel
    event["time"] = last_update
    return event


def discover_changes(edge_updates_df):
    """Replay edge updates in row order and derive channel and policy events.

    Parameters
    ----------
    edge_updates_df : pandas.DataFrame
        Must provide the columns ``node1_pub``, ``node2_pub``, ``channel_id``,
        ``last_update``, ``capacity``, ``node1_policy`` and ``node2_policy``.
        Rows are processed in their current order, so the caller is expected
        to have sorted them by time. The frame is not modified.

    Returns
    -------
    channel_events_df : pandas.DataFrame
        One row per update with the columns ``time``, ``channel_id``,
        ``new_channel`` (first time this channel id is seen), ``new_edge``
        (first time this unordered node pair is seen), ``homophily`` (both
        endpoints were seen in an earlier update), ``capacity`` and
        ``cap_diff`` (change against the channel's previous capacity, 0 for
        a new channel).
    channel_nodes : dict
        Maps each channel id to the ``(node1_pub, node2_pub)`` pair of its
        first update.
    policy_events_df : pandas.DataFrame
        One row per present policy, holding the policy fields plus ``node``,
        ``channel_id``, ``new_channel`` and ``time``.
    """
    # Cast into a separate Series instead of writing back into the caller's
    # frame (which is usually a filtered slice of a bigger frame).
    capacities = edge_updates_df["capacity"].astype("float64")
    channel_state = {}   # channel id -> last seen capacity
    channel_nodes = {}   # channel id -> endpoints of its first update
    channel_events = []
    policy_events = []
    seen_nodes, seen_edges = set(), set()
    # Iterate over the columns in lock-step; this avoids building one Series
    # per row and does not depend on the index being unique.
    updates = zip(
        edge_updates_df["node1_pub"],
        edge_updates_df["node2_pub"],
        edge_updates_df["channel_id"],
        edge_updates_df["last_update"],
        capacities,
        edge_updates_df["node1_policy"],
        edge_updates_df["node2_policy"],
    )
    for n1p, n2p, chan_id, last_update, cap, n1_pol, n2_pol in tqdm(
        updates, total=len(edge_updates_df), mininterval=10
    ):
        # channel events
        is_new_channel = chan_id not in channel_state
        edge_key = frozenset((n1p, n2p))  # direction-independent node pair
        is_new_edge = edge_key not in seen_edges
        seen_edges.add(edge_key)
        is_homophily = n1p in seen_nodes and n2p in seen_nodes
        seen_nodes.update((n1p, n2p))
        if is_new_channel:
            cap_change = 0
            channel_nodes[chan_id] = (n1p, n2p)
        else:
            cap_change = cap - channel_state[chan_id]
        channel_state[chan_id] = cap
        channel_events.append([last_update, chan_id, is_new_channel, is_new_edge, is_homophily, cap, cap_change])
        # policy events (node1 first, then node2)
        for node, policy in ((n1p, n1_pol), (n2p, n2_pol)):
            event = _annotate_policy(policy, node, chan_id, is_new_channel, last_update)
            if event is not None:
                policy_events.append(event)
    channel_events_df = pd.DataFrame(channel_events, columns=["time","channel_id","new_channel","new_edge","homophily","capacity","cap_diff"])
    return channel_events_df, channel_nodes, pd.DataFrame(policy_events)

In [None]:
# Replay the edge updates: yields the channel event table, the channel id -> endpoints
# mapping and the policy event table.
events, channel_nodes, policy_events_df = discover_changes(edges)

In [None]:
# First rows of the channel event table.
events.head()

In [None]:
# Undirected graph with one edge per channel endpoint pair.
G = nx.Graph()
channel_endpoint_pairs = list(channel_nodes.values())
_ = G.add_edges_from(channel_endpoint_pairs)

In [None]:
# Size of the channel graph as (nodes, edges).
G.number_of_nodes(), G.number_of_edges()

In [None]:
# How many events introduce a previously unseen channel id.
events["new_channel"].value_counts()

In [None]:
# How many events connect a previously unconnected node pair.
events["new_edge"].value_counts()

In [None]:
# How many events link two nodes that were both seen in earlier events.
events["homophily"].value_counts()

In [None]:
# Persist the channel events; the file name carries the snapshot limit K.
# NOTE(review): absolute, machine-specific output path - consider a configurable directory.
events.to_csv(
    "/mnt/idms/fberes/data/bitcoin_ln_research/directed_graphs/channel_events_%s.csv" % str(K),
    index=False,
)

# Link prediction

## Filter for homophily edges

In [None]:
# Keep events that open a new channel on a new node pair whose endpoints were both seen before.
new_channels = events.loc[lambda ev: ev["new_channel"] & ev["new_edge"] & ev["homophily"]]
new_channels.shape

# Time of the earliest such channel, and the moment one day (86400 s) later.
start = new_channels["time"].min()
split = 86400 + start

## Train and evaluate link prediction with bi-directional edges

# TODO: this step will boost the online and time-decay-based methods! Should we train these models in two steps instead?

In [None]:
# One uniform draw in [0, 1) per channel; it decides which orientation of the
# channel is marked for evaluation in the next cell.
# - A dedicated, seeded generator keeps the orientation (and every downstream
#   score) reproducible across kernel restarts and cell re-runs.
# - .assign returns a new frame, so nothing is written onto the filtered slice
#   of `events` (avoids SettingWithCopyWarning).
rnd_generator = np.random.RandomState(254938879)
new_channels = new_channels.assign(rnd=rnd_generator.random_sample(size=len(new_channels)))

In [None]:
# Emit both orientations of every channel at its timestamp; the orientation
# selected by the random draw gets eval=1, its reverse gets eval=0.
link_pred_edges = []
for chan_id, t, draw in zip(new_channels["channel_id"], new_channels["time"], new_channels["rnd"]):
    src, dst = channel_nodes[chan_id]
    if not (draw < 0.5):
        src, dst = dst, src
    link_pred_edges.extend([(src, dst, t, 1), (dst, src, t, 0)])

In [None]:
# Interaction table for the recommender experiments: source node ("user"), target node
# ("item"), timestamp, and the eval flag (1 for the randomly chosen orientation, 0 for its reverse).
links_df = pd.DataFrame(link_pred_edges, columns=["user","item","time","eval"])

In [None]:
# Re-encode node identifiers as consecutive integers 0..n-1.
# NOTE(review): this rebinds `nodes`, which earlier held the node table.
nodes = set(links_df["user"]).union(set(links_df["item"]))
# Enumerate in sorted order: set iteration order is not stable between
# interpreter sessions for string keys (hash randomization), which would give
# every run a different encoding and defeat the fixed experiment seed.
recoder = {node: code for code, node in enumerate(sorted(nodes))}
links_df["user"] = links_df["user"].map(recoder)
links_df["item"] = links_df["item"].map(recoder)

### i.) Train models

In [None]:
# Settings shared by all experiments below.
seed = 254938879   # random seed passed to every experiment
k = 20#100         # passed as top_k
dim = 10           # passed as dimension to the factor models
neg_rate = 100     # passed as negative_rate to the factor models
ex_known = False   # passed as exclude_known to run()

In [None]:
from alpenglow.experiments import FactorExperiment, BatchFactorExperiment, BatchAndOnlineFactorExperiment, PopularityExperiment, PopularityTimeframeExperiment
from alpenglow.evaluation import DcgScore

In [None]:
# FactorExperiment run (labelled "online" in the evaluation section); a DCG column is attached.
factor_model_experiment = FactorExperiment(
    top_k=k, seed=seed, dimension=dim, learning_rate=0.14, negative_rate=neg_rate
)
on_rankings = factor_model_experiment.run(links_df, verbose=True, exclude_known=ex_known)
on_rankings["dcg"] = DcgScore(on_rankings)

In [None]:
# BatchFactorExperiment run (labelled "batch" in the evaluation section); a DCG column is attached.
batch_model_experiment = BatchFactorExperiment(
    top_k=k, seed=seed, dimension=dim, learning_rate=0.05, negative_rate=neg_rate
)
off_rankings = batch_model_experiment.run(links_df, verbose=True, exclude_known=ex_known)
off_rankings["dcg"] = DcgScore(off_rankings)

In [None]:
# BatchAndOnlineFactorExperiment run (labelled "batch+online" in the evaluation section); a DCG column is attached.
onoff_model_experiment = BatchAndOnlineFactorExperiment(
    top_k=k, seed=seed, dimension=dim, learning_rate=0.14, negative_rate=neg_rate
)
onoff_rankings = onoff_model_experiment.run(links_df, verbose=True, exclude_known=ex_known)
onoff_rankings["dcg"] = DcgScore(onoff_rankings)

In [None]:
# PopularityExperiment run (labelled "pop" in the evaluation section); a DCG column is attached.
pop_experiment = PopularityExperiment(top_k=k, seed=seed)
pop_rankings = pop_experiment.run(links_df, verbose=True, exclude_known=ex_known)
pop_rankings["dcg"] = DcgScore(pop_rankings)

In [None]:
# PopularityTimeframeExperiment run (labelled "pop+time" in the evaluation section); a DCG column is attached.
pop_t_experiment = PopularityTimeframeExperiment(top_k=k, seed=seed)
pop_t_rankings = pop_t_experiment.run(links_df, verbose=True, exclude_known=ex_known)
pop_t_rankings["dcg"] = DcgScore(pop_t_rankings)

### ii.) Evaluate

In [None]:
# Result tables and their display labels, kept in matching order.
rankings = [
    on_rankings,
    off_rankings,
    onoff_rankings,
    pop_rankings,
    pop_t_rankings,
]
labels = ["online", "batch", "batch+online", "pop", "pop+time"]

### a.) average performance (online DCG)

The average performance of the offline batch model is misleading: it performs poorly only on the first day.

In [None]:
# Mean DCG per model, in the order of `labels`.
[df["dcg"].mean() for df in rankings]

In [None]:
# Number of ranking records per model, in the order of `labels`.
[len(df) for df in rankings]

### b.) Performance over time

In [None]:
# Bucket width in seconds. NOTE: despite its name, `day` spans seven days.
day = 86400*7

# The loop variable must not be called `rankings`: that would rebind the list
# to its last DataFrame, so re-running this cell (or the summary cells above)
# would iterate over column names instead of the result tables.
for label, ranking_df in zip(labels, rankings):
    # Index of the bucket each record falls into, counted from the table's first timestamp.
    elapsed_buckets = (ranking_df['time'] - ranking_df['time'].min()) // day
    averages = ranking_df['dcg'].groupby(elapsed_buckets).mean()
    plt.plot(averages, label=label)
plt.legend()

### c.) Number of records over time

In [None]:
# Number of evaluated records per time bucket (bucket width `day` comes from the previous cell).
elapsed_buckets = (on_rankings['time'] - on_rankings['time'].min()) // day
cnt = on_rankings.groupby(elapsed_buckets)['dcg'].count()
plt.plot(cnt)