In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
from alpenglow.evaluation import DcgScore

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
import sys
from datawand.parametrization import ParamHelper
ph = ParamHelper('..', 'LinkPrediction', sys.argv)

In [None]:
links_df = pd.read_csv("/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/data/links_df_20.csv")

In [None]:
links_df.head()

# Parameters

In [None]:
K = 20#None#30#ph.get("top_first_days")
top_k = 500#100

In [None]:
# Width of one evaluation timeframe. 86400 is the number of seconds in a day
# (assumes `time` is a Unix timestamp, as the START_TIME constant suggests).
if K is None:
    delta_t = 86400*7  # weekly buckets when no K is configured
else:
    delta_t = 86400  # daily buckets

In [None]:
models = [
    "onmf_dim10_lr0.140_nr100",
    #"bomf_dim10_lr0.140_nr100",
    #"offmf_dim10_lr0.050_nr100",
    "pop",
    "time_pop"
]

# Rankings

In [None]:
ranking_dir = "/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/rankings/topk%i_exkTrue_%s/" % (top_k,str(K))

In [None]:
rankings = [pd.read_csv("%s/%s.csv" % (ranking_dir,m)) for m in models]

In [None]:
[len(df) for df in rankings]

In [None]:
rankings[0]['time'].min()

In [None]:
START_TIME = 1548982800 # (GMT): Friday, February 1, 2019 1:00:00 AM

In [None]:
def get_timeframe(df, delta_t, min_time=1548982800):
    """Add a ``timeframe`` bucket column to ``df`` in place.

    Every value of ``df["time"]`` is mapped to the number of whole ``delta_t``
    intervals elapsed since ``min_time``. Rows earlier than ``min_time`` are
    clamped into bucket 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``time`` column. It is mutated: a ``timeframe``
        column is added or overwritten.
    delta_t : int
        Bucket width, in the same unit as ``time``.
    min_time : int, optional
        Start of bucket 0. The default is the same value as the notebook's
        ``START_TIME`` constant.

    Returns
    -------
    None

    Notes
    -----
    The computation is vectorised, so a missing ``time`` value stays missing
    (NaN) in ``timeframe`` instead of being forced into bucket 0.
    """
    df["timeframe"] = ((df["time"] - min_time) // delta_t).clip(lower=0)

# Tag every loaded ranking frame with its timeframe bucket (in place).
for ranking in rankings:
    get_timeframe(ranking, delta_t)

# Results

### a.) average performance (online DCG)

The average performance for the offline batch model is confusing (it is only bad on the first day)

In [None]:
def show_mean_dcg(with_first_day=True):
    """Tabulate the mean DCG of every model in the global ``rankings`` list.

    Parameters
    ----------
    with_first_day : bool
        When False, rows whose ``timeframe`` is 0 are left out of the mean.

    Returns
    -------
    pandas.DataFrame
        Columns ``model`` and ``dcg``, sorted by ``dcg`` in descending order.

    Side effect: prints the number of rows of the first ranking frame that
    take part in the average.
    """
    def _selected(frame):
        # Either the whole frame or everything after the first timeframe.
        if with_first_day:
            return frame
        return frame[frame["timeframe"]>0]

    mean_dcgs = [_selected(frame)["dcg"].mean() for frame in rankings]
    print(len(_selected(rankings[0])))
    summary = pd.DataFrame(list(zip(models, mean_dcgs)), columns=["model","dcg"])
    summary = summary.sort_values("dcg", ascending=False)
    return summary.reset_index(drop=True)

#### Global mean performance

In [None]:
show_mean_dcg(True)

#### Mean performance without first day

In [None]:
show_mean_dcg(False)

**Exclude known: False**
0 	online 	0.139660
1 	batch+online 	0.131745
2 	pop+time 	0.124183
3 	pop 	0.077110
4 	batch 	0.064587

**Exclude known: True - why is it exactly the same?**
0 	online 	0.139660
1 	batch+online 	0.131745
2 	pop+time 	0.124183
3 	pop 	0.077110
4 	batch 	0.064587

### b.) Performance over time

In [None]:
# One line per model: mean DCG of each timeframe bucket.
for model_name, ranking in zip(models, rankings):
    averages = ranking.groupby("timeframe")["dcg"].mean()
    plt.plot(averages, label=model_name)
plt.xlabel("timeframe")
plt.ylabel("mean DCG")
plt.legend();

### c.) Number of records over time

In [None]:
cnt = rankings[0].groupby("timeframe")["dcg"].count()
plt.plot(cnt)

# Simulation based results

In [None]:
link_sim_cols = ["opt_income","global_traffic","global_income","inbound_depletions","high_degree","high_cap"]

In [None]:
def load_link_sim_experiment(model_dir):
    """Load all snapshot CSVs of one simulation experiment and score them.

    Every file in ``model_dir`` whose name contains ``"snapshot"`` is read and
    the parts are concatenated. The result gets a ``timeframe`` column (via
    ``get_timeframe`` with the global ``delta_t``) and, for each column in the
    global ``link_sim_cols``, a ``dcg_<column>`` column computed by
    ``DcgScore`` with that column treated as the ``rank``.

    Parameters
    ----------
    model_dir : str
        Directory of the experiment; a trailing ``/`` is optional.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame with the added columns.

    Raises
    ------
    ValueError
        If the directory contains no snapshot file.

    Progress information (experiment id, file and row counts, mean DCG per
    column) is printed.
    """
    # basename of the normalised path works with or without a trailing "/"
    model_id = os.path.basename(os.path.normpath(model_dir))
    print(model_id)
    model_files = os.listdir(model_dir)
    # os.listdir returns entries in arbitrary order; sort for a reproducible row order
    snapshot_files = sorted(f for f in model_files if "snapshot" in f)
    if not snapshot_files:
        raise ValueError("no snapshot files found in %s" % model_dir)
    chunks = [pd.read_csv(os.path.join(model_dir, f)) for f in snapshot_files]
    concatenated = pd.concat(chunks)
    print(len(model_files), len(concatenated))
    get_timeframe(concatenated, delta_t)
    for rank_col in link_sim_cols:
        print(rank_col)
        concatenated["dcg_"+rank_col] = DcgScore(concatenated.rename({rank_col:"rank"}, axis=1))
        print(concatenated["dcg_"+rank_col].mean())
    return concatenated

### Load simulation results

In [None]:
experiments = ["50000sat_k6000_aNone_e0.05_dropTrue"]

In [None]:
simulation_results = [load_link_sim_experiment("%s/%s/" % (ranking_dir, experiment_id)) for experiment_id in experiments]

### Join all simulation experiments into one dataframe

In [None]:
sim_preds = simulation_results[0]

In [None]:
# Left-join every further experiment onto the first one by record_id.
for position, model in enumerate(experiments[1:], start=1):
    print(model)
    extra_cols = simulation_results[position][["record_id",model]]
    sim_preds = sim_preds.merge(extra_cols, on="record_id", how="left")

In [None]:
sim_preds.head()

### Joining baselines with simulation results

In [None]:
for idx, model in enumerate(models):
    sim_preds = sim_preds.merge(rankings[idx][["id","dcg"]].rename({"id":"record_id","dcg":model}, axis=1), on="record_id", how="left")

sim_preds[experiments+models].mean().sort_values(ascending=False)

In [None]:
sim_preds[models + ["dcg_" + sim_col for sim_col in link_sim_cols]].mean().sort_values(ascending=False)

# Rank combination

In [None]:
onmf_preds = pd.read_csv("%s/preds_onmf_dim10_lr0.140_nr100.csv" % ranking_dir)
onmf_preds = onmf_preds.drop("dcg", axis=1)
onmf_preds.shape

In [None]:
tpop_preds = pd.read_csv("%s/preds_time_pop.csv" % ranking_dir)
tpop_preds = tpop_preds.drop("dcg", axis=1)
tpop_preds.shape

In [None]:
sim_model_preds = {}

In [None]:
sim_model = experiments[0]
for m in link_sim_cols:
    # NOTE(review): 19 part files per column are assumed here - confirm this matches the number of snapshots.
    part_paths = ["%s/%s/preds_%s_%i.csv" % (ranking_dir,sim_model,m,i) for i in range(19)]
    sim_model_preds[m] = pd.concat([pd.read_csv(path) for path in part_paths], ignore_index=True)
    print(m)

### filter for common records

common_record_ids = set(onmf_preds["record_id"])
len(common_record_ids)

In [None]:
common_record_ids = set(sim_model_preds["opt_income"]["record_id"])
len(common_record_ids)

In [None]:
for m in link_sim_cols:
    tmp = sim_model_preds[m]
    sim_model_preds[m] = tmp[tmp["record_id"].isin(common_record_ids)]
    print(m, sim_model_preds[m].shape)
onmf_preds = onmf_preds[onmf_preds["record_id"].isin(common_record_ids)]
print("onmf", onmf_preds.shape)
tpop_preds = tpop_preds[tpop_preds["record_id"].isin(common_record_ids)]
print("tpop", tpop_preds.shape)

### extract real targets

In [None]:
# .loc does not accept a set as indexer in current pandas, so pass a list of labels.
real_targets = links_df.loc[list(common_record_ids), "item"].to_dict()

In [None]:
len(real_targets)

### recoding keys to ids

In [None]:
node_2_id = dict(zip(links_df["src"],links_df["user"]))
node_2_id.update(dict(zip(links_df["trg"],links_df["item"])))

In [None]:
# Recode node keys to the integer ids used by the baseline models.
# assign() builds a new frame instead of writing columns onto the filtered
# slices stored in sim_model_preds (which raises SettingWithCopyWarning).
# NOTE: not idempotent - the lookup expects un-recoded keys, so do not re-run this cell.
for m in link_sim_cols:
    preds = sim_model_preds[m]
    sim_model_preds[m] = preds.assign(
        user=preds["user"].apply(lambda x: node_2_id[x]),
        item=preds["item"].apply(lambda x: node_2_id[x]),
    )
    print(m)

In [None]:
sim_model_preds["onmf"] = onmf_preds
sim_model_preds["tpop"] = tpop_preds

### combination

In [None]:
import numpy as np
import functools
import concurrent.futures

def performance(pred_df,real_targets):
    """Score a prediction list against the true targets.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        One row per (record, candidate item) with the columns ``record_id``,
        ``item``, ``rank`` and ``rec_rank``. The frame is not modified.
    real_targets : dict
        Maps ``record_id`` to the true item. It must contain every
        ``record_id`` present in ``pred_df`` and should hold only the records
        under evaluation, because its size is the denominator of both means.

    Returns
    -------
    tuple
        ``(mrr, mdcg)``: the sum of ``rec_rank`` over the hit rows and the sum
        of ``1 / log2(rank + 1)`` over the hit rows, each divided by
        ``len(real_targets)``. Records without a hit contribute 0.

    Raises
    ------
    KeyError
        If a ``record_id`` of ``pred_df`` is missing from ``real_targets``.
    """
    true_items = pred_df["record_id"].apply(lambda rid: real_targets[rid])
    # Filter without copying the whole frame and without writing to the slice.
    hits = pred_df[true_items == pred_df["item"]]
    n_records = len(real_targets)
    hit_dcg = 1.0 / np.log2(hits["rank"] + 1.0)
    return hits["rec_rank"].sum() / n_records, hit_dcg.sum() / n_records

def extract_topk(df, key_col, score_col, k):
    """Keep the ``k`` highest-scoring rows of every ``key_col`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    key_col : str
        Column that defines the groups.
    score_col : str
        Column used to pick the largest rows within each group.
    k : int
        Maximum number of rows kept per group.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0..n-1 index: groups appear in the
        order ``groupby`` yields them, rows inside a group by descending
        score. An empty input yields an empty frame with the same columns.
    """
    topk_parts = [group.nlargest(k, score_col) for _, group in df.groupby(key_col)]
    if not topk_parts:
        # pd.concat raises on an empty list; return an empty frame instead
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(topk_parts, ignore_index=True)

def combine_ranks(preds_1, preds_2, k, alpha, trim=True):
    """Blend two prediction lists by a weighted sum of reciprocal ranks.

    ``rec_rank`` of ``preds_1`` is weighted by ``alpha`` and that of
    ``preds_2`` by ``1 - alpha``; the weighted values are summed per
    ``(record_id, item)`` pair. Pairs whose combined score is not positive are
    dropped, ``rank`` is set to ``1 / rec_rank`` and ``dcg`` is computed with
    ``DcgScore``.

    Parameters
    ----------
    preds_1, preds_2 : pandas.DataFrame
        Frames with the columns ``record_id``, ``item`` and ``rec_rank``.
        Neither input is modified.
    k : int
        Rows kept per ``record_id`` when ``trim`` is True.
    alpha : float
        Weight of ``preds_1``.
    trim : bool, optional
        When True, keep only the ``k`` best-scored items of every record (via
        ``extract_topk``) and print ``alpha``, the size of ``preds_1``, the
        trimmed size and the untrimmed-size / ``preds_1``-size ratio.

    Returns
    -------
    pandas.DataFrame
        Columns ``record_id``, ``item``, ``rec_rank``, ``rank`` and ``dcg``.
    """
    cols = ["record_id","item","rec_rank"]
    p1 = preds_1[cols].copy()
    p2 = preds_2[cols].copy()
    p1["rec_rank"] = p1["rec_rank"]*alpha
    p2["rec_rank"] = p2["rec_rank"]*(1.0-alpha)
    combined = pd.concat([p1,p2], ignore_index=True)
    combined = combined.groupby(["record_id","item"])["rec_rank"].sum().reset_index()
    # copy() so the new columns are written to an independent frame, not a slice
    combined = combined[combined["rec_rank"]>0].copy()
    combined["rank"] = 1.0 / combined["rec_rank"]
    combined["dcg"] = DcgScore(combined)
    if trim:
        old_size = len(combined)
        combined = extract_topk(combined, "record_id", "rec_rank", k)
        # guard the diagnostic ratio against an empty first model
        ratio = old_size/len(p1) if len(p1) else float("nan")
        print(alpha,len(p1),len(combined),ratio)
    return combined

def combine_and_eval(model1, model2, true_targets, k, a):
    """Blend two prediction frames with weight ``a`` and score the blend.

    The frames are combined by ``combine_ranks`` (with its default trimming to
    ``k`` items per record) and the result is evaluated by ``performance``
    against ``true_targets``.

    Returns
    -------
    tuple
        ``(mrr, mdcg)`` as returned by ``performance``.
    """
    blended = combine_ranks(model1, model2, k, a)
    mean_rr, mean_dcg = performance(blended, true_targets)
    return mean_rr, mean_dcg

def combine_models(model1, model2, true_targets, alphas, filter_m1=False, max_threads=4):
    """Evaluate the blend of two models for every weight in ``alphas``.

    Each alpha is handled by ``combine_and_eval`` in a worker process. The
    number of items kept per record is read from the notebook-level ``top_k``.

    Parameters
    ----------
    model1, model2 : pandas.DataFrame
        Prediction frames accepted by ``combine_ranks``.
    true_targets : dict
        ``record_id`` -> true item, forwarded to ``performance``.
    alphas : iterable of float
        Weights of ``model1`` to try.
    filter_m1 : bool, optional
        When True, only the rows of ``model1`` with ``prediction > 0`` are
        used, and the sizes before and after filtering are printed.
    max_threads : int, optional
        Number of worker *processes* (a ``ProcessPoolExecutor`` is used
        despite the parameter name).

    Returns
    -------
    None
        The tuple of mean DCG values, in the order of ``alphas``, is printed.
    """
    if filter_m1:
        m1 = model1[model1["prediction"]>0]
        print("filter",len(model1),len(m1))
    else:
        m1 = model1
    f_partial = functools.partial(combine_and_eval, m1, model2, true_targets, top_k)
    # The context manager shuts the pool down even when a worker raises.
    with concurrent.futures.ProcessPoolExecutor(max_threads) as executor:
        res = list(executor.map(f_partial, alphas))
    # Each result is (mrr, mdcg); only the DCG part is reported.
    mdcgs = tuple(mdcg for _, mdcg in res)
    print(mdcgs)

In [None]:
# Add the reciprocal-rank column read by performance() and combine_ranks().
# The column is written in place on purpose: sim_model_preds["onmf"] and
# sim_model_preds["tpop"] are the same objects as the onmf_preds / tpop_preds
# variables that are later passed to combine_models(), so those variables
# receive the "rec_rank" column through this loop as well.
for col in sim_model_preds:
    tmp = sim_model_preds[col]
    tmp["rec_rank"] = 1.0 / tmp["rank"]
    sim_model_preds[col] = tmp
    print(col)

#### double check: single model performance

In [None]:
for col in sim_model_preds:
    print(col, performance(sim_model_preds[col], real_targets))

In [None]:
#alphas = np.arange(0,1.1,0.1)# 0.075 is best: 0.239715
alphas = [0.0,0.001,0.002,0.005,0.01,0.02,0.03,0.1,1.0]
#alphas = [0.0,0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.5,1.0]
#alphas = [0.0,0.025,0.05,0.075,0.1,0.15,0.2,1.0]

#### NOTE: with topk=500 the models can finally be combined!

In [None]:
combine_models(tpop_preds, onmf_preds, real_targets, alphas, max_threads=len(alphas))

for col in link_sim_cols:
    print("###%s###" % col)
    combine_models(sim_model_preds[col], onmf_preds, real_targets, alphas, filter_m1=False, max_threads=len(alphas))
    print()

In [None]:
for col in link_sim_cols:
    print("###%s###" % col)
    combine_models(sim_model_preds[col], onmf_preds, real_targets, alphas, filter_m1=True, max_threads=len(alphas))
    print()

combine_models(only_sim_preds, onmf_preds, alphas)

# Betweenness baseline

from link_pred_simulator import load_graph_snapshots, process_links_for_simulator

snapshots, time_boundaries = load_graph_snapshots("/mnt/idms/fberes/data/bitcoin_ln_research/directed_graphs/directed_temporal_multi_edges_1days.csv")

links_for_sim = process_links_for_simulator(links_df, None, time_boundaries, only_eval=False, verbose=True)

links_for_sim["eval"].value_counts()

from tqdm import tqdm

def calculate_rank(scores_df, true_target, top_k, seen_nodes=None):
    """Return the 1-based position of ``true_target`` in a top-k list.

    Parameters
    ----------
    scores_df : pandas.DataFrame
        Candidates in an ``index`` column, already ordered best first.
    true_target
        The node to look for.
    top_k : int
        Length of the recommendation list.
    seen_nodes : collection, optional
        Nodes removed from the candidates before the list is cut to
        ``top_k``. ``None`` disables the filtering.

    Returns
    -------
    float or None
        Position of ``true_target`` counted from 1.0, or ``None`` when it is
        not among the first ``top_k`` remaining candidates.
    """
    candidates = scores_df["index"]
    # identity test: `== None` is evaluated element-wise for array-like inputs
    if seen_nodes is not None:
        candidates = candidates[~candidates.isin(seen_nodes)]
    ordered_list = list(candidates)[:top_k]
    try:
        return ordered_list.index(true_target) + 1.0
    except ValueError:
        return None

def extract_central_nodes(file_path, metric):
    """Read a centrality score file and order the nodes by ``metric``.

    Parameters
    ----------
    file_path : str
        CSV file with an ``index`` column and a column named ``metric``.
    metric : str
        Name of the score column to load and sort by.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` and ``metric``, highest score first.
    """
    # load the requested metric column instead of a hard-coded "betw"
    scores_df = pd.read_csv(file_path, usecols=["index", metric])
    return scores_df.sort_values(metric, ascending=False)[["index",metric]]
    
class BetweenessModel():
    """Baseline that recommends the most central nodes of each snapshot.

    Per-snapshot scores are read from
    ``<centrality_dir>/scores_<weight>_<snapshot>.csv`` and ordered by the
    ``betw`` column.
    """
    def __init__(self, centrality_dir="/mnt/idms/fberes/data/bitcoin_ln_research/centrality_scores", weight=None):
        """Store the score location; no file is read until ``run`` is called.

        Parameters
        ----------
        centrality_dir : str
            Directory holding the per-snapshot score files.
        weight
            Value formatted into the score file names.
        """
        self.metric = "betw"
        self.weight = weight
        self.centrality_dir = centrality_dir
        # snapshot id -> score frame ordered best first
        self.top_k_preds = {}

    def run(self, links, k, exclude_known=True):
        """Rank the true target of every evaluated link.

        Links are processed in row order. For every source node the set of
        targets seen so far is tracked, and with ``exclude_known`` those
        targets are removed from the candidates before ranking.

        Parameters
        ----------
        links : pandas.DataFrame
            Columns ``snapshot``, ``src``, ``trg`` and ``eval``.
        k : int
            Length of the recommendation list.
        exclude_known : bool, optional
            Whether targets already linked from the source are filtered out.

        Returns
        -------
        pandas.DataFrame
            An independent copy of the rows with ``eval == 1``, extended with
            a ``rank`` column holding the value returned by
            ``calculate_rank`` (``None`` when the target is not in the list).
        """
        self.graph = {}
        max_snap_id = links["snapshot"].max()
        for snap_id in range(max_snap_id+1):
            f_path = "%s/scores_%s_%i.csv" % (self.centrality_dir, self.weight, snap_id)
            self.top_k_preds[snap_id] = extract_central_nodes(f_path, self.metric)
        ranks = []
        # Iterate the needed columns directly: a label lookup per row is slow
        # and returns a frame instead of a row when index labels repeat.
        rows = zip(links["snapshot"], links["src"], links["trg"], links["eval"])
        for snap_id, src, trg, eval_ in tqdm(rows, total=len(links)):
            known_targets = self.graph.setdefault(src, set())
            if eval_:
                seen_nodes = known_targets if exclude_known else None
                ranks.append(calculate_rank(self.top_k_preds[snap_id], trg, k, seen_nodes))
            else:
                ranks.append(None)
            # the link becomes known only after it has been evaluated
            known_targets.add(trg)
        rankings = links.copy()
        rankings["rank"] = ranks
        # copy() so callers can add columns without SettingWithCopyWarning
        return rankings[rankings["eval"]==1].copy()

# How can it be the same DCG???

bm = BetweenessModel()

bm_rankings = bm.run(links_for_sim, 20, exclude_known=True)

bm_rankings["dcg"] = DcgScore(bm_rankings)

bm_rankings["dcg"].mean()

bm_rankings_f = bm.run(links_for_sim, 20, exclude_known=False)

bm_rankings_f["dcg"] = DcgScore(bm_rankings_f)  # score the exclude_known=False run, not bm_rankings again

bm_rankings_f["dcg"].mean()

# Join

In [None]:
d1 = pd.read_csv("/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/data/channel_events_20.csv")

In [None]:
d2 = pd.read_csv("../DynamicNetworkAnalysis/ln.tsv", sep="\t", header=None)

In [None]:
d1.head()

In [None]:
d1.dtypes

In [None]:
d2.tail()

In [None]:
d2.dtypes

In [None]:
int("562210x2014x0")

In [None]:
d2[0].apply(int)  # Python 3 has no long(); int() still raises ValueError on non-numeric strings such as "562210x2014x0"