In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Parameter helper from datawand, constructed with the notebook's command-line arguments.
# NOTE(review): `ph` is not read anywhere in the visible cells -- K below is hard-coded
# and its ph.get("top_first_days") alternative is commented out.
ph = ParamHelper('..', 'LinkPrediction', sys.argv)

In [None]:
# NOTE(review): absolute, machine-specific path. In the visible cells links_df is only
# previewed with .head() and never used afterwards.
links_df = pd.read_csv("/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/data/links_df_20.csv")

In [None]:
links_df.head()

# Parameters

In [None]:
# K is substituted into the ranking directory name and selects the timeframe width
# (K = None -> delta_t = 86400*7, anything else -> delta_t = 86400).
# Other settings left by the author: None, 30, ph.get("top_first_days").
K = 20

In [None]:
# Width of one timeframe bucket, in the units of the "time" column
# (presumably Unix seconds, since START_TIME is an epoch timestamp -- confirm).
# Without a K setting the buckets are 7 * 86400 wide, otherwise 86400 wide.
# None is a singleton, so it is tested by identity rather than with ==.
if K is None:
    delta_t = 86400 * 7
else:
    delta_t = 86400

In [None]:
# Baseline rankings to load: each entry is the base name of a "<name>.csv" file
# inside ranking_dir and later becomes the name of that model's score column.
# NOTE(review): presumably onmf = online MF, bomf = batch+online MF and
# offmf = offline (batch) MF, matching the labels in the result tables -- confirm.
models = [
    "onmf_dim10_lr0.140_nr100",
    "bomf_dim10_lr0.140_nr100",
    "offmf_dim10_lr0.050_nr100",
    "pop",
    "time_pop"
]

# Rankings

In [None]:
# Base directory of the precomputed rankings; later cells read per-model CSV files
# and per-experiment sub-directories from it.
# NOTE(review): only the trailing suffix is filled from K, the "topk20" part is
# hard-coded -- confirm this is intended when K changes. The path is also
# absolute and machine-specific.
ranking_dir = "/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/rankings/topk20_exkTrue_%s/" % str(K)

In [None]:
# One ranking table per entry of `models`, kept in the same order.
ranking_paths = ["%s/%s.csv" % (ranking_dir, model_name) for model_name in models]
rankings = [pd.read_csv(path) for path in ranking_paths]

In [None]:
# Row count of every loaded ranking table, in `models` order.
list(map(len, rankings))

In [None]:
# Earliest value of the "time" column in the first ranking table; compare it with
# the START_TIME constant defined in the next cell.
rankings[0]['time'].min()

In [None]:
# Unix timestamp for (GMT) Friday, February 1, 2019 1:00:00 AM.
# NOTE(review): this constant is not referenced in the visible cells; get_timeframe
# repeats the same literal as the default of its min_time parameter.
START_TIME = 1548982800

In [None]:
def get_timeframe(df, delta_t, min_time=1548982800):
    """Add a "timeframe" bucket column to ``df`` in place.

    The bucket is the number of whole ``delta_t`` intervals between
    ``min_time`` and the row's "time" value. Rows earlier than ``min_time``
    are put into bucket 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric "time" column. It is modified in place: a
        "timeframe" column is created or overwritten.
    delta_t : int
        Bucket width, in the same units as the "time" column.
    min_time : int, optional
        Start of bucket 0. Defaults to 1548982800.

    Returns
    -------
    None

    Notes
    -----
    The computation is vectorized instead of a per-row ``apply``. A missing
    "time" value stays missing in "timeframe" rather than being silently
    counted in bucket 0.
    """
    elapsed = df["time"] - min_time
    # Floor-divide into buckets, then clamp negatives (times before min_time) to 0.
    df["timeframe"] = (elapsed // delta_t).clip(lower=0)

# Tag every ranking table with its timeframe bucket; get_timeframe edits each frame in place.
for ranking_df in rankings:
    get_timeframe(ranking_df, delta_t)

In [None]:
# Per-column count of missing values in the fifth ranking table
# (index 4, i.e. the "time_pop" entry of `models`).
rankings[4].isnull().sum()

# Results

### a.) average performance (online DCG)

The average performance of the offline batch model can be confusing: it is only bad on the first day.

In [None]:
def show_mean_dcg(with_first_day=True):
    """Rank the models by their mean "dcg".

    Reads the notebook-level ``rankings`` (list of frames) and ``models``
    (matching list of names). When ``with_first_day`` is falsy, only rows with
    ``timeframe > 0`` are averaged. Prints the number of rows of the first
    ranking table that enter the average, and returns a frame with columns
    "model" and "dcg" sorted by "dcg" in descending order.
    """
    def _rows_in_scope(frame):
        # Either the whole table, or everything after timeframe 0.
        if with_first_day:
            return frame
        return frame[frame["timeframe"] > 0]

    mean_dcgs = [_rows_in_scope(frame)["dcg"].mean() for frame in rankings]
    print(len(_rows_in_scope(rankings[0])))

    summary = pd.DataFrame(list(zip(models, mean_dcgs)), columns=["model", "dcg"])
    summary = summary.sort_values("dcg", ascending=False)
    return summary.reset_index(drop=True)

#### Global mean performance

In [None]:
show_mean_dcg(True)

#### Mean performance without first day

In [None]:
show_mean_dcg(False)

**Exclude known: False**
0 	online 	0.139660
1 	batch+online 	0.131745
2 	pop+time 	0.124183
3 	pop 	0.077110
4 	batch 	0.064587

**Exclude known: True - why are the results exactly the same?**
0 	online 	0.139660
1 	batch+online 	0.131745
2 	pop+time 	0.124183
3 	pop 	0.077110
4 	batch 	0.064587

### b.) Performance over time

In [None]:
# Mean DCG per timeframe, one line per baseline model.
# Explicit axes and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
for model_name, ranking in zip(models, rankings):
    averages = ranking.groupby("timeframe")["dcg"].mean()
    ax.plot(averages, label=model_name)
ax.set_xlabel("timeframe")
ax.set_ylabel("mean DCG")
ax.legend();

### c.) Number of records over time

In [None]:
# Number of non-null "dcg" values per timeframe, taken from the first ranking table.
cnt = rankings[0].groupby("timeframe")["dcg"].count()
fig, ax = plt.subplots()
ax.plot(cnt)
ax.set_xlabel("timeframe")
ax.set_ylabel("number of records");

# Simulation based results

experiment_id = "200000sat_k10000_aNone_e0.05_dropTrue-onmf_dim10_lr0.140_nr100"

In [None]:
from alpenglow.evaluation import DcgScore

def load_link_sim_experiment(model_dir):
    """Load and score all result chunks of one simulation experiment.

    Every regular file in ``model_dir`` is read as a CSV and the chunks are
    concatenated. The combined frame gets a "timeframe" column (via
    ``get_timeframe`` with the notebook-level ``delta_t``) and a column named
    after the experiment directory that holds ``DcgScore`` of the frame.
    The mean of that column is printed, together with the mean ``DcgScore``
    obtained when the "base" column is used in place of "rank".

    Parameters
    ----------
    model_dir : str
        Directory of the experiment, with or without a trailing "/".

    Returns
    -------
    pandas.DataFrame
        The concatenated chunks with the "timeframe" and experiment score
        columns; the temporary "base_dcg" column is not included.
    """
    # normpath strips trailing and duplicate separators, so the experiment name
    # is the last path component whether or not model_dir ends with "/".
    model_id = os.path.basename(os.path.normpath(model_dir))
    print(model_id)
    # os.listdir returns entries in arbitrary order: sort them for a reproducible
    # row order, and skip sub-directories, which cannot be read as CSV.
    model_files = sorted(
        f for f in os.listdir(model_dir)
        if os.path.isfile(os.path.join(model_dir, f))
    )
    chunks = [pd.read_csv(os.path.join(model_dir, f)) for f in model_files]
    concatenated = pd.concat(chunks)
    print(len(model_files), len(concatenated))
    get_timeframe(concatenated, delta_t)
    concatenated[model_id] = DcgScore(concatenated)
    print(concatenated[model_id].mean())
    # Score the "base" column the same way by presenting it to DcgScore under
    # the "rank" name; only its mean is reported, the column itself is dropped.
    concatenated['base_dcg'] = DcgScore(concatenated.drop("rank",axis=1).rename({"base":"rank"}, axis=1))
    print(concatenated["base_dcg"].mean())
    return concatenated.drop("base_dcg", axis=1)

In [None]:
# Simulation experiments to load: each entry is a sub-directory of ranking_dir and
# is also used as the name of that experiment's score column in the merged table.
experiments = [
    "200000sat_k10000_aNone_e0.05_dropTrue-onmf_dim10_lr0.140_nr100",
    "200000sat_k10000_aNone_e0.05_dropFalse-onmf_dim10_lr0.140_nr100",
    "200000sat_k10000_a2.0_e0.05_dropTrue-onmf_dim10_lr0.140_nr100"
]

In [None]:
# One scored result table per entry of `experiments`, in the same order.
experiment_dirs = ["%s/%s/" % (ranking_dir, experiment_id) for experiment_id in experiments]
simulation_results = [load_link_sim_experiment(path) for path in experiment_dirs]

In [None]:
# Start the combined table from the first experiment's results; the following cells
# merge further score columns onto it by "record_id".
sim_preds = simulation_results[0]

In [None]:
# Add the score column of every remaining experiment, matched on "record_id".
for model, experiment_result in zip(experiments[1:], simulation_results[1:]):
    experiment_scores = experiment_result[["record_id", model]]
    sim_preds = sim_preds.merge(experiment_scores, on="record_id", how="left")

In [None]:
sim_preds.head()

### Joining baselines with simulation results

In [None]:
# Attach each baseline's per-record "dcg" as a column named after the model,
# matching the baseline's "id" against "record_id".
for model, ranking in zip(models, rankings):
    baseline_scores = ranking[["id", "dcg"]].rename(columns={"id": "record_id", "dcg": model})
    sim_preds = sim_preds.merge(baseline_scores, on="record_id", how="left")

In [None]:
# Mean score of every experiment and baseline column, best first.
score_columns = experiments + models
sim_preds[score_columns].mean().sort_values(ascending=False)

### K=20

- onmf_dim10_lr0.140_nr100     0.166119
- base_dcg                     0.166119
- bomf_dim10_lr0.140_nr100     0.155521
- dcg (drop disabled True)     0.140450
- time_pop                     0.131762
- pop                          0.076994
- offmf_dim10_lr0.050_nr100    0.076188

In [None]:
sim_preds.head()

In [None]:
# Mean score per timeframe for two baseline columns and the first simulation experiment.
# Explicit axes and axis labels so the figure can be read on its own.
compared_columns = [
    "onmf_dim10_lr0.140_nr100",
    "time_pop",
    "200000sat_k10000_aNone_e0.05_dropTrue-onmf_dim10_lr0.140_nr100",
]
fig, ax = plt.subplots()
for model in compared_columns:
    averages = sim_preds.groupby("timeframe")[model].mean()
    ax.plot(averages, label=model)
ax.set_xlabel("timeframe")
ax.set_ylabel("mean DCG")
ax.legend();

(sim_preds["dcg"] > sim_preds["base_dcg"]).value_counts()

sim_is_better = sim_preds[sim_preds["dcg"] > sim_preds["base_dcg"]]

mf_is_better = sim_preds[sim_preds["dcg"] < sim_preds["base_dcg"]]

sim_is_better["snapshot"].value_counts()

(sim_is_better["dcg"] - sim_is_better["base_dcg"]).mean()

(mf_is_better["dcg"] - mf_is_better["base_dcg"]).mean()

#### Sim is better top10

- LNBIG.com [lnd-06]
- LNBIG.com [lnd-32]
- LNBIG.com [lnd-05]
- LNBIG.com [lnd-28/old-lnd-22]
- LNBIG.com [lnd-26]
- LNBIG.com [lnd-27/old-lnd-19]
- LNBIG.com [lnd-17]
- LNBIG.com [lnd-07]
- LNBIG.com [lnd-21]
- LNBIG.com [lnd-33]

sim_is_better["src"].value_counts()[:10]

#### MF is better top10

- LNBIG.com [lnd-17]
- LNBIG.com [lnd-32]
- LNBIG.com [lnd-27/old-lnd-19]
- LNBIG.com [lnd-33]
- ... all of them are LNBIG

mf_is_better["src"].value_counts()[:10]

sim_preds.dtypes

sim_preds.isnull().sum()

sim_preds["diff"] = sim_preds["dcg"] - sim_preds["base_dcg"]

mean_diff = sim_preds.groupby("src")["diff"].mean().sort_values()

#### MF is much better on average

- MJOLNIR
- DMN737
- just a hash
- sqlserver.science
- Ziggy [LND]

They have just a few dollars

mean_diff.head()

mean_diff.tail()

LightningPowerUsers.com

mean_diff["0331f80652fb840239df8dc99205792bba2e559a05469915804c08420230e23c7c"]

ACINQ

mean_diff["03864ef025fde8fb587d989186ce6a4a186895ee44a926bfc370e2c366597a3f8f"]