In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
import sys
from datawand.parametrization import ParamHelper
# ParamHelper is built from '..', the name 'LinkPrediction' and the process arguments
# (presumably base directory, pipeline name and CLI args — TODO confirm against datawand).
# NOTE(review): `ph` is only referenced in a commented-out alternative for K further down.
ph = ParamHelper('..', 'LinkPrediction', sys.argv)

In [None]:
# Load the link table from an absolute, machine-specific path.
# NOTE(review): the "_20" suffix of the file name presumably corresponds to K = 20 — confirm.
links_df = pd.read_csv("/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/data/links_df_20.csv")

In [None]:
# Preview the first rows of the loaded link table.
links_df.head()

# Parameters

In [None]:
# K is interpolated into the ranking directory name and, when it is None, switches the
# timeframe length from one day to one week.
# Previously tried values: None, 30, ph.get("top_first_days").
K = 20

In [None]:
# Length of one timeframe (evaluation bucket) in seconds:
# one week when K is unset, one day otherwise.
SECONDS_PER_DAY = 86400
if K is None:  # identity test is the correct way to compare with None
    delta_t = 7 * SECONDS_PER_DAY
else:
    delta_t = SECONDS_PER_DAY

In [None]:
# Names of the evaluated models, in the order used for all per-model lists below.
# Each name is also the stem of that model's ranking CSV file.
# NOTE(review): the three "*mf" entries are presumably the online / batch+online / batch
# factorization variants mentioned in the result notes — confirm.
models = (
    ["onmf_dim10_lr0.140_nr100", "bomf_dim10_lr0.140_nr100", "offmf_dim10_lr0.050_nr100"]
    + ["pop", "time_pop"]
)

# Rankings

In [None]:
# Directory with one ranking CSV per model; the trailing path component ends with the value of K.
ranking_dir = "/mnt/idms/fberes/data/bitcoin_ln_research/link_prediction/rankings/topk20_exkTrue_%s/" % (K,)

In [None]:
# Read every model's ranking CSV; `rankings` is ordered exactly like `models`.
ranking_paths = ["%s/%s.csv" % (ranking_dir, m) for m in models]
rankings = [pd.read_csv(path) for path in ranking_paths]

In [None]:
# Row count of each model's ranking table (same order as `models`).
list(map(len, rankings))

In [None]:
# Smallest "time" value in the first model's rankings.
rankings[0]['time'].min()

In [None]:
# Reference timestamp in Unix seconds: (GMT) Friday, February 1, 2019 1:00:00 AM.
# NOTE(review): get_timeframe() repeats this number as the literal default of `min_time`
# instead of referencing START_TIME — keep the two in sync.
START_TIME = 1548982800

In [None]:
def get_timeframe(df, delta_t, min_time=1548982800):
    """Add a "timeframe" column to ``df`` in place.

    A row's timeframe is the number of whole ``delta_t``-long periods between
    ``min_time`` and the row's "time" value. Rows earlier than ``min_time`` are
    assigned to timeframe 0. A missing "time" yields a missing timeframe instead
    of being silently counted as timeframe 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric "time" column; its "timeframe" column is created
        or overwritten.
    delta_t : int or float
        Length of one timeframe, in the same unit as "time". Must be positive.
    min_time : int or float, optional
        Time origin. The default is the same number as the notebook's START_TIME.

    Returns
    -------
    None
        The function works purely through its side effect on ``df``.

    Raises
    ------
    ValueError
        If ``delta_t`` is not positive.
    """
    if delta_t <= 0:
        # Vectorized floor division would turn a zero divisor into inf/NaN quietly.
        raise ValueError("delta_t must be positive, got %r" % (delta_t,))
    # Whole-column arithmetic replaces the per-row Python lambda.
    elapsed_periods = (df["time"] - min_time) // delta_t
    df["timeframe"] = elapsed_periods.clip(lower=0)

# Tag every model's ranking table with its timeframe index (each frame is modified in place).
for ranking_df in rankings:
    get_timeframe(ranking_df, delta_t)

In [None]:
# Per-column count of missing values in the rankings at index 4 ("time_pop" in `models`).
rankings[4].isnull().sum()

# Results

### a.) average performance (online DCG)

The global average for the offline batch model is misleading: the model performs badly only on the first day.

In [None]:
def show_mean_dcg(with_first_day=True):
    """Rank the models by their mean DCG.

    Reads the notebook-level ``rankings`` and ``models`` lists. With
    ``with_first_day`` false, only rows whose "timeframe" is greater than 0
    enter the mean. Prints how many rows of the first model's table are used
    and returns a DataFrame with columns "model" and "dcg", sorted by "dcg"
    in descending order with a fresh 0..n-1 index.
    """
    if with_first_day:
        frames = rankings
    else:
        frames = [ranking[ranking["timeframe"] > 0] for ranking in rankings]
    print(len(frames[0]))
    mean_dcgs = [frame["dcg"].mean() for frame in frames]
    summary = pd.DataFrame(list(zip(models, mean_dcgs)), columns=["model", "dcg"])
    return summary.sort_values("dcg", ascending=False).reset_index(drop=True)

#### Global mean performance

In [None]:
# Mean DCG per model over all timeframes.
show_mean_dcg(with_first_day=True)

#### Mean performance without first day

In [None]:
# Mean DCG per model over rows with timeframe > 0 only.
show_mean_dcg(with_first_day=False)

**Exclude known: False**
0 	online 	0.139660
1 	batch+online 	0.131745
2 	pop+time 	0.124183
3 	pop 	0.077110
4 	batch 	0.064587

**Exclude known: True — why are the results exactly the same?**
0 	online 	0.139660
1 	batch+online 	0.131745
2 	pop+time 	0.124183
3 	pop 	0.077110
4 	batch 	0.064587

### b.) Performance over time

In [None]:
# One line per model: mean DCG within each timeframe.
for model_name, model_ranking in zip(models, rankings):
    dcg_by_timeframe = model_ranking.groupby("timeframe")["dcg"].mean()
    plt.plot(dcg_by_timeframe, label=model_name)
plt.legend()

### c.) Number of records over time

In [None]:
# Number of non-null "dcg" records per timeframe, taken from the first model's rankings.
timeframe_groups = rankings[0].groupby("timeframe")
cnt = timeframe_groups["dcg"].count()
plt.plot(cnt)

# Simulation based results

In [None]:
# Identifier of the simulation experiment to analyse; it names a sub-directory of `ranking_dir`.
experiment_id = "200000sat_k10000_aNone_e0.05_dropTrue-onmf_dim10_lr0.140_nr100"
# `ranking_dir` already ends with "/", so the result contains a harmless double slash.
model_dir = "%s/%s/" % (ranking_dir, experiment_id)

In [None]:
# os.listdir() returns entries in arbitrary order; sorting makes the load order (and hence
# the row order of the concatenated predictions) reproducible across runs and machines.
model_files = sorted(os.listdir(model_dir))

In [None]:
# Number of entries found in the experiment directory.
len(model_files)

In [None]:
# Read every file of the experiment directory into its own DataFrame.
chunk_paths = ["%s/%s" % (model_dir, f) for f in model_files]
chunks = [pd.read_csv(path) for path in chunk_paths]

In [None]:
# Preview the first rows of the first loaded chunk.
chunks[0].head()

In [None]:
# Stack all chunks into one table. ignore_index=True gives the result a fresh 0..n-1 index;
# without it every chunk keeps its own row labels, so the combined index has duplicates,
# which makes label-based assignment and alignment on this frame fragile.
sim_preds = pd.concat(chunks, ignore_index=True)

In [None]:
# Add the "timeframe" column to the simulation predictions (in place), using the same
# bucket length as for the baseline rankings.
get_timeframe(sim_preds, delta_t)

In [None]:
# Fraction of missing values in each column of the simulation predictions.
null_counts = sim_preds.isna().sum()
null_counts / len(sim_preds)

In [None]:
# Distinct record identifiers present in the simulation predictions.
sim_preds_ids = set(sim_preds["record_id"])

In [None]:
# Distinct "id" values of the first model's rankings; compared with "record_id" of the
# simulation predictions (the later merge renames "id" to "record_id").
link_preds_ids = set(rankings[0]["id"])

In [None]:
# How many simulation record ids have no counterpart in the first model's rankings.
len(sim_preds_ids - link_preds_ids)

#### Mean DCG

In [None]:
from alpenglow.evaluation import DcgScore

In [None]:
# Store the result of alpenglow's DcgScore for the simulation predictions in a "dcg" column.
# NOTE(review): DcgScore presumably derives a per-row DCG from the "rank" column — TODO confirm.
sim_preds['dcg'] = DcgScore(sim_preds)

In [None]:
# Mean of the simulation's "dcg" column over all records.
sim_preds['dcg'].mean()

In [None]:
# Score the "base" column with the same DcgScore call by presenting it under the name "rank"
# (the original "rank" column is dropped first so the rename does not create a duplicate).
base_as_rank = sim_preds.drop(columns="rank").rename(columns={"base": "rank"})
sim_preds['base_dcg'] = DcgScore(base_as_rank)

In [None]:
# Mean of the "base_dcg" column over all records.
sim_preds['base_dcg'].mean()

### Joining baselines with simulation results

In [None]:
# Attach each baseline model's DCG to the simulation records as a column named after the model.
# Left join: simulation records without a baseline row get NaN in that column.
# Note: re-running this cell merges the columns a second time.
for model, model_ranking in zip(models, rankings):
    baseline_dcg = model_ranking[["id", "dcg"]].rename(columns={"id": "record_id", "dcg": model})
    sim_preds = sim_preds.merge(baseline_dcg, on="record_id", how="left")

In [None]:
# Mean of the simulation DCG, the base DCG and every baseline model's DCG, best first.
score_columns = ["dcg", 'base_dcg'] + models
mean_scores = sim_preds[score_columns].mean()
mean_scores.sort_values(ascending=False)

### Experiment nodes for [1,2] snapshots

#### with dict ordering
- onmf_dim10_lr0.140_nr100     0.086728
- bomf_dim10_lr0.140_nr100     0.086140
- time_pop                     0.086084
- pop                          0.075682
- dcg                          0.071642
- offmf_dim10_lr0.050_nr100    0.061430

#### with dataframe ordering

- onmf_dim10_lr0.140_nr100     0.086728
- bomf_dim10_lr0.140_nr100     0.086140
- time_pop                     0.086084
- pop                          0.075682
- dcg                          0.069946
- offmf_dim10_lr0.050_nr100    0.061430
- base_dcg                     0.059996

#### with dataframe ordering + (no existing node exclusion)

- base_dcg                     0.087428
- onmf_dim10_lr0.140_nr100     0.086728
- bomf_dim10_lr0.140_nr100     0.086140
- time_pop                     0.086084
- dcg                          0.077530
- pop                          0.075682
- offmf_dim10_lr0.050_nr100    0.061430

In [None]:
# Preview the simulation predictions after the baseline columns were joined.
sim_preds.head()

In [None]:
# Mean DCG per timeframe: one line per baseline model plus one for the simulation.
# The groupby is built once and reused for every column.
by_timeframe = sim_preds.groupby("timeframe")
for model in models:
    plt.plot(by_timeframe[model].mean(), label=model)
# Legend label spelling fixed ('simultion' -> 'simulation').
plt.plot(by_timeframe["dcg"].mean(), label='simulation')
plt.legend()

In [None]:
# Mean DCG per timeframe: simulation ("dcg") against the "base_dcg" column.
# The groupby is built once and reused for both columns.
by_timeframe = sim_preds.groupby("timeframe")
# Legend label spelling fixed ('simultion' -> 'simulation').
plt.plot(by_timeframe["dcg"].mean(), label='simulation')
plt.plot(by_timeframe["base_dcg"].mean(), label='onmf_preds')
plt.legend()