In [40]:
import argparse
from collections import defaultdict
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from agq.metrics import get_rank_error, get_hit_ratio
from constants import DATA_STREAM_FOLDER, RANKING_FOLDER

In [41]:
# Notebook stand-in for command-line arguments. These values are merged over
# the scenario's params.json when the parameters are loaded.
args = argparse.Namespace(
    path_to_scenario='../o11ydata/scenarios/scenario_3',
    streams_root='',  # empty string: do not filter stream files by name
    aggregation_level='file',
    k=-1,
)

In [42]:
# Load the parameters stored alongside the scenario.
params_path = f'{args.path_to_scenario}/params.json'
with open(params_path, 'r') as f:
    params = json.load(f)

# Values already present on `args` take precedence over those read from
# params.json; the merged mapping then becomes the new `args` namespace.
params.update(vars(args))
args = argparse.Namespace(**params)

In [43]:
# Root folder of the data. It is concatenated directly with DATA_STREAM_FOLDER
# and RANKING_FOLDER when building paths, so the trailing slash is required.
args.data_folder = '../o11ydata/'

In [44]:
# Accumulates one row per top-K cutoff; each row holds formatted
# "mean (std)" rank-error strings.
dataframe = []

# Top-K cutoffs to evaluate, largest first (175 and 200 are currently excluded).
# Materialised as a list: a bare reversed(...) is a one-shot iterator that is
# empty after its first traversal, which silently breaks any re-run or second
# use of list_topK.
list_topK = list(reversed([25, 50, 75, 100, 125, 150]))

In [45]:
# The set of stream files does not depend on the cutoff, so list, filter and
# sort it once instead of once per top-K value.
streams_filenames = os.listdir(f'{args.data_folder}{DATA_STREAM_FOLDER}/')
if args.streams_root != '':
    # Keep only the streams whose filename contains the requested substring.
    streams_filenames = [e for e in streams_filenames if args.streams_root in e]
streams_filenames = sorted(streams_filenames)

# NOTE: if list_topK is a one-shot iterator (e.g. the result of reversed()),
# this loop consumes it and re-running the cell will iterate over nothing.
for topK in list_topK:
    # metric name -> list of (mean, std) tuples, one tuple per stream that has
    # at least one estimated ranking file.
    scores = defaultdict(list)
    for filename in tqdm(streams_filenames):
        stream_name = filename.split('.csv')[0]
        truth_ranking = pd.read_csv(f'{args.data_folder}{RANKING_FOLDER}/{filename}')

        # Estimated rankings are stored as <stream>_ranking_0.csv, _1.csv, ...;
        # walk the consecutive indices until the first missing file.
        filename_scores, counter = defaultdict(list), 0
        while True:
            ranking_path = f'{args.path_to_scenario}/{stream_name}_ranking_{counter}.csv'
            if not os.path.isfile(ranking_path):
                break
            estimated_ranking = pd.read_csv(ranking_path)

            # topK == -1 means "use the full estimated ranking".
            if topK != -1:
                estimated_ranking = estimated_ranking.iloc[:topK]

            filename_scores['rank_error'].append(
                get_rank_error(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
            )
            filename_scores['hit_ratio'].append(
                get_hit_ratio(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking)
            )
            counter += 1

        # Summarise the runs of this stream. A stream with no ranking files
        # leaves filename_scores empty and therefore adds no entry, so positions
        # in scores[...] do not map 1:1 onto streams_filenames.
        for k, v in filename_scores.items():
            v_array = np.array(v)
            scores[k].append((v_array.mean(), v_array.std()))

    # One row per cutoff: formatted "mean (std)" rank error for each scored stream.
    dataframe.append(["{:.2f} ({:.2f})".format(mean, std) for mean, std in scores['rank_error']])

100%|███████████████████████████████████████████| 26/26 [00:19<00:00,  1.35it/s]
100%|███████████████████████████████████████████| 26/26 [00:16<00:00,  1.61it/s]
100%|███████████████████████████████████████████| 26/26 [00:13<00:00,  1.87it/s]
100%|███████████████████████████████████████████| 26/26 [00:12<00:00,  2.14it/s]
100%|███████████████████████████████████████████| 26/26 [00:07<00:00,  3.47it/s]
100%|███████████████████████████████████████████| 26/26 [00:05<00:00,  4.99it/s]


In [48]:
# Sanity check: the loop appends one row per top-K cutoff, so this should
# equal the number of cutoffs evaluated.
len(dataframe)

6

In [54]:
# Top-K cutoffs, largest first, in the same order the rows of `dataframe` were
# appended (175 and 200 are currently excluded). Stored as a list rather than
# the one-shot iterator returned by reversed(...), so the cell that builds the
# labels can be re-run without first re-executing this one.
list_topK = list(reversed([25, 50, 75, 100, 125, 150]))

In [55]:
# Label each row of `dataframe` with its top-K cutoff, then transpose so that
# every cutoff becomes a column and every scored stream becomes a row.
score_labels = [f"rank error (top {top_k})" for top_k in list_topK]
df_scores = pd.DataFrame(dataframe, index=score_labels).T

In [56]:
# Display the table of formatted "mean (std)" rank errors, one column per
# top-K cutoff.
df_scores

Unnamed: 0,rank error (top 150),rank error (top 125),rank error (top 100),rank error (top 75),rank error (top 50),rank error (top 25)
0,4.74 (0.72),4.19 (0.63),3.49 (0.51),2.20 (0.48),1.53 (0.31),0.55 (0.04)
1,5.62 (0.06),3.76 (0.02),3.38 (0.01),2.70 (0.02),2.90 (0.03),1.75 (0.06)
2,2.94 (0.20),2.32 (0.14),1.90 (0.03),1.54 (0.02),0.84 (0.00),0.96 (0.00)
3,5.76 (0.28),4.86 (0.38),4.43 (0.33),4.00 (0.37),3.60 (0.27),2.92 (0.14)
4,1.00 (0.03),0.91 (0.00),0.90 (0.00),1.12 (0.00),1.68 (0.00),1.92 (0.00)
5,0.13 (0.00),0.03 (0.00),0.02 (0.00),0.03 (0.00),0.04 (0.00),0.08 (0.00)
6,1.81 (0.11),0.94 (0.07),0.43 (0.02),0.00 (0.00),0.00 (0.00),0.00 (0.00)
7,2.70 (0.07),2.66 (0.01),2.59 (0.01),2.65 (0.02),1.66 (0.00),1.00 (0.00)
8,6.17 (0.71),6.20 (0.57),4.98 (0.44),3.63 (0.39),3.04 (0.23),1.44 (0.00)
9,4.20 (0.19),3.47 (0.14),1.87 (0.16),1.41 (0.17),1.10 (0.17),0.65 (0.22)


In [57]:
# Copy the table to the system clipboard.
df_scores.to_clipboard()