In [1]:
import argparse
from collections import defaultdict
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from agq.metrics import get_rank_error, get_hit_ratio, get_ndcg, get_spearman
from constants import DATA_STREAM_FOLDER, RANKING_FOLDER

In [2]:
# Notebook-level settings, held in a Namespace so they can be read as attributes.
# `path_to_scenario` and `streams_root` are reassigned by the evaluation cells below.
args = argparse.Namespace(
    path_to_scenario='../synthetics/scenarios/scenario_16/',  #change me
    streams_root='card_10000_202303010000YY',
    aggregation_level='file',
    k=-1,
)

# Names of the metrics that the evaluation cells tabulate.
metrics = ['rank_error']

In [3]:
# Load the parameters stored with the scenario and fold them into `args`.
with open(f'{args.path_to_scenario}/params.json', 'r') as f:
    params = json.load(f)

# Values already present on `args` take precedence over the ones read from
# params.json, because they are written on top of the loaded dictionary.
params.update(vars(args))
args = argparse.Namespace(**params)

### Cardinality 1000 results

In [4]:
# Rank-error summary for the cardinality-1000 streams.
# The settings in this first section will need to be changed after data is generated.
stream_roots = ['card_1000_20230302165304'] # from your local filesystem; name depends on when run
scenario_paths = ['../synthetics/scenarios/scenario_{}'.format(i) for i in range(7)]  # indices depend on order in which they are run
# Row labels, applied to scenario_paths position by position.
scenario_labels = ['UP-F', 'UP-T', 'UN-F', 'UN-T', 'PURE', 'CDF', 'CM']
# Order in which the labelled results are printed.
report_order = ['PURE', 'CDF', 'UN-F', 'UP-F', 'UN-T', 'UP-T', 'CM']
topK = 50  # -1 disables the truncation of the estimated ranking
#########

# Fail before any file is read if the labels cannot be matched to the scenarios.
assert len(scenario_labels) == len(scenario_paths), 'one label is needed per scenario path'

# NOTE(review): this evaluation loop is repeated in the other result cells of
# this notebook; it is a candidate for a shared function.
res = {}
for root in stream_roots:
    args.streams_root = root

    # The list of stream files depends only on the root, so it is built once
    # here instead of once per scenario.
    streams_filenames = os.listdir(f'.{args.data_folder}{DATA_STREAM_FOLDER}/')
    if args.streams_root != '':
        streams_filenames = [e for e in streams_filenames if args.streams_root in e]
    streams_filenames = sorted(streams_filenames)

    for met in metrics:
        dataframe = defaultdict(list)
        for scenario in scenario_paths:
            args.path_to_scenario = scenario

            # metric name -> one value per stream file, in sorted file order
            scores = defaultdict(list)
            for filename in tqdm(streams_filenames):
                stream_name = filename.split('.csv')[0]
                truth_ranking = pd.read_csv(f'.{args.data_folder}{RANKING_FOLDER}/{filename}')
                estimated_ranking = pd.read_csv(
                    f'{args.path_to_scenario}/{stream_name}_ranking_0.csv')

                if topK != -1:
                    # NOTE(review): this slice keeps topK - 1 rows (49 for
                    # topK = 50), not topK. It is left as is because changing
                    # it changes the printed numbers; confirm whether
                    # `[:topK]` was intended.
                    estimated_ranking = estimated_ranking.iloc[:topK - 1]

                filename_scores = {
                    'rank_error': [
                        get_rank_error(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
                    ],
                    'spearman': [
                        get_spearman(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
                    ],
                }

                # Each file contributes exactly one value per metric, so only
                # the mean is kept (a std over one value carries no information).
                for metric_name, values in filename_scores.items():
                    scores[metric_name].append(np.array(values).mean())

            # One row per scenario: the per-file values of the selected metric.
            dataframe[met].append(list(scores[met]))
        res[(root, met)] = dataframe

# Mean and standard deviation across stream files, one entry per scenario label.
for s in stream_roots:
    for m in metrics:
        per_file = pd.DataFrame(res[(s, m)][m], index=scenario_labels).transpose()
        print(per_file.mean(axis=0)[report_order])
        print(per_file.std(axis=0)[report_order])


100%|██████████| 10/10 [00:00<00:00, 78.83it/s]
100%|██████████| 10/10 [00:00<00:00, 89.98it/s]
100%|██████████| 10/10 [00:00<00:00, 85.73it/s]
100%|██████████| 10/10 [00:00<00:00, 87.34it/s]
100%|██████████| 10/10 [00:00<00:00, 87.60it/s]
100%|██████████| 10/10 [00:00<00:00, 89.93it/s]
100%|██████████| 10/10 [00:00<00:00, 85.60it/s]

PURE    5.189796
CDF     3.763265
UN-F    2.412245
UP-F    2.059184
UN-T    2.157143
UP-T    2.193878
CM      5.400000
dtype: float64
PURE    0.846607
CDF     0.800597
UN-F    0.899545
UP-F    0.396483
UN-T    0.695147
UP-T    0.679949
CM      1.139012
dtype: float64





### Cardinality 10,000 results

In [5]:
# Second table (card 10000): rank-error summary for the cardinality-10000 streams.
# As before, the settings in this first section will need to be changed after data is generated.
stream_roots = ['card_10000_20230302165328']
scenario_paths = ['../synthetics/scenarios/scenario_{}'.format(i) for i in range(6)]  # indices depend on order in which they are run
# Row labels, applied to scenario_paths position by position.
scenario_labels = ['UP-F', 'UP-T', 'UN-F', 'UN-T', 'PURE', 'CDF']
# Order in which the labelled results are printed.
report_order = ['PURE', 'CDF', 'UN-F', 'UP-F', 'UN-T', 'UP-T']
topK = 50  # -1 disables the truncation of the estimated ranking
#########

# Fail before any file is read if the labels cannot be matched to the scenarios.
assert len(scenario_labels) == len(scenario_paths), 'one label is needed per scenario path'

# NOTE(review): this evaluation loop is repeated in the other result cells of
# this notebook; it is a candidate for a shared function.
res = {}
for root in stream_roots:
    args.streams_root = root

    # The list of stream files depends only on the root, so it is built once
    # here instead of once per scenario.
    streams_filenames = os.listdir(f'.{args.data_folder}{DATA_STREAM_FOLDER}/')
    if args.streams_root != '':
        streams_filenames = [e for e in streams_filenames if args.streams_root in e]
    streams_filenames = sorted(streams_filenames)

    for met in metrics:
        dataframe = defaultdict(list)
        for scenario in scenario_paths:
            args.path_to_scenario = scenario

            # metric name -> one value per stream file, in sorted file order
            scores = defaultdict(list)
            for filename in tqdm(streams_filenames):
                stream_name = filename.split('.csv')[0]
                truth_ranking = pd.read_csv(f'.{args.data_folder}{RANKING_FOLDER}/{filename}')
                estimated_ranking = pd.read_csv(
                    f'{args.path_to_scenario}/{stream_name}_ranking_0.csv')

                if topK != -1:
                    # NOTE(review): this slice keeps topK - 1 rows (49 for
                    # topK = 50), not topK. It is left as is because changing
                    # it changes the printed numbers; confirm whether
                    # `[:topK]` was intended.
                    estimated_ranking = estimated_ranking.iloc[:topK - 1]

                filename_scores = {
                    'rank_error': [
                        get_rank_error(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
                    ],
                    'spearman': [
                        get_spearman(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
                    ],
                }

                # Each file contributes exactly one value per metric, so only
                # the mean is kept (a std over one value carries no information).
                for metric_name, values in filename_scores.items():
                    scores[metric_name].append(np.array(values).mean())

            # One row per scenario: the per-file values of the selected metric.
            dataframe[met].append(list(scores[met]))
        res[(root, met)] = dataframe

# Mean and standard deviation across stream files, one entry per scenario label.
# Chain .to_latex() onto the printed Series to emit a LaTeX table instead.
for s in stream_roots:
    for m in metrics:
        per_file = pd.DataFrame(res[(s, m)][m], index=scenario_labels).transpose()
        print(per_file.mean(axis=0)[report_order])
        print(per_file.std(axis=0)[report_order])


100%|██████████| 10/10 [00:00<00:00, 62.05it/s]
100%|██████████| 10/10 [00:00<00:00, 54.98it/s]
100%|██████████| 10/10 [00:00<00:00, 58.58it/s]
100%|██████████| 10/10 [00:00<00:00, 55.98it/s]
100%|██████████| 10/10 [00:00<00:00, 57.32it/s]
100%|██████████| 10/10 [00:00<00:00, 57.96it/s]

PURE    5.483673
CDF     5.644898
UN-F    4.279592
UP-F    3.951020
UN-T    3.875510
UP-T    4.108163
dtype: float64
PURE    1.017777
CDF     1.495584
UN-F    1.498794
UP-F    1.606946
UN-T    1.100421
UP-T    1.217594
dtype: float64





### Merge results

In [6]:
# MERGE RESULTS - scale:100: rank-error summary for the merge scenario.
# As before, the settings in this first section will need to be changed after data is generated.
stream_roots = ['card_1000_20230627145025']
scenario_paths = ['../synthetics/scenarios/scenario_18']
# Row labels, applied to scenario_paths position by position.
scenario_labels = ['MERGE']
# Order in which the labelled results are printed.
report_order = ['MERGE']
topK = 50  # -1 disables the truncation of the estimated ranking
#########

# Fail before any file is read if the labels cannot be matched to the scenarios.
assert len(scenario_labels) == len(scenario_paths), 'one label is needed per scenario path'

# NOTE(review): this evaluation loop is repeated in the other result cells of
# this notebook; it is a candidate for a shared function.
res = {}
for root in stream_roots:
    args.streams_root = root

    # The list of stream files depends only on the root, so it is built once
    # here instead of once per scenario.
    streams_filenames = os.listdir(f'.{args.data_folder}{DATA_STREAM_FOLDER}/')
    if args.streams_root != '':
        streams_filenames = [e for e in streams_filenames if args.streams_root in e]
    streams_filenames = sorted(streams_filenames)

    for met in metrics:
        dataframe = defaultdict(list)
        for scenario in scenario_paths:
            args.path_to_scenario = scenario

            # metric name -> one value per stream file, in sorted file order
            scores = defaultdict(list)
            for filename in tqdm(streams_filenames):
                stream_name = filename.split('.csv')[0]
                truth_ranking = pd.read_csv(f'.{args.data_folder}{RANKING_FOLDER}/{filename}')
                estimated_ranking = pd.read_csv(
                    f'{args.path_to_scenario}/{stream_name}_ranking_0.csv')

                if topK != -1:
                    # NOTE(review): this slice keeps topK - 1 rows (49 for
                    # topK = 50), not topK. It is left as is because changing
                    # it changes the printed numbers; confirm whether
                    # `[:topK]` was intended.
                    estimated_ranking = estimated_ranking.iloc[:topK - 1]

                filename_scores = {
                    'rank_error': [
                        get_rank_error(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
                    ],
                    'spearman': [
                        get_spearman(truth_ranking=truth_ranking, estimated_ranking=estimated_ranking, weighted=False)
                    ],
                }

                # Each file contributes exactly one value per metric, so only
                # the mean is kept (a std over one value carries no information).
                for metric_name, values in filename_scores.items():
                    scores[metric_name].append(np.array(values).mean())

            # One row per scenario: the per-file values of the selected metric.
            dataframe[met].append(list(scores[met]))
        res[(root, met)] = dataframe

# Mean and standard deviation across stream files, one entry per scenario label.
for s in stream_roots:
    for m in metrics:
        per_file = pd.DataFrame(res[(s, m)][m], index=scenario_labels).transpose()
        print(per_file.mean(axis=0)[report_order])
        print(per_file.std(axis=0)[report_order])


100%|██████████| 10/10 [00:00<00:00, 74.85it/s]

MERGE    1.655102
dtype: float64
MERGE    0.222408
dtype: float64



