In [None]:
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
import numpy as np
import re
from datetime import timedelta

In [None]:
def to_seconds(s):
    """Convert a leading ``hh:mm:ss`` timestamp in ``s`` to whole seconds.

    Only the start of ``s`` is inspected; any text after the seconds field
    is ignored. The hours field may have two or more digits, so durations
    of 24 hours and longer are converted correctly.

    Args:
        s: String that begins with an ``hh:mm:ss`` duration.

    Returns:
        The total duration in seconds, as an ``int``.

    Raises:
        ValueError: If ``s`` does not start with an ``hh:mm:ss`` duration.
    """
    match = re.match(r'(\d{2,}):(\d{2}):(\d{2})', s)
    if match is None:
        raise ValueError(f'expected a duration starting with hh:mm:ss, got {s!r}')

    hh, mm, ss = (int(part) for part in match.groups())
    # timedelta.seconds only holds the sub-day remainder (0..86399) and
    # would silently drop whole days; total_seconds() keeps them.
    return int(timedelta(hours=hh, minutes=mm, seconds=ss).total_seconds())

def to_gb(mb):
    """Scale a megabyte figure down to gigabytes (factor 1000), rounded to 2 decimals.

    Args:
        mb: Size in megabytes; anything ``float()`` accepts (number or numeric string).

    Returns:
        The size in gigabytes as a ``float`` rounded to two decimal places.
    """
    gigabytes = float(mb) / 1000
    return round(gigabytes, 2)

def to_rate(i):
    """Parse ``i`` as a float and round it to two decimal places.

    Args:
        i: A number or numeric string (surrounding whitespace is accepted by ``float``).

    Returns:
        The value as a ``float`` rounded to two decimals.
    """
    rate = float(i)
    return round(rate, 2)

In [None]:
def read_results_file(file):
    """Parse one ``key | value`` results file into a dict with typed metrics.

    Every non-blank line is split at its first ``|`` into a key and a value
    (both stripped of surrounding whitespace). Four entries are then
    converted in place; all other entries stay as strings:

    * ``index_size``: text before the first space, converted with ``to_gb``.
    * ``build_time``: text before the first space, converted with ``to_seconds``.
    * ``dquery_throughput`` / ``rquery_throughput``: the comma-separated
      contents of the leading ``[...]`` list, each converted with ``to_rate``.

    Args:
        file: Path of the results file to read.

    Returns:
        dict mapping each key found in the file to its (converted) value.

    Raises:
        ValueError: If a non-blank line contains no ``|`` separator.
        KeyError: If one of the four converted keys is missing from the file.
    """
    data = {}
    with open(file) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first '|' only, so a value may itself contain '|'.
            key, value = line.split('|', 1)
            data[key.strip()] = value.strip()

    # Both values start with the number/timestamp; drop whatever follows the first space.
    data['index_size'] = to_gb(data['index_size'].split(' ', 1)[0])
    data['build_time'] = to_seconds(data['build_time'].split(' ', 1)[0])

    for key in ('dquery_throughput', 'rquery_throughput'):
        # Keep only what sits inside the leading "[...]"; text after ']' is ignored.
        items = data[key].split(']', 1)[0].strip('[')
        # Splitting on ',' (float() ignores surrounding spaces) and skipping
        # empty pieces also handles "[]" and lists written without spaces.
        data[key] = [to_rate(x) for x in items.split(',') if x.strip()]
    return data

In [None]:
# Root directory of the result files; parse_results() globs `run*/...` beneath it.
# The path is relative, so it is resolved against the current working directory.
RESULTS_FOLDER = Path('../results')

In [None]:
def _points_axis(title, ylabel):
    """Label spec for a metric plotted against the number of indexed points."""
    return {
        'title': title,
        'ylabel': ylabel,
        'xlabel': '#Points',
        'x': [250_000, 2_500_000, 25_000_000, 250_000_000],
        'xticks': ['0.25m', '2.5m', '25m', '250m'],
    }


def _selectivity_axis(title, ylabel):
    """Label spec for a metric plotted against query selectivity."""
    return {
        'title': title,
        'ylabel': ylabel,
        'xlabel': 'Selectivity',
        'x': [0.0001, 0.001, 0.01, 0.1, 1.0],
        'xticks': ['0.0001', '0.001', '0.01', '0.1', '1.0'],
    }


# Per-metric plot metadata: title, axis labels, x positions and their tick labels.
# The "_a" throughput variants are plotted against #Points, the "_b" variants
# against selectivity. Each factory call returns fresh lists, so no entry
# shares mutable state with another.
LABELS = {
    'index_size': _points_axis('Index Size', 'Memory (GB)'),
    'build_time': _points_axis('Build Time', 'Build time (s)'),
    'dquery_throughput_a': _points_axis('Distance query throughput', 'Throughput (queries/s)'),
    'dquery_throughput_b': _selectivity_axis('Distance query throughput', 'Throughput (queries/s)'),
    'rquery_throughput_a': _points_axis('Range query throughput', 'Throughput (queries/s)'),
    'rquery_throughput_b': _selectivity_axis('Range query throughput', 'Throughput (queries/s)'),
}

In [None]:
# Hard-coded copy of matplotlib's default subplot colors, chosen so that
# plots colored explicitly from this list match the other plots.
COLORS = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
    '#8c564b',
    '#e377c2',
    '#7f7f7f',
    '#bcbd22',
    '#17becf',
]

# 1. Taxi experiments

In [None]:
def _parse_results_file(file):
    """Parse one ``key | value`` results file into a dict with typed metrics.

    Performs the same parsing as ``read_results_file`` defined earlier in
    this notebook. Every non-blank line is split at its first ``|`` into a
    key and a value (both stripped). Four entries are then converted in
    place; all other entries stay as strings:

    * ``index_size``: text before the first space, converted with ``to_gb``.
    * ``build_time``: text before the first space, converted with ``to_seconds``.
    * ``dquery_throughput`` / ``rquery_throughput``: the comma-separated
      contents of the leading ``[...]`` list, each converted with ``to_rate``.

    Args:
        file: Path of the results file to read.

    Returns:
        dict mapping each key found in the file to its (converted) value.

    Raises:
        ValueError: If a non-blank line contains no ``|`` separator.
        KeyError: If one of the four converted keys is missing from the file.
    """
    data = {}
    with open(file) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first '|' only, so a value may itself contain '|'.
            key, value = line.split('|', 1)
            data[key.strip()] = value.strip()

    # Both values start with the number/timestamp; drop whatever follows the first space.
    data['index_size'] = to_gb(data['index_size'].split(' ', 1)[0])
    data['build_time'] = to_seconds(data['build_time'].split(' ', 1)[0])

    for key in ('dquery_throughput', 'rquery_throughput'):
        # Keep only what sits inside the leading "[...]"; text after ']' is ignored.
        items = data[key].split(']', 1)[0].strip('[')
        # Splitting on ',' (float() ignores surrounding spaces) and skipping
        # empty pieces also handles "[]" and lists written without spaces.
        data[key] = [to_rate(x) for x in items.split(',') if x.strip()]
    return data

def parse_results(idx, name, n_points):
    """Average the metrics of one index/experiment/size combination over all runs.

    Collects every file matching ``run*/*__{idx}_{name}-{n_points}.txt``
    under ``RESULTS_FOLDER``, parses each with ``_parse_results_file`` and
    averages the parsed values across files.

    Args:
        idx: Index name as it appears in the file name (e.g. ``'geos_strtree'``).
        name: Experiment name as it appears in the file name (e.g. ``'nyc-taxi'``).
        n_points: Size tag as it appears in the file name (e.g. ``'25m'``).

    Returns:
        dict with ``index_size`` and ``build_time`` (scalar means) and
        ``dquery_throughput`` / ``rquery_throughput`` (element-wise means
        over the per-file lists, computed with ``axis=0``).

    Raises:
        FileNotFoundError: If no results file matches the pattern.
    """
    pattern = f'run*/*__{idx}_{name}-{n_points}.txt'
    # Sort so files are processed in a deterministic order; glob order is not guaranteed.
    files = sorted(RESULTS_FOLDER.glob(pattern))
    if not files:
        # Without this guard np.mean([]) yields nan (plus a RuntimeWarning) and
        # the failure only surfaces later as an unrelated error in the caller.
        raise FileNotFoundError(f'no result files matching {pattern!r} under {RESULTS_FOLDER}')

    dicts = [_parse_results_file(f) for f in files]

    return {
        'index_size': np.mean([d['index_size'] for d in dicts]),
        'build_time': np.mean([d['build_time'] for d in dicts]),
        'dquery_throughput': np.mean([d['dquery_throughput'] for d in dicts], axis=0),
        'rquery_throughput': np.mean([d['rquery_throughput'] for d in dicts], axis=0),
    }

In [None]:
def get_experiment_results(name, idxs=['geos_strtree', 'geos_quadtree', 's2_pointindex'], idx_sizes=['0_25m', '2_5m', '25m', '250m'], sweep_size='250m'):
    """Collect the averaged metrics of one experiment, grouped by metric and index.

    Calls ``parse_results(idx, name, size)`` for every index/size pair and
    arranges the values as ``results[metric][idx] -> list``:

    * ``index_size`` / ``build_time``: one value per entry of ``idx_sizes``.
    * ``dquery_throughput_a`` / ``rquery_throughput_a``: throughput series
      across sizes. For ``sweep_size`` only the second-to-last throughput
      value is appended; for every other size all of its throughput values
      are appended.
    * ``dquery_throughput_b`` / ``rquery_throughput_b``: the full
      throughput list measured at ``sweep_size``.

    Args:
        name: Experiment name used in the result file names.
        idxs: Index names to load. The default list is only iterated, never mutated.
        idx_sizes: Size tags to load, in plotting order. Only iterated, never mutated.
        sweep_size: Size tag whose throughput list feeds the ``_b`` series.
            Defaults to ``'250m'``, the value that was previously hard-coded.

    Returns:
        Nested ``defaultdict``: metric name -> index name -> list of values.
    """
    results = defaultdict(lambda: defaultdict(list))

    for idx in idxs:
        for size in idx_sizes:
            data = parse_results(idx, name, size)
            results['index_size'][idx].append(data['index_size'])
            results['build_time'][idx].append(data['build_time'])

            if size != sweep_size:
                # NOTE(review): assumes non-sweep sizes report a single throughput
                # value each, so the "_a" series stays aligned with idx_sizes - confirm.
                results['dquery_throughput_a'][idx].extend(data['dquery_throughput'])
                results['rquery_throughput_a'][idx].extend(data['rquery_throughput'])
            else:
                # NOTE(review): [-2] presumably picks the sweep entry comparable to the
                # measurements taken at the other sizes - confirm against the benchmark setup.
                results['dquery_throughput_a'][idx].append(data['dquery_throughput'][-2])
                results['rquery_throughput_a'][idx].append(data['rquery_throughput'][-2])
                results['dquery_throughput_b'][idx].extend(data['dquery_throughput'])
                results['rquery_throughput_b'][idx].extend(data['rquery_throughput'])

    return results

## 1.1 nyc-taxi baseline

In [None]:
def plot_taxi(ax, results, metric):
    """Plot one metric for the three spatial indexes on log-log axes.

    Args:
        ax: Matplotlib axes to draw on.
        results: Mapping ``results[metric][index_name] -> sequence of y values``
            with entries for ``'geos_strtree'``, ``'geos_quadtree'`` and
            ``'s2_pointindex'``; each sequence must have as many values as
            ``LABELS[metric]['x']``.
        metric: Key into both ``results`` and ``LABELS``.
    """
    spec = LABELS[metric]
    X = spec['x']

    ax.plot(X, results[metric]['geos_strtree'], marker='s', label='strtree')
    ax.plot(X, results[metric]['geos_quadtree'], marker='^', label='quadtree')
    ax.plot(X, results[metric]['s2_pointindex'], marker='v', label='s2pointindex')

    ax.set_title(spec['title'])
    # Switch to log scale BEFORE installing the custom ticks: changing the
    # scale resets the axis' tick locator and formatter to the scale's
    # defaults, which would discard ticks/labels set earlier.
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xticks(X, spec['xticks'])
    ax.set_xlabel(spec['xlabel'])
    ax.set_ylabel(spec['ylabel'])
    # No per-axes legend: the calling cell builds a single figure-level
    # legend from the line labels set above.

In [None]:
# Baseline figure: one panel per metric (3x2 grid) for the nyc-taxi experiment.
nyctaxi_metrics = [
    'index_size',
    'build_time',
    'dquery_throughput_a',
    'dquery_throughput_b',
    'rquery_throughput_a',
    'rquery_throughput_b',
]

results_nyctaxi = get_experiment_results('nyc-taxi')
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(8, 12))

for ax, metric in zip(axs.flat, nyctaxi_metrics):
    plot_taxi(ax, results_nyctaxi, metric)

# Every panel draws the same three labelled lines, so the last panel
# (still bound to `ax`) supplies the entries of the shared legend.
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
fig.tight_layout()

## 1.2 Baseline comparisons

In [None]:
def plot_taxi_exps(ax, results, metric, idx):
    """Plot one metric of a single index for every experiment on log-log axes.

    Args:
        ax: Matplotlib axes to draw on.
        results: Mapping ``results[experiment][metric][idx] -> sequence of y
            values``; one line is drawn per experiment, labelled with the
            experiment name. Each sequence must have as many values as
            ``LABELS[metric]['x']``.
        metric: Key into ``LABELS`` and into each experiment's results.
        idx: Index name whose values are plotted; also shown in the title.
    """
    spec = LABELS[metric]
    X = spec['x']

    markers = ['s', '1', '2', '3', '4']
    for i, exp in enumerate(results.keys()):
        # Cycle through the markers so that experiments beyond the fifth are
        # still drawn instead of being silently dropped.
        marker = markers[i % len(markers)]
        ax.plot(X, results[exp][metric][idx], marker=marker, label=exp, linewidth=1)

    ax.set_title(f"{spec['title']} - {idx}")
    # Switch to log scale BEFORE installing the custom ticks: changing the
    # scale resets the axis' tick locator and formatter to the scale's
    # defaults, which would discard ticks/labels set earlier.
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xticks(X, spec['xticks'])
    ax.set_xlabel(spec['xlabel'])
    ax.set_ylabel(spec['ylabel'])
    # No per-axes legend: the calling cell builds a single figure-level
    # legend from the line labels set above.

In [None]:
# Comparison figure: rows are metrics, columns are indexes, and every
# panel overlays all taxi experiments.
taxi_experiments = ['nyc-taxi', 'syracuse-taxi', 'aogaki-taxi', 'germany-taxi', 'japan-taxi']
comparison_metrics = [
    'index_size',
    'build_time',
    'dquery_throughput_a',
    'dquery_throughput_b',
    'rquery_throughput_a',
    'rquery_throughput_b',
]
comparison_indexes = ['geos_strtree', 'geos_quadtree', 's2_pointindex']

exp_results = {exp: get_experiment_results(exp) for exp in taxi_experiments}

fig, axs = plt.subplots(nrows=6, ncols=3, figsize=(12, 24))

for row, metric in zip(axs, comparison_metrics):
    for ax, idx in zip(row, comparison_indexes):
        plot_taxi_exps(ax, exp_results, metric, idx)

# Every panel carries one labelled line per experiment, so the last panel
# (still bound to `ax`) supplies the entries of the shared legend.
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=5, bbox_to_anchor=(0.5, 1.015))
fig.tight_layout()

# 2. Synthetic datasets

In [None]:
def plot_synthetic(ax, results, metric, idx):
    """Draw a bar chart comparing one metric of one index across experiments.

    Args:
        ax: Matplotlib axes to draw on.
        results: Mapping ``results[experiment][metric][idx] -> sequence``;
            only the first element of each sequence is plotted.
        metric: Key into ``LABELS`` and into each experiment's results.
        idx: Index name whose value is plotted; also shown in the title.
    """
    experiments = list(results.keys())
    heights = [results[exp][metric][idx][0] for exp in experiments]

    # One bar per experiment, colored from COLORS; the experiment names are
    # attached as bar labels rather than x tick labels.
    ax.bar(
        experiments,
        heights,
        label=experiments,
        color=COLORS,
        width=0.3,
        edgecolor='black',
    )

    spec = LABELS[metric]
    ax.set_title(f"{spec['title']} - {idx}")
    ax.set_xticks([])  # hide the x ticks; names are carried by the bar labels
    ax.set_ylabel(spec['ylabel'])
    ax.grid(True, linewidth=0.5, linestyle='--')

In [None]:
# Synthetic-dataset figure: rows are metrics, columns are indexes, and every
# panel shows one bar per experiment. Only the '25m' size is loaded.
synthetic_experiments = [
    'nyc-taxi',
    'synthetic-nyc',
    'synthetic-tokyo',
    'synthetic-tokyo-nyc',
    'synthetic-delhi',
    'synthetic-delhi-nyc',
    'synthetic-saopaolo',
    'synthetic-saopaolo-nyc',
]
synthetic_metrics = ['index_size', 'build_time', 'dquery_throughput_a', 'rquery_throughput_a']
synthetic_indexes = ['geos_strtree', 'geos_quadtree', 's2_pointindex']

exp_results = {
    exp: get_experiment_results(exp, idx_sizes=['25m'])
    for exp in synthetic_experiments
}

fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(12, 12))

for row, metric in zip(axs, synthetic_metrics):
    for ax, idx in zip(row, synthetic_indexes):
        plot_synthetic(ax, exp_results, metric, idx)

# Every panel carries the same labelled bars, so the last panel
# (still bound to `ax`) supplies the entries of the shared legend.
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=4, bbox_to_anchor=(0.5, 1.05))
fig.tight_layout()