In [1]:
import json
from texttable import Texttable
import latextable
from tabulate import tabulate
import sys
file_directory = "../"
sys.path.append(file_directory)
from generateDataset import generateDatasetFromResults
from metric import statisticalSignificanceByTemplate
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MultipleLocator
from scipy.optimize import curve_fit
from scipy.stats import pearsonr
from matplotlib.ticker import FormatStrFormatter
from matplotlib.lines import Line2D
import statistics

# Datasets

In [2]:
shapeIndexPathResult = "../results/standard/shape_index_result.json"
shapeIndexPathSummary = "../results/standard/summary_shape_index_result.json"
shapeIndexDataset = generateDatasetFromResults(shapeIndexPathResult, shapeIndexPathSummary, "shape index")

In [3]:
typeIndexLdpPathResult = "../results/standard/type_index_ldp_result.json"
typeIndexLdpPathSummary = "../results/standard/summary_type_index_ldp_result.json"
typeIndexLdpDataset = generateDatasetFromResults(typeIndexLdpPathResult, typeIndexLdpPathSummary, "type index and ldp")

In [4]:
shapeIndex20PathResult = "../results/shape-entry-20-percent/shape_index_result.json"
shapeIndex20PathSummary = "../results/shape-entry-20-percent/summary_shape_index_result.json"
shapeIndex20Dataset = generateDatasetFromResults(shapeIndex20PathResult, shapeIndex20PathSummary, "shape index entries 20%")

In [5]:
shapeIndex50PathResult = "../results/shape-entry-50-percent/shape_index_result.json"
shapeIndex50PathSummary = "../results/shape-entry-50-percent/summary_shape_index_result.json"
shapeIndex50Dataset = generateDatasetFromResults(shapeIndex50PathResult, shapeIndex50PathSummary, "shape index entries 50%")

In [6]:
shapeIndex80PathResult = "../results/shape-entry-80-percent/shape_index_result.json"
shapeIndex80PathSummary = "../results/shape-entry-80-percent/summary_shape_index_result.json"
shapeIndex80Dataset = generateDatasetFromResults(shapeIndex80PathResult, shapeIndex80PathSummary, "shape index entries 80%")

In [25]:
shapeIndex0PathResult = "../results/shape-index-0-percent/shape_index_result.json"
shapeIndex0PathSummary = "../results/shape-index-0-percent/summary_shape_index_result.json"
shapeIndex0NetworkDataset = generateDatasetFromResults(shapeIndex0PathResult, shapeIndex0PathSummary, "shape index network 0%")

In [26]:
shapeIndex20PathResult = "../results/shape-index-20-percent/shape_index_result.json"
shapeIndex20PathSummary = "../results/shape-index-20-percent/summary_shape_index_result.json"
shapeIndex20NetworkDataset = generateDatasetFromResults(shapeIndex20PathResult, shapeIndex20PathSummary, "shape index network 20%")

In [27]:
shapeIndex50PathResult = "../results/shape-index-50-percent/shape_index_result.json"
shapeIndex50PathSummary = "../results/shape-index-50-percent/summary_shape_index_result.json"
shapeIndex50NetworkDataset = generateDatasetFromResults(shapeIndex50PathResult, shapeIndex50PathSummary, "shape index network 50%")

In [28]:
shapeIndex80PathResult = "../results/shape-index-80-percent/shape_index_result.json"
shapeIndex80PathSummary = "../results/shape-index-80-percent/summary_shape_index_result.json"
shapeIndex80NetworkDataset = generateDatasetFromResults(shapeIndex80PathResult, shapeIndex80PathSummary, "shape index network 80%")

# Statistical significance

In [7]:
head = ["query template", "relation execution time", "p-value"]

In [8]:
def generateTableInfo(results):  
    rows = []
    p_value_significant = 0.05
    for query_template, value in results.items():
        relation  = "-"
        p_value= "-"
        if value["greater"] is not None:
            if value["different"] > p_value_significant:
                relation = "similar"
                p_value = f"{value["greater"]:.2E} (different distribution)"
            elif value["greater"] < p_value_significant:
                relation = "greater"
                p_value = f"{value["greater"]:.2E}"
            elif value["lesser"] < p_value_significant:
                relation = "lesser"
                p_value = f"{value["lesser"]:.2E}"
        if relation == "-":
            continue
        row = [
            query_template,
            relation,
            p_value,
              ]
        rows.append(row)
    return rows

## Shape index vs the state of the art

In [9]:
results = {}

In [10]:
for template, execution_times in shapeIndexDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        typeIndexLdpDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [11]:
rows = generateTableInfo(results)

In [12]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | lesser                    | 1.14E-36                          |
| interactive-discover-2 | lesser                    | 4.42E-04                          |
| interactive-discover-3 | similar                   | 7.47E-01 (different distribution) |
| interactive-discover-4 | lesser                    | 2.07E-17                          |
| interactive-discover-5 | lesser                    | 5.58E-03                          |
| interactive-discover-6 | similar                   | 2.56E-01 (different distribution) |
| interactive-discover-7 | similar                   | 7.83E-01 (different distribution) |
| interactive-short-1    | lesser                    | 1.12E-83                          |
| interactive-short-4    | greater                   | 3.76E-22                          |

## shape index 20 percent entries vs shape index

In [13]:
results = {}

In [14]:
for template, execution_times in shapeIndex20Dataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [15]:
rows = generateTableInfo(results)

In [16]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | similar                   | 3.53E-01 (different distribution) |
| interactive-discover-2 | lesser                    | 3.62E-23                          |
| interactive-discover-3 | lesser                    | 6.74E-05                          |
| interactive-discover-4 | similar                   | 7.71E-01 (different distribution) |
| interactive-discover-5 | greater                   | 1.25E-02                          |
| interactive-discover-6 | lesser                    | 8.19E-16                          |
| interactive-discover-7 | lesser                    | 2.22E-16                          |
| interactive-short-1    | greater                   | 1.84E-31                          |
| interactive-short-4    | lesser                    | 1.17E-02                          |

## shape index 50 percent entries vs shape index

In [17]:
results = {}

In [18]:
for template, execution_times in shapeIndex50Dataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [19]:
rows = generateTableInfo(results)

In [20]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | greater                   | 1.07E-04                          |
| interactive-discover-2 | lesser                    | 1.69E-17                          |
| interactive-discover-3 | similar                   | 9.06E-01 (different distribution) |
| interactive-discover-4 | greater                   | 1.52E-03                          |
| interactive-discover-5 | greater                   | 1.29E-04                          |
| interactive-discover-6 | lesser                    | 4.65E-17                          |
| interactive-discover-7 | lesser                    | 4.49E-17                          |
| interactive-short-1    | greater                   | 1.12E-83                          |
| interactive-short-4    | lesser                    | 1.22E-08                          |

## shape index 80 percent entries vs shape index

In [21]:
results = {}

In [22]:
for template, execution_times in shapeIndex80Dataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [23]:
rows = generateTableInfo(results)

In [24]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | similar                   | 5.01E-01 (different distribution) |
| interactive-discover-2 | lesser                    | 1.69E-18                          |
| interactive-discover-3 | lesser                    | 3.97E-03                          |
| interactive-discover-4 | similar                   | 4.72E-01 (different distribution) |
| interactive-discover-5 | similar                   | 1.00E-01 (different distribution) |
| interactive-discover-6 | lesser                    | 4.44E-17                          |
| interactive-discover-7 | lesser                    | 4.44E-17                          |
| interactive-short-1    | greater                   | 1.12E-83                          |
| interactive-short-4    | lesser                    | 3.08E-13                          |

## shape index 0 percent network vs shape index

In [41]:
results = {}

In [42]:
for template, execution_times in shapeIndex0NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [43]:
rows = generateTableInfo(results)

In [44]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | greater                   | 3.76E-37                          |
| interactive-discover-2 | greater                   | 1.24E-02                          |
| interactive-discover-3 | similar                   | 3.72E-01 (different distribution) |
| interactive-discover-4 | greater                   | 1.53E-16                          |
| interactive-discover-5 | similar                   | 2.23E-01 (different distribution) |
| interactive-discover-6 | lesser                    | 5.35E-06                          |
| interactive-discover-7 | lesser                    | 4.59E-05                          |
| interactive-short-1    | greater                   | 1.44E-83                          |
| interactive-short-4    | lesser                    | 5.38E-18                          |

## shape index 20 percent network vs shape index

In [29]:
results = {}

In [30]:
for template, execution_times in shapeIndex20NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [31]:
rows = generateTableInfo(results)

In [32]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | greater                   | 8.73E-36                          |
| interactive-discover-2 | greater                   | 2.16E-02                          |
| interactive-discover-3 | similar                   | 2.28E-01 (different distribution) |
| interactive-discover-4 | greater                   | 6.38E-17                          |
| interactive-discover-5 | similar                   | 7.28E-02 (different distribution) |
| interactive-discover-6 | lesser                    | 9.28E-06                          |
| interactive-discover-7 | lesser                    | 1.97E-04                          |
| interactive-short-1    | greater                   | 1.12E-83                          |
| interactive-short-4    | lesser                    | 1.30E-13                          |

## shape index 50 percent network vs shape index

In [33]:
results = {}

In [34]:
for template, execution_times in shapeIndex50NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [35]:
rows = generateTableInfo(results)

In [36]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | similar                   | 3.71E-01 (different distribution) |
| interactive-discover-2 | similar                   | 3.92E-01 (different distribution) |
| interactive-discover-3 | similar                   | 6.65E-01 (different distribution) |
| interactive-discover-4 | similar                   | 5.72E-01 (different distribution) |
| interactive-discover-5 | similar                   | 2.73E-01 (different distribution) |
| interactive-discover-6 | similar                   | 5.45E-01 (different distribution) |
| interactive-discover-7 | similar                   | 3.68E-01 (different distribution) |
| interactive-short-1    | similar                   | 7.26E-01 (different distribution) |
| interactive-short-4    | lesser                    | 7.12E-05                          |

## shape index 80 percent network vs shape index

In [37]:
results = {}

In [38]:
for template, execution_times in shapeIndex80NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

In [39]:
rows = generateTableInfo(results)

In [40]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |
|------------------------|---------------------------|-----------------------------------|
| interactive-discover-1 | similar                   | 6.50E-01 (different distribution) |
| interactive-discover-2 | similar                   | 5.17E-01 (different distribution) |
| interactive-discover-3 | similar                   | 4.38E-01 (different distribution) |
| interactive-discover-4 | similar                   | 6.35E-01 (different distribution) |
| interactive-discover-5 | similar                   | 7.73E-01 (different distribution) |
| interactive-discover-6 | similar                   | 4.67E-01 (different distribution) |
| interactive-discover-7 | similar                   | 4.48E-01 (different distribution) |
| interactive-short-1    | similar                   | 8.65E-01 (different distribution) |
| interactive-short-4    | lesser                    | 8.40E-03                          |