In [1]:
import json
from texttable import Texttable
import latextable
from tabulate import tabulate
import sys
file_directory = "../"
sys.path.append(file_directory)
from generateDataset import generateDatasetFromResults
from metric import statisticalSignificanceByTemplate
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MultipleLocator
from scipy.optimize import curve_fit
from scipy.stats import pearsonr
from matplotlib.ticker import FormatStrFormatter
from matplotlib.lines import Line2D
import statistics
from plotsVariation import calculatePercentageReductionSeries, statisticTemplateMetric

# Datasets

In [2]:
shapeIndexPathResult = "../results/standard/shape_index_result.json"
shapeIndexPathSummary = "../results/standard/summary_shape_index_result.json"
shapeIndexDataset = generateDatasetFromResults(shapeIndexPathResult, shapeIndexPathSummary, "shape index")

In [3]:
typeIndexLdpPathResult = "../results/standard/type_index_ldp_result.json"
typeIndexLdpPathSummary = "../results/standard/summary_type_index_ldp_result.json"
typeIndexLdpDataset = generateDatasetFromResults(typeIndexLdpPathResult, typeIndexLdpPathSummary, "type index and ldp")

In [4]:
shapeIndex20PathResult = "../results/shape-entry-20-percent/shape_index_result.json"
shapeIndex20PathSummary = "../results/shape-entry-20-percent/summary_shape_index_result.json"
shapeIndex20Dataset = generateDatasetFromResults(shapeIndex20PathResult, shapeIndex20PathSummary, "shape index entries 20%")

In [5]:
shapeIndex50PathResult = "../results/shape-entry-50-percent/shape_index_result.json"
shapeIndex50PathSummary = "../results/shape-entry-50-percent/summary_shape_index_result.json"
shapeIndex50Dataset = generateDatasetFromResults(shapeIndex50PathResult, shapeIndex50PathSummary, "shape index entries 50%")

In [6]:
shapeIndex80PathResult = "../results/shape-entry-80-percent/shape_index_result.json"
shapeIndex80PathSummary = "../results/shape-entry-80-percent/summary_shape_index_result.json"
shapeIndex80Dataset = generateDatasetFromResults(shapeIndex80PathResult, shapeIndex80PathSummary, "shape index entries 80%")

In [7]:
shapeIndex0PathResult = "../results/shape-index-0-percent/shape_index_result.json"
shapeIndex0PathSummary = "../results/shape-index-0-percent/summary_shape_index_result.json"
shapeIndex0NetworkDataset = generateDatasetFromResults(shapeIndex0PathResult, shapeIndex0PathSummary, "shape index network 0%")

In [8]:
shapeIndex20PathResult = "../results/shape-index-20-percent/shape_index_result.json"
shapeIndex20PathSummary = "../results/shape-index-20-percent/summary_shape_index_result.json"
shapeIndex20NetworkDataset = generateDatasetFromResults(shapeIndex20PathResult, shapeIndex20PathSummary, "shape index network 20%")

In [9]:
shapeIndex50PathResult = "../results/shape-index-50-percent/shape_index_result.json"
shapeIndex50PathSummary = "../results/shape-index-50-percent/summary_shape_index_result.json"
shapeIndex50NetworkDataset = generateDatasetFromResults(shapeIndex50PathResult, shapeIndex50PathSummary, "shape index network 50%")

In [10]:
shapeIndex80PathResult = "../results/shape-index-80-percent/shape_index_result.json"
shapeIndex80PathSummary = "../results/shape-index-80-percent/summary_shape_index_result.json"
shapeIndex80NetworkDataset = generateDatasetFromResults(shapeIndex80PathResult, shapeIndex80PathSummary, "shape index network 80%")

In [11]:
shapeIndexInnerPathResult = "../results/shape-inner/shape_index_result.json"
shapeIndexInnerPathSummary = "../results/shape-inner/summary_shape_index_result.json"
shapeIndexInnerDataset = generateDatasetFromResults(shapeIndexInnerPathResult, shapeIndexInnerPathSummary, "Dataset shape model")

In [12]:
shapeIndexMinimalPathResult = "../results/shape-minimal/shape_index_result.json"
shapeIndexMinimalPathSummary = "../results/shape-minimal/summary_shape_index_result.json"
shapeIndexMinimalDataset = generateDatasetFromResults(shapeIndexMinimalPathResult, shapeIndexMinimalPathSummary, "Minimal model")

# Statistical significance

In [13]:
head = ["query template", "relation execution time", "p-value", "average ratio HTTP request"]

In [14]:
def generateTableInfo(results, stat_http_req):  
    rows = []
    p_value_significant = 0.05
    for query_template, value in results.items():
        relation  = "-"
        p_value= "-"
        ratio_Http_Req = "-"
        if value["greater"] is not None:
            if value["different"] > p_value_significant:
                relation = "similar"
                p_value = f"{value["greater"]:.2E} (different distribution)"
            elif value["greater"] < p_value_significant:
                relation = "greater"
                p_value = f"{value["greater"]:.2E}"
            elif value["lesser"] < p_value_significant:
                relation = "lesser"
                p_value = f"{value["lesser"]:.2E}"
        if relation == "-":
            continue
        avg_http_req = stat_http_req[query_template]['avg']
        if not np.isnan(avg_http_req):
            ratio_Http_Req = f"{float(avg_http_req):.2f}"
        row = [
            query_template,
            relation,
            p_value,
            ratio_Http_Req
              ]
        rows.append(row)
    return rows

## Shape index vs the state of the art

In [15]:
results = {}

In [16]:
for template, execution_times in shapeIndexDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        typeIndexLdpDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndexDataset.numberHttpRequest, typeIndexLdpDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [17]:
rows = generateTableInfo(results, stat_http_req)

In [18]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | lesser                    | 1.14E-36                          |                         0.57 |
| interactive-discover-2 | lesser                    | 4.42E-04                          |                         0.88 |
| interactive-discover-3 | similar                   | 7.47E-01 (different distribution) |                         0.97 |
| interactive-discover-4 | lesser                    | 2.07E-17                          |                         0.65 |
| interactive-discover-5 | lesser                    | 5.58E-03                          |                         0.88 |
| interactive-discover-6 | similar                   | 2.56E-01 (different distribution) |                         1.12 |
| interactive-discover-7

## shape index 20 percent entries vs shape index

In [19]:
results = {}

In [20]:
for template, execution_times in shapeIndex20Dataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndex20Dataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [21]:
rows = generateTableInfo(results, stat_http_req)

In [22]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | similar                   | 3.53E-01 (different distribution) |                         2.4  |
| interactive-discover-2 | lesser                    | 3.62E-23                          |                         1.09 |
| interactive-discover-3 | lesser                    | 6.74E-05                          |                         1.02 |
| interactive-discover-4 | similar                   | 7.71E-01 (different distribution) |                         1.29 |
| interactive-discover-5 | greater                   | 1.25E-02                          |                         1.09 |
| interactive-discover-6 | lesser                    | 8.19E-16                          |                         0.97 |
| interactive-discover-7

## shape index 50 percent entries vs shape index

In [23]:
results = {}

In [24]:
for template, execution_times in shapeIndex50Dataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }

reduction_http_req = calculatePercentageReductionSeries(shapeIndex50Dataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [25]:
rows = generateTableInfo(results, stat_http_req)

In [26]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | greater                   | 1.07E-04                          |                         2.54 |
| interactive-discover-2 | lesser                    | 1.69E-17                          |                         1.11 |
| interactive-discover-3 | similar                   | 9.06E-01 (different distribution) |                         1.02 |
| interactive-discover-4 | greater                   | 1.52E-03                          |                         1.67 |
| interactive-discover-5 | greater                   | 1.29E-04                          |                         1.11 |
| interactive-discover-6 | lesser                    | 4.65E-17                          |                         0.88 |
| interactive-discover-7

## shape index 80 percent entries vs shape index

In [27]:
results = {}

In [28]:
for template, execution_times in shapeIndex80Dataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndex80Dataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [29]:
rows = generateTableInfo(results, stat_http_req)

In [30]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | similar                   | 5.01E-01 (different distribution) |                         2.26 |
| interactive-discover-2 | lesser                    | 1.69E-18                          |                         1.01 |
| interactive-discover-3 | lesser                    | 3.97E-03                          |                         1    |
| interactive-discover-4 | similar                   | 4.72E-01 (different distribution) |                         1.41 |
| interactive-discover-5 | similar                   | 1.00E-01 (different distribution) |                         1.01 |
| interactive-discover-6 | lesser                    | 4.44E-17                          |                         0.86 |
| interactive-discover-7

## shape index 0 percent network vs shape index

In [31]:
results = {}

In [32]:
for template, execution_times in shapeIndex0NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndex0NetworkDataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [33]:
rows = generateTableInfo(results, stat_http_req)

In [34]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | greater                   | 3.76E-37                          |                         2.36 |
| interactive-discover-2 | greater                   | 1.24E-02                          |                         0.97 |
| interactive-discover-3 | similar                   | 3.72E-01 (different distribution) |                         1    |
| interactive-discover-4 | greater                   | 1.53E-16                          |                         1.49 |
| interactive-discover-5 | similar                   | 2.23E-01 (different distribution) |                         0.97 |
| interactive-discover-6 | lesser                    | 5.35E-06                          |                         0.71 |
| interactive-discover-7

## shape index 20 percent network vs shape index

In [35]:
results = {}

In [36]:
for template, execution_times in shapeIndex20NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndex20NetworkDataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [37]:
rows = generateTableInfo(results, stat_http_req)

In [38]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | greater                   | 8.73E-36                          |                         2.36 |
| interactive-discover-2 | greater                   | 2.16E-02                          |                         0.97 |
| interactive-discover-3 | similar                   | 2.28E-01 (different distribution) |                         1    |
| interactive-discover-4 | greater                   | 6.38E-17                          |                         1.49 |
| interactive-discover-5 | similar                   | 7.28E-02 (different distribution) |                         0.97 |
| interactive-discover-6 | lesser                    | 9.28E-06                          |                         0.71 |
| interactive-discover-7

## shape index 50 percent network vs shape index

In [39]:
results = {}

In [40]:
for template, execution_times in shapeIndex50NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndex50NetworkDataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [41]:
rows = generateTableInfo(results, stat_http_req)

In [42]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | similar                   | 3.71E-01 (different distribution) |                         1    |
| interactive-discover-2 | similar                   | 3.92E-01 (different distribution) |                         1    |
| interactive-discover-3 | similar                   | 6.65E-01 (different distribution) |                         1    |
| interactive-discover-4 | similar                   | 5.72E-01 (different distribution) |                         1    |
| interactive-discover-5 | similar                   | 2.73E-01 (different distribution) |                         1    |
| interactive-discover-6 | similar                   | 5.45E-01 (different distribution) |                         1    |
| interactive-discover-7

## shape index 80 percent network vs shape index

In [43]:
results = {}

In [44]:
for template, execution_times in shapeIndex80NetworkDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndex80NetworkDataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [45]:
rows = generateTableInfo(results, stat_http_req)

In [46]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | similar                   | 6.50E-01 (different distribution) |                         1    |
| interactive-discover-2 | similar                   | 5.17E-01 (different distribution) |                         1    |
| interactive-discover-3 | similar                   | 4.38E-01 (different distribution) |                         1    |
| interactive-discover-4 | similar                   | 6.35E-01 (different distribution) |                         1    |
| interactive-discover-5 | similar                   | 7.73E-01 (different distribution) |                         1    |
| interactive-discover-6 | similar                   | 4.67E-01 (different distribution) |                         1    |
| interactive-discover-7

## shape index inner shapes vs shape index

In [47]:
results = {}

In [48]:
for template, execution_times in shapeIndexInnerDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndexInnerDataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [49]:
rows = generateTableInfo(results, stat_http_req)

In [50]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | lesser                    | 9.47E-03                          |                         0.8  |
| interactive-discover-2 | similar                   | 3.50E-01 (different distribution) |                         0.85 |
| interactive-discover-3 | similar                   | 4.21E-01 (different distribution) |                         0.97 |
| interactive-discover-4 | similar                   | 7.05E-01 (different distribution) |                         0.84 |
| interactive-discover-5 | similar                   | 3.18E-01 (different distribution) |                         0.85 |
| interactive-discover-6 | lesser                    | 1.50E-02                          |                         0.84 |
| interactive-discover-7

## shape index minimal shapes vs shape index

In [51]:
results = {}

In [52]:
for template, execution_times in shapeIndexMinimalDataset.executionTime.items():
    
    (p_value_greater, p_value_different, p_value_lesser) = statisticalSignificanceByTemplate(
        execution_times,
        shapeIndexDataset.executionTime[template]
    )

    results[template] = {
        "greater": p_value_greater,
        "lesser": p_value_lesser,
        "different":p_value_different,
    }
reduction_http_req = calculatePercentageReductionSeries(shapeIndexMinimalDataset.numberHttpRequest, shapeIndexDataset.numberHttpRequest)

stat_http_req = statisticTemplateMetric(reduction_http_req)

In [53]:
rows = generateTableInfo(results, stat_http_req)

In [54]:
print(tabulate(rows, headers=head, tablefmt="github"))

| query template         | relation execution time   | p-value                           |   average ratio HTTP request |
|------------------------|---------------------------|-----------------------------------|------------------------------|
| interactive-discover-1 | lesser                    | 3.08E-05                          |                         0.67 |
| interactive-discover-2 | similar                   | 5.18E-01 (different distribution) |                         0.75 |
| interactive-discover-3 | similar                   | 3.90E-01 (different distribution) |                         0.95 |
| interactive-discover-4 | similar                   | 9.17E-01 (different distribution) |                         0.74 |
| interactive-discover-5 | similar                   | 7.27E-01 (different distribution) |                         0.75 |
| interactive-discover-6 | lesser                    | 8.12E-04                          |                         0.74 |
| interactive-discover-7