# Scorpion Synthetic Data Experiment

This notebook contains the code to reproduce the BOExplain results for the experiment with Scorpion's synthetic data and corresponding query. The datasets are in the _data_ folder.

**Run the following cells in order to reproduce the experiment from the paper.**

In [None]:
from boexplain import fmax
import pandas as pd
import numpy as np

## Scorpion's Objective Function

In [None]:
def obj(updated_df):
    """Scorpion's objective for the data left after removing a predicate's tuples.

    Reads the module-level ``aggs_orig`` frame (per-group ``sum`` and ``size``
    of ``v`` on the original data), which must be defined before calling.

    Parameters
    ----------
    updated_df : pandas.DataFrame
        The data remaining after removal; must have the columns ``g``
        (group key) and ``v`` (aggregated value).

    Returns
    -------
    float
        ``0.5 * (mean influence of groups 5-9)
        - 0.5 * (largest absolute influence among the other groups)``.
    """
    outlier_groups = [5, 6, 7, 8, 9]

    # aggregates after removing the tuples satisfying a predicate
    aggs = updated_df.groupby("g")["v"].agg(["sum", "size"])
    # A group that lost every tuple is absent from the groupby result, which
    # would make its influence NaN (poisoning the outlier mean, or being
    # silently skipped by max() for a holdout). Restore it with sum=0, size=0.
    aggs = aggs.reindex(aggs_orig.index, fill_value=0)

    # influences: change in sum divided by (number of removed tuples) ** 0.2;
    # the 1e-9 keeps the denominator non-zero when nothing was removed
    infs = (aggs_orig["sum"] - aggs["sum"]) / ((aggs_orig["size"] - aggs["size"] + 1e-9) ** 0.2)

    # outlier influences
    outl_inf = np.mean([infs.loc[g] for g in outlier_groups])

    # the holdout influences
    hold_infs = infs.drop(outlier_groups, axis="rows")
    # holdout with the largest influence
    max_hold_inf = abs(hold_infs).max()

    # final result from Scorpion
    return 0.5 * outl_inf - 0.5 * max_hold_inf

## 2D Easy

In [None]:
# Load the 2D "easy" synthetic dataset.
df = pd.read_csv("data/scorpion_synth2d_easy.csv")

# Per-group sum and size of v on the untouched data. obj() reads this
# module-level frame to compare against the aggregates after removal.
aggs_orig = df.groupby("g")["v"].agg(["sum", "size"]).copy()

# Run BOExplain's fmax with obj as the objective over the numerical
# columns a_0 and a_1, asking for the visualization to be returned too.
df_rem, viz = fmax(
    f=obj,
    data=df,
    num_cols=["a_0", "a_1"],
    runs=10,
    runtime=30,
    increment=1,
    use_seeds_from_paper=True,
    return_viz=True,
    name="scorpion_synth2d_easy",
    file="scorpion_synth2d_easy.json",
)
viz

## 2D Hard

In [None]:
# Load the 2D "hard" synthetic dataset. The datasets live in the data/ folder
# (as in the 2D Easy cell), so the path carries the same prefix.
df = pd.read_csv("data/scorpion_synth2d_hard.csv")
# original aggregates before removing data; obj() reads this module-level frame
aggs_orig = df.groupby("g")["v"].agg(["sum", "size"]).copy()

# Run BOExplain's fmax with obj as the objective over a_0 and a_1.
df_rem, viz = fmax(
    data=df,
    f=obj,
    num_cols=["a_0", "a_1"],
    n_trials=1000,
    runtime=30,
    runs=10,
    increment=1,
    name="scorpion_synth2d_hard",
    file="scorpion_synth2d_hard.json",
    return_viz=True,
    use_seeds_from_paper=True,
)
viz

## 3D Easy

In [None]:
# Load the 3D "easy" synthetic dataset. The datasets live in the data/ folder
# (as in the 2D Easy cell), so the path carries the same prefix.
df = pd.read_csv("data/scorpion_synth3d_easy.csv")
# original aggregates before removing data; obj() reads this module-level frame
aggs_orig = df.groupby("g")["v"].agg(["sum", "size"]).copy()

# Run BOExplain's fmax with obj as the objective over a_0, a_1 and a_2.
df_rem, viz = fmax(
    data=df,
    f=obj,
    num_cols=["a_0", "a_1", "a_2"],
    n_trials=1000,
    runtime=60,
    runs=10,
    increment=1,
    name="scorpion_synth3d_easy",
    file="scorpion_synth3d_easy.json",
    return_viz=True,
    use_seeds_from_paper=True,
)
viz

## 3D Hard

In [None]:
# Load the 3D "hard" synthetic dataset. The datasets live in the data/ folder
# (as in the 2D Easy cell), so the path carries the same prefix.
df = pd.read_csv("data/scorpion_synth3d_hard.csv")
# original aggregates before removing data; obj() reads this module-level frame
aggs_orig = df.groupby("g")["v"].agg(["sum", "size"]).copy()

# Run BOExplain's fmax with obj as the objective over a_0, a_1 and a_2.
df_rem, viz = fmax(
    data=df,
    f=obj,
    num_cols=["a_0", "a_1", "a_2"],
    n_trials=1000,
    runtime=60,
    runs=10,
    increment=1,
    name="scorpion_synth3d_hard",
    file="scorpion_synth3d_hard.json",
    return_viz=True,
    use_seeds_from_paper=True,
)
viz

# Recreate Figure 4

After obtaining the results above, run the following cells to recreate Figure 4.

In [1]:
import re
from json import loads

import altair as alt
import numpy as np
import pandas as pd

alt.data_transformers.disable_max_rows()

# Constant reference series for Scorpion: time steps 1-30 repeated 10 times.
df_scorp = pd.DataFrame({"Algorithm": ["Scorpion"] * 300})
df_scorp["Time (seconds)"] = list(range(1, 31)) * 10
df_scorp["Value"] = 5694.716891020526 # synth2d_easy
# Blank the first 8 seconds so the Scorpion line only starts at second 9
# (presumably the time Scorpion needed to return its result -- TODO confirm).
df_scorp.loc[df_scorp["Time (seconds)"] <= 8, "Value"] = np.nan

# Constant reference series for MacroBase over the same time grid.
df_mb = pd.DataFrame({"Algorithm": ["MacroBase"] * 300})
df_mb["Time (seconds)"] = list(range(1, 31)) * 10
df_mb["Value"] = 3963.038686566325 # synth2d_easy

# The results file holds one JSON document per line; key them by line number.
with open("results/scorpion_synth2d_easy.json", "r") as fo:
    experiments = {i: loads(line) for i, line in enumerate(fo)}

# One frame per experiment. The empty seed frame fixes the column layout even
# when there are no experiments. DataFrame.append was removed in pandas 2.0,
# so the pieces are combined with pd.concat instead.
frames = [pd.DataFrame({}, columns=["Algorithm", "Time (seconds)", "Value"])]
for experiment in experiments.values():
    frames.append(
        pd.DataFrame.from_dict(
            {
                "Algorithm": experiment["cat_enc"],
                "Time (seconds)": list(range(1, experiment["runtime"] + 1)),
                "Value": experiment["time_array"],
            },
            orient="index",
        ).T
    )
df = pd.concat(frames)
# Unnest the list-valued cells into one row per element.
df = df.explode("Value")
df = df.set_index(["Algorithm"]).apply(pd.Series.explode).reset_index()
df = pd.concat([df, df_scorp, df_mb])
# Show the encoding's internal name as "BOExplain" in the legend.
df["Algorithm"] = df["Algorithm"].replace({"individual_contribution_warm_start_top1": "BOExplain"})

# Human-readable description of the experiment, built from the first record.
# It is not attached to the chart below.
num_cols = f"{len(experiments[0]['num_cols'])} numerical columns: "
for i, col in enumerate(experiments[0]["num_cols"]):
    num_cols += f"{col} (range {experiments[0]['num_cols_range'][i][0]} to {experiments[0]['num_cols_range'][i][1]}), "
# Report the number of categorical columns (matching the numerical label above)
# rather than interpolating the list itself.
cat_cols = f"{len(experiments[0]['cat_cols'])} categorical columns: "
# NOTE(review): this reads 'num_cols_n_uniq' for the categorical columns; a
# categorical-specific key may have been intended -- confirm against the
# results schema before relying on this text.
for i, col in enumerate(experiments[0]["cat_cols"]):
    cat_cols += f"{col} ({experiments[0]['num_cols_n_uniq'][i]} unique values), "

out_str = f"Experiment: {experiments[0]['name']}. Completed {experiments[0]['n_trials']} iterations for {experiments[0]['runs']} runs. Search space includes "

if len(experiments[0]['num_cols']) > 0:
    out_str += num_cols
    if len(experiments[0]['cat_cols']) > 0:
        out_str += "and "

if len(experiments[0]['cat_cols']) > 0:
    out_str += cat_cols

# Swap the trailing ", " for a full stop.
out_str = f"{out_str[:-2]}."

# Wrap the description into lines of at most 140 characters.
out_lst = [line.strip() for line in re.findall(r'.{1,140}(?:\s+|$)', out_str)]

# Mean of Value at each time step, one line per algorithm.
line = alt.Chart(df, title="2D Easy").mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Value)', title=None),
    color="Algorithm",
).properties(
    width=200,
    height=90
)
# Standard-deviation band; built but not layered onto the panel.
band = alt.Chart(df).mark_errorband(extent='stdev').encode(
    x='Time (seconds)',
    y=alt.Y('Value', title=None),
    color="Algorithm"
)
chart1 = line
chart1

In [2]:
import re
from json import loads

import altair as alt

alt.data_transformers.disable_max_rows()

# Constant reference series for Scorpion: time steps 1-30 repeated 10 times.
df_scorp = pd.DataFrame({"Algorithm": ["Scorpion"] * 300})
df_scorp["Time (seconds)"] = list(range(1, 31)) * 10
df_scorp["Value"] = 1650.6794900572083 # synth2d_hard
# Blank the first 8 seconds so the Scorpion line only starts at second 9
# (presumably the time Scorpion needed to return its result -- TODO confirm).
df_scorp.loc[df_scorp["Time (seconds)"] <= 8, "Value"] = np.nan

# Constant reference series for MacroBase over the same time grid.
df_mb = pd.DataFrame({"Algorithm": ["MacroBase"] * 300})
df_mb["Time (seconds)"] = list(range(1, 31)) * 10
df_mb["Value"] = 1045.8800903157714 # synth2d_hard

# The results file holds one JSON document per line; key them by line number.
with open("results/scorpion_synth2d_hard.json", "r") as fo:
    experiments = {i: loads(line) for i, line in enumerate(fo)}

# One frame per experiment. The empty seed frame fixes the column layout even
# when there are no experiments. DataFrame.append was removed in pandas 2.0,
# so the pieces are combined with pd.concat instead.
frames = [pd.DataFrame({}, columns=["Algorithm", "Time (seconds)", "Value"])]
for experiment in experiments.values():
    frames.append(
        pd.DataFrame.from_dict(
            {
                "Algorithm": experiment["cat_enc"],
                "Time (seconds)": list(range(1, experiment["runtime"] + 1)),
                "Value": experiment["time_array"],
            },
            orient="index",
        ).T
    )
df = pd.concat(frames)
# Unnest the list-valued cells into one row per element.
df = df.explode("Value")
df = df.set_index(["Algorithm"]).apply(pd.Series.explode).reset_index()
df = pd.concat([df, df_scorp, df_mb])
# Show the encoding's internal name as "BOExplain" in the legend.
df["Algorithm"] = df["Algorithm"].replace({"individual_contribution_warm_start_top1": "BOExplain"})

# Human-readable description of the experiment, built from the first record.
# It is not attached to the chart below.
num_cols = f"{len(experiments[0]['num_cols'])} numerical columns: "
for i, col in enumerate(experiments[0]["num_cols"]):
    num_cols += f"{col} (range {experiments[0]['num_cols_range'][i][0]} to {experiments[0]['num_cols_range'][i][1]}), "
# Report the number of categorical columns (matching the numerical label above)
# rather than interpolating the list itself.
cat_cols = f"{len(experiments[0]['cat_cols'])} categorical columns: "
# NOTE(review): this reads 'num_cols_n_uniq' for the categorical columns; a
# categorical-specific key may have been intended -- confirm against the
# results schema before relying on this text.
for i, col in enumerate(experiments[0]["cat_cols"]):
    cat_cols += f"{col} ({experiments[0]['num_cols_n_uniq'][i]} unique values), "

out_str = f"Experiment: {experiments[0]['name']}. Completed {experiments[0]['n_trials']} iterations for {experiments[0]['runs']} runs. Search space includes "

if len(experiments[0]['num_cols']) > 0:
    out_str += num_cols
    if len(experiments[0]['cat_cols']) > 0:
        out_str += "and "

if len(experiments[0]['cat_cols']) > 0:
    out_str += cat_cols

# Swap the trailing ", " for a full stop.
out_str = f"{out_str[:-2]}."

# Wrap the description into lines of at most 140 characters.
out_lst = [line.strip() for line in re.findall(r'.{1,140}(?:\s+|$)', out_str)]

# Mean of Value at each time step, one line per algorithm.
line = alt.Chart(df, title="2D Hard").mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Value)', title=None),
    color="Algorithm",
).properties(
    width=200,
    height=90
)
# Standard-deviation band; built but not layered onto the panel.
band = alt.Chart(df).mark_errorband(extent='stdev').encode(
    x='Time (seconds)',
    y=alt.Y('Value', title=None),
    color="Algorithm"
)
chart2 = line
chart2

In [3]:
import re
from json import loads

import altair as alt

alt.data_transformers.disable_max_rows()

# Constant reference series for Scorpion: time steps 1-60 repeated 10 times.
df_scorp = pd.DataFrame({"Algorithm": ["Scorpion"] * 600})
df_scorp["Time (seconds)"] = list(range(1, 61)) * 10
df_scorp["Value"] = 5910.9467204239545 # synth3d_easy
# Blank the first 19 seconds so the Scorpion line only starts at second 20
# (presumably the time Scorpion needed to return its result -- TODO confirm).
df_scorp.loc[df_scorp["Time (seconds)"] <= 19, "Value"] = np.nan

# Constant reference series for MacroBase over the same time grid.
df_mb = pd.DataFrame({"Algorithm": ["MacroBase"] * 600})
df_mb["Time (seconds)"] = list(range(1, 61)) * 10
df_mb["Value"] = 3700.209253491158 # synth3d_easy

# The results file holds one JSON document per line; key them by line number.
with open("results/scorpion_synth3d_easy.json", "r") as fo:
    experiments = {i: loads(line) for i, line in enumerate(fo)}

# One frame per experiment. The empty seed frame fixes the column layout even
# when there are no experiments. DataFrame.append was removed in pandas 2.0,
# so the pieces are combined with pd.concat instead.
frames = [pd.DataFrame({}, columns=["Algorithm", "Time (seconds)", "Value"])]
for experiment in experiments.values():
    frames.append(
        pd.DataFrame.from_dict(
            {
                "Algorithm": experiment["cat_enc"],
                "Time (seconds)": list(range(1, experiment["runtime"] + 1)),
                "Value": experiment["time_array"],
            },
            orient="index",
        ).T
    )
df = pd.concat(frames)
# Unnest the list-valued cells into one row per element.
df = df.explode("Value")
df = df.set_index(["Algorithm"]).apply(pd.Series.explode).reset_index()
df = pd.concat([df, df_scorp, df_mb])
# Show the encoding's internal name as "BOExplain" in the legend.
df["Algorithm"] = df["Algorithm"].replace({"individual_contribution_warm_start_top1": "BOExplain"})

# Human-readable description of the experiment, built from the first record.
# It is not attached to the chart below.
num_cols = f"{len(experiments[0]['num_cols'])} numerical columns: "
for i, col in enumerate(experiments[0]["num_cols"]):
    num_cols += f"{col} (range {experiments[0]['num_cols_range'][i][0]} to {experiments[0]['num_cols_range'][i][1]}), "
# Report the number of categorical columns (matching the numerical label above)
# rather than interpolating the list itself.
cat_cols = f"{len(experiments[0]['cat_cols'])} categorical columns: "
# NOTE(review): this reads 'num_cols_n_uniq' for the categorical columns; a
# categorical-specific key may have been intended -- confirm against the
# results schema before relying on this text.
for i, col in enumerate(experiments[0]["cat_cols"]):
    cat_cols += f"{col} ({experiments[0]['num_cols_n_uniq'][i]} unique values), "

out_str = f"Experiment: {experiments[0]['name']}. Completed {experiments[0]['n_trials']} iterations for {experiments[0]['runs']} runs. Search space includes "

if len(experiments[0]['num_cols']) > 0:
    out_str += num_cols
    if len(experiments[0]['cat_cols']) > 0:
        out_str += "and "

if len(experiments[0]['cat_cols']) > 0:
    out_str += cat_cols

# Swap the trailing ", " for a full stop.
out_str = f"{out_str[:-2]}."

# Wrap the description into lines of at most 140 characters.
out_lst = [line.strip() for line in re.findall(r'.{1,140}(?:\s+|$)', out_str)]

# Mean of Value at each time step, one line per algorithm.
line = alt.Chart(df, title="3D Easy").mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Value)', title=None),
    color="Algorithm",
).properties(
    width=200,
    height=90
)
# Standard-deviation band; built but not layered onto the panel.
band = alt.Chart(df).mark_errorband(extent='stdev').encode(
    x='Time (seconds)',
    y=alt.Y('Value', title=None),
    color="Algorithm"
)
chart3 = line
chart3

In [4]:
import re
from json import loads

import altair as alt

alt.data_transformers.disable_max_rows()

# Constant reference series for Scorpion: time steps 1-60 repeated 10 times.
df_scorp = pd.DataFrame({"Algorithm": ["Scorpion"] * 600})
df_scorp["Time (seconds)"] = list(range(1, 61)) * 10
df_scorp["Value"] = 1627.2080886677436 # synth3d_hard
# Blank the first 19 seconds so the Scorpion line only starts at second 20
# (presumably the time Scorpion needed to return its result -- TODO confirm).
df_scorp.loc[df_scorp["Time (seconds)"] <= 19, "Value"] = np.nan

# Constant reference series for MacroBase over the same time grid.
df_mb = pd.DataFrame({"Algorithm": ["MacroBase"] * 600})
df_mb["Time (seconds)"] = list(range(1, 61)) * 10
df_mb["Value"] = 1029.9701728953212 # synth3d_hard

# The results file holds one JSON document per line; key them by line number.
with open("results/scorpion_synth3d_hard.json", "r") as fo:
    experiments = {i: loads(line) for i, line in enumerate(fo)}

# One frame per experiment. The empty seed frame fixes the column layout even
# when there are no experiments. DataFrame.append was removed in pandas 2.0,
# so the pieces are combined with pd.concat instead.
frames = [pd.DataFrame({}, columns=["Algorithm", "Time (seconds)", "Value"])]
for experiment in experiments.values():
    frames.append(
        pd.DataFrame.from_dict(
            {
                "Algorithm": experiment["cat_enc"],
                "Time (seconds)": list(range(1, experiment["runtime"] + 1)),
                "Value": experiment["time_array"],
            },
            orient="index",
        ).T
    )
df = pd.concat(frames)
# Unnest the list-valued cells into one row per element.
df = df.explode("Value")
df = df.set_index(["Algorithm"]).apply(pd.Series.explode).reset_index()
df = pd.concat([df, df_scorp, df_mb])
# Show the encoding's internal name as "BOExplain" in the legend.
df["Algorithm"] = df["Algorithm"].replace({"individual_contribution_warm_start_top1": "BOExplain"})

# Human-readable description of the experiment, built from the first record.
# It is not attached to the chart below.
num_cols = f"{len(experiments[0]['num_cols'])} numerical columns: "
for i, col in enumerate(experiments[0]["num_cols"]):
    num_cols += f"{col} (range {experiments[0]['num_cols_range'][i][0]} to {experiments[0]['num_cols_range'][i][1]}), "
# Report the number of categorical columns (matching the numerical label above)
# rather than interpolating the list itself.
cat_cols = f"{len(experiments[0]['cat_cols'])} categorical columns: "
# NOTE(review): this reads 'num_cols_n_uniq' for the categorical columns; a
# categorical-specific key may have been intended -- confirm against the
# results schema before relying on this text.
for i, col in enumerate(experiments[0]["cat_cols"]):
    cat_cols += f"{col} ({experiments[0]['num_cols_n_uniq'][i]} unique values), "

out_str = f"Experiment: {experiments[0]['name']}. Completed {experiments[0]['n_trials']} iterations for {experiments[0]['runs']} runs. Search space includes "

if len(experiments[0]['num_cols']) > 0:
    out_str += num_cols
    if len(experiments[0]['cat_cols']) > 0:
        out_str += "and "

if len(experiments[0]['cat_cols']) > 0:
    out_str += cat_cols

# Swap the trailing ", " for a full stop.
out_str = f"{out_str[:-2]}."

# Wrap the description into lines of at most 140 characters.
out_lst = [line.strip() for line in re.findall(r'.{1,140}(?:\s+|$)', out_str)]

# Mean of Value at each time step, one line per algorithm.
line = alt.Chart(df, title="3D Hard").mark_line().encode(
    x='Time (seconds)',
    y=alt.Y('mean(Value)', title=None),
    color="Algorithm",
).properties(
    width=200,
    height=90
)
# Standard-deviation band; built but not layered onto the panel.
band = alt.Chart(df).mark_errorband(extent='stdev').encode(
    x='Time (seconds)',
    y=alt.Y('Value', title=None),
    color="Algorithm"
)
chart4 = line
chart4

In [5]:
# Lay the four panels out side by side with no gap, then apply the shared
# legend, axis and title settings to the combined figure.
(
    alt.hconcat(chart1, chart2, chart3, chart4, spacing=0)
    # Legend placed at explicit coordinates (orient='none' with legendX/legendY),
    # without a title, in four columns.
    .configure_legend(
        orient='none', legendX=290, legendY=130,
        symbolSize=700, title=None, offset=-5,
        columns=4, labelFontSize=15,
    )
    # Same font size for axis labels, axis titles and panel titles.
    .configure_axis(labelFontSize=15, titleFontSize=15)
    .configure_title(fontSize=15)
)