In [22]:
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


This notebook combines the results from the various runs into a single file, for simplicity and more convenient storage.

In [47]:
import polars as pl
from pathlib import Path
from src.utils.logging import read_and_process, read_logs

# Notebook-wide polars display settings: show up to 150 characters of every
# string cell, and set the table row limit to -1 (understood to mean "print all
# rows" -- confirm against the polars Config documentation).
cfg = pl.Config()
cfg.set_fmt_str_lengths(150)
cfg.set_tbl_rows(-1)

polars.config.Config

In [24]:
# Folder that receives the combined parquet files written by the cells below.
dest_path = Path("results/overall")
# Create it up front: writing a parquet file into a missing directory fails,
# which would break a Restart & Run All on a fresh checkout.
dest_path.mkdir(parents=True, exist_ok=True)

# Accumulator for the per-run frames that are concatenated at the end of the
# notebook. Re-run this cell before re-running the loading cells, otherwise
# frames get appended twice.
overall_list = []

## Wordnet 10k

In [25]:
# Wordnet 10k run: read the logs, fill nulls with 0 and pin the `auc` and
# `f1score` columns to a constant 0.0 (presumably these metrics are not
# meaningful for this run -- confirm), then queue the frame for the final concat.
r_path = "results/logs/0428-hde9c7an"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)
overall_list.append(df_raw)

In [26]:
# df_raw.write_parquet(Path(dest_path, "wordnet_10k-first.parquet"))

## Wordnet old + Binary

In [27]:
# "Wordnet old + Binary" run. Same recipe as the other runs: nulls become 0 and
# `auc` / `f1score` are overwritten with 0.0 before the frame is queued.
r_path = "results/logs/0429-oot6vo5b"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)
overall_list.append(df_raw)

In [28]:
# df_raw.filter(pl.col("target_dl") == "binary_update").write_parquet(
#     Path(dest_path, "binary_update-first.parquet")
# )
# df_raw.filter(pl.col("target_dl") == "wordnet_full").write_parquet(
#     Path(dest_path, "wordnet_full-first.parquet")
# )

## Open Data US

In [29]:
# The Open Data US results are assembled from three separate runs.

# Run 1: kept as-is; nulls filled with 0, `auc` / `f1score` pinned to 0.0.
r1_path = "results/logs/0434-7r9ecumo"
df_1 = read_logs(exp_name=None, exp_path=r1_path)
df_1 = df_1.fill_null(0).with_columns(
    pl.lit(0.0).alias("auc"), pl.lit(0.0).alias("f1score")
)

# Run 2: same treatment, but every "schools" base table is dropped because
# those rows are taken from the dedicated schools run below.
r2_path = "results/logs/0435-y2ljs95x"
df_2 = read_logs(exp_name=None, exp_path=r2_path)
df_2 = (
    df_2.fill_null(0)
    .with_columns(pl.lit(0.0).alias("auc"), pl.lit(0.0).alias("f1score"))
    .filter(~pl.col("base_table").str.contains("schools"))
)

# Run 3: the dedicated schools run. Here `r2score` / `rmse` are the columns
# pinned to 0.0 (presumably schools is scored with `auc` / `f1score` -- confirm).
r_schools_path = "results/logs/0444-4ptu03x5"
df_3 = read_logs(exp_name=None, exp_path=r_schools_path)
df_3 = (
    df_3.fill_null(0)
    .with_columns(pl.lit(0.0).alias("r2score"), pl.lit(0.0).alias("rmse"))
    # FIX: keep ONLY the schools rows. The previous negated filter (copied from
    # run 2) discarded exactly the rows this run is loaded for, so schools
    # results from this run never reached the combined frame.
    .filter(pl.col("base_table").str.contains("schools"))
)

df_raw = pl.concat([df_1, df_2, df_3])
# The per-dataset export only keeps the "depleted" base tables.
df_raw.filter(pl.col("base_table").str.contains("depleted")).write_parquet(
    Path(dest_path, "open_data_us-first.parquet")
)

# df_raw.write_parquet(Path(dest_path, "open_data_us-first-with_full_tables.parquet"))

# The unfiltered frame (all base tables) goes into the overall list.
overall_list.append(df_raw)

## Movies large

In [30]:
# "Movies large" run: fill nulls with 0, overwrite `auc` / `f1score` with 0.0,
# then add the frame to the list of runs to combine.
r_path = "results/logs/0451-gm4f305a"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)
overall_list.append(df_raw)

## US Accidents large

In [31]:
# "US Accidents large" run: fill nulls with 0, overwrite `auc` / `f1score` with
# 0.0, then add the frame to the list of runs to combine.
r_path = "results/logs/0453-sjkk0koq"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)
overall_list.append(df_raw)

# Extra runs
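The following runs were not run with all methods. Note that some of the cells below (e.g. Wordnet 3K and Wordnet 50k) nevertheless append their results to `overall_list`, so those runs do end up in the "overall" logs; remove the `append` calls if they should be left out.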

## Wordnet 3K

In [32]:
# Wordnet 3K run: nulls become 0 and `auc` / `f1score` are overwritten with 0.0.
# The resulting frame is appended to `overall_list`.
r_path = "results/logs/0424-xc16vll9"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)
overall_list.append(df_raw)
# df_raw.write_parquet(Path(dest_path, "wordnet_3k-first.parquet"))

## Wordnet 50k

In [33]:
# Wordnet 50k run: nulls become 0 and `auc` / `f1score` are overwritten with 0.0.
# The resulting frame is appended to `overall_list`.
r_path = "results/logs/0430-e6fv2b39"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)
overall_list.append(df_raw)
# df_raw.write_parquet(Path(dest_path, "wordnet_50k-first.parquet"))

## Aggregation mean

In [34]:
# Only wordnet 10k
# This frame is loaded and cleaned like the others (nulls -> 0, `auc` and
# `f1score` -> 0.0) but is not appended to `overall_list`.
r_path = "results/logs/0438-yboxtpbz"
df_raw = (
    read_logs(exp_name=None, exp_path=r_path)
    .fill_null(0)
    .with_columns(auc=pl.lit(0.0), f1score=pl.lit(0.0))
)

# df_raw.write_parquet(Path(dest_path, "wordnet_10k-mean.parquet"))

## New tables

In [35]:
# NOTE(review): this is the very same log directory that the "Movies large"
# section loads and appends to `overall_list`. The path (and the commented-out
# file name at the bottom) look like copy-paste leftovers -- confirm which run
# was meant for "New tables".
r_path = "results/logs/0451-gm4f305a"
df_raw = read_logs(exp_name=None, exp_path=r_path)
df_raw = df_raw.fill_null(0).with_columns(
    pl.lit(0.0).alias("auc"), pl.lit(0.0).alias("f1score")
)

# FIX: not appended to `overall_list`. These logs are already in the list via
# "Movies large"; appending them a second time duplicated every one of their
# rows in overall_first.parquet. Re-enable the append once `r_path` points at a
# distinct run.
# overall_list.append(df_raw)
# df_raw.write_parquet(Path(dest_path, "wordnet_10k-mean.parquet"))

## Combine all runs (aggr=first)

In [38]:
# Print the schema of every collected frame; handy for spotting a mismatching
# column before the frames are stacked.
for df in overall_list:
    print(df.schema)

# Stack all runs row-wise and persist the combined table.
df_overall = pl.concat(overall_list, how="vertical")
df_overall.write_parquet(dest_path / "overall_first.parquet")

OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_predict', Float64), ('time_run', Float64), ('time_prepare', Float64), ('time_model_train', Float64), ('time_join_train', Float64), ('time_model_predict', Float64), ('time_join_predict', Float64), ('peak_fit', Float64), ('peak_predict', Float64), ('peak_test', Float64), ('r2score', Float64), ('rmse', Float64), ('f1score', Float64), ('auc', Float64), ('n_cols', String), ('budget_type', String), ('budget_amount', Int64), ('epsilon', Float64)])
OrderedDict([('scenario_id', Int64), ('status', String), ('target_dl', String), ('jd_method', String), ('base_table', String), ('query_column', String), ('estimator', String), ('aggregation', String), ('chosen_model', String), ('fold_id', Int64), ('time_fit', Float64), ('time_p

In [52]:
# Sanity check: for the "starmie" method, print the distinct
# (target_dl, base_table) pairs, one table per `target_dl` value.
g = (
    df_overall.filter(pl.col("jd_method").eq("starmie"))
    .select("target_dl", "base_table")
    .sort("target_dl")
    .group_by(["target_dl"], maintain_order=True)
)
for gidx, group in g:
    # Collapse repeated rows, then order the base tables alphabetically.
    print(group.unique().sort("base_table"))


shape: (7, 2)
┌───────────────┬────────────────────────────────────┐
│ target_dl     ┆ base_table                         │
│ ---           ┆ ---                                │
│ str           ┆ str                                │
╞═══════════════╪════════════════════════════════════╡
│ binary_update ┆ company_employees-yadl-depleted    │
│ binary_update ┆ housing_prices-yadl-depleted       │
│ binary_update ┆ movies-yadl-depleted               │
│ binary_update ┆ movies_vote-yadl-depleted          │
│ binary_update ┆ us_accidents-yadl-depleted         │
│ binary_update ┆ us_county_population-yadl-depleted │
│ binary_update ┆ us_elections-yadl-depleted         │
└───────────────┴────────────────────────────────────┘
shape: (7, 2)
┌──────────────┬────────────────────────────────────┐
│ target_dl    ┆ base_table                         │
│ ---          ┆ ---                                │
│ str          ┆ str                                │
╞══════════════╪═════════════════════════