In [21]:
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


This notebook is used to combine the results from various runs into a single file, for simplicity and better storage.

In [22]:
import polars as pl
from pathlib import Path
from src.utils.logging import read_and_process, read_logs

In [23]:
# Raise polars' string display length to 150 characters so that long string
# values are not cut short when frames are shown.
cfg = pl.Config()
cfg.set_fmt_str_lengths(150)

polars.config.Config

In [24]:
# Directory that receives every consolidated parquet file written by this notebook.
dest_path = Path("results/overall")
# Create it up front: write_parquet does not create missing parent directories,
# so the cells below would otherwise fail on a fresh checkout.
dest_path.mkdir(parents=True, exist_ok=True)

# Frames collected here are concatenated into "overall_first.parquet" further down.
overall_list = []

## Full run depleted Wordnet aggregation first

`results/logs/0363-mp1a3by4`
- Full run
- All tables
- Wordnet full
- Aggregation first 

In [25]:
# Run 0363: load its logs, keep the frame for the overall file and store a
# standalone copy.
r_path = "results/logs/0363-mp1a3by4"
df_raw = read_logs(exp_path=r_path, exp_name=None)
overall_list.append(df_raw)
df_raw.write_parquet(dest_path / "wordnet_general_first.parquet")

## Full run depleted Wordnet aggregation mean

`results/logs/0383-ptn1bncl`
- Full run
- All tables
- Wordnet full
- Aggregation mean

In [26]:
# Run 0383: load its logs and store them as a standalone file. This frame is
# not added to overall_list.
r_path = "results/logs/0383-ptn1bncl"
df_raw = read_logs(exp_path=r_path, exp_name=None)
df_raw.write_parquet(dest_path / "wordnet_general_mean.parquet")

## Full run full tables Wordnet aggregation first

`results/logs/0389-4jmvw4qc`
- Full run
- Full tables
- All tables
- Wordnet full
- Aggregation first

In [27]:
# Run 0389: load its logs and store them as a standalone file. This frame is
# not added to overall_list.
r_path = "results/logs/0389-4jmvw4qc"
df_raw = read_logs(exp_path=r_path, exp_name=None)
df_raw.write_parquet(dest_path / "wordnet_general_first_full-tables.parquet")

## Full run Binary aggregation first

`results/logs/0387-df7e67z4`
- Full run
- Full tables
- All tables
- Binary
- Aggregation first

In [28]:
# Run 0387: load its logs, keep the frame for the overall file and store a
# standalone copy.
r_path = "results/logs/0387-df7e67z4"
df_raw = read_logs(exp_path=r_path, exp_name=None)
overall_list.append(df_raw)
df_raw.write_parquet(dest_path / "binary_general_first.parquet")

## Open Data full run

```
paths = [
    "results/logs/0372-35kmeaz3",
    "results/logs/0373-csmtrlcd",
    "results/logs/0374-7226gvfk",
    "results/logs/0376-c8xw7pry",
    "results/logs/0377-q83k1stf",
    "results/logs/0381-roaef1ce",
]
```
`results/logs/0375-mjbya9c5` is left out because it duplicates 0374 (the configuration is the same).

In [29]:
# Log directories of the open-data runs that are merged into one file.
paths = [
    "results/logs/0372-35kmeaz3",
    "results/logs/0373-csmtrlcd",
    "results/logs/0374-7226gvfk",
    "results/logs/0376-c8xw7pry",
    "results/logs/0377-q83k1stf",
    "results/logs/0381-roaef1ce",
]

# One frame per run, read in the order listed above.
open_data_list = [read_logs(exp_name=None, exp_path=log_dir) for log_dir in paths]

# Stack the runs, store them as a single file and keep the result for the
# overall file.
df_concat = pl.concat(open_data_list)
df_concat.write_parquet(dest_path / "open_data_general_first.parquet")
overall_list.append(df_concat)

In [30]:
# Distinct base tables present in the merged open-data results.
df_concat.get_column("base_table").unique().to_list()

['movies-depleted_title-open_data',
 'us_accidents-depleted_County-open_data',
 'us_elections-depleted_county_name-open_data',
 'housing_prices-depleted_County-open_data',
 'movies_vote-depleted_title-open_data',
 'company_employees-depleted_name-open_data']

In [31]:
# Concatenate every frame collected in overall_list and store the result as
# one parquet file.
df_overall = pl.concat(overall_list)
df_overall.write_parquet(dest_path / "overall_first.parquet")

## Aggregation

`results/logs/0382-bfft0brr`
- Only best single join and highest containment
- All tables
- Wordnet full
- DFS 

In [32]:
# Run 0382: load its logs and store them as a standalone file. The frame is
# not added to overall_list.
r_path = "results/logs/0382-bfft0brr"
df_raw = read_logs(exp_name=None, exp_path=r_path)
df_raw.write_parquet(Path(dest_path, "wordnet_dfs.parquet"))

# Columns removed from each frame before the three runs are stacked.
drop = [
    "status",
    "rmse",
    "f1score",
    "auc",
    "n_cols",
    "budget_type",
    "budget_amount",
    "epsilon",
]

# Stack this run with the two Wordnet runs saved earlier in the notebook.
df_first = pl.read_parquet(Path(dest_path, "wordnet_general_first.parquet"))
df_mean = pl.read_parquet(Path(dest_path, "wordnet_general_mean.parquet"))
df_c = pl.concat([df_first.drop(drop), df_mean.drop(drop), df_raw.drop(drop)])

# Keep exact-matching rows for the two estimators of interest.
f = {"jd_method": "exact_matching"}
df_aggr = df_c.filter(**f).filter(
    pl.col("estimator").is_in(["highest_containment", "best_single_join"])
)

# Restrict the comparison to base tables that have at least one "dfs" row.
# The table names are handed to is_in as a plain list of values; passing a
# one-column DataFrame only works through implicit conversion.
dfs_base_tables = (
    df_aggr.filter(pl.col("aggregation") == "dfs")
    .get_column("base_table")
    .unique()
    .to_list()
)
df_aggr = df_aggr.filter(pl.col("base_table").is_in(dfs_base_tables))
df_aggr.write_parquet(Path(dest_path, "wordnet_aggr.parquet"))

In [33]:
# Distinct base tables that remain in the aggregation comparison.
df_aggr.select("base_table").unique()

base_table
str
"""us_elections-yadl-depleted"""
"""movies-yadl-depleted"""
"""us_county_population-yadl-depleted"""
"""us_accidents-yadl-depleted"""
"""movies_vote-yadl-depleted"""
"""company_employees-yadl-depleted"""


## Aggregation (Binary)

`results/logs/0386-sh4fb419`
- Only best single join and highest containment
- All tables
- Binary
- DFS 

In [34]:
# Columns left out of the combined table.
drop = [
    "status",
    "rmse",
    "f1score",
    "auc",
    "n_cols",
    "budget_type",
    "budget_amount",
    "epsilon",
]

# Run 0386: load its logs and store them as a standalone file.
r_path = "results/logs/0386-sh4fb419"
df_raw = read_logs(exp_path=r_path, exp_name=None)
df_raw.write_parquet(dest_path / "binary_mean_dfs.parquet")

# Stack it with the binary run saved earlier in the notebook.
df_base = pl.read_parquet(dest_path / "binary_general_first.parquet")
df_c = pl.concat([frame.drop(drop) for frame in (df_base, df_raw)])

# Keep exact-matching rows for the two estimators of interest, then store them.
f = {"jd_method": "exact_matching"}
df_aggr = df_c.filter(
    pl.col("estimator").is_in(["highest_containment", "best_single_join"])
).filter(**f)
df_aggr.write_parquet(dest_path / "binary_aggr.parquet")