In [3]:
cd ~/work/benchmark-join-suggestions/

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [4]:
import polars as pl
import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

In [5]:
metadata_dir = Path("data/metadata")

In [6]:
variant = "binary"

In [39]:
avg_df = []
variants = ["wordnet_big", "binary"]
for variant in variants:
    print(variant)
    total_files = sum([1 for _ in Path(metadata_dir, variant).glob("**/*.json")])
    mdata_files = Path(metadata_dir, variant).glob("**/*.json")
    total_size = 0
    list_dicts = []
    for f in tqdm(mdata_files, total=total_files):
        with open(f, "r") as fp:
            mdata = json.load(fp)
            table_path = Path(mdata["full_path"])
            file_size = table_path.stat().st_size
            total_size += file_size
            
            table = pl.read_parquet(table_path)
            subj_avg = table.groupby("subject").count().mean()["count"].item()
            subj_median = table.groupby("subject").count().median()["count"].item()
            if table.shape[0] == 1:
                print(table_path)
                continue
            dd = {
                "table_name": table_path.stem,
                "num_rows": table.shape[0],
                "num_cols": table.shape[1],
                "file_size": file_size,
                "subj_avg": subj_avg,
                "subj_median": subj_median,
            }
            list_dicts.append(dd)

    df_stat = pl.from_dicts(list_dicts)
    avg_stats = df_stat.select(
        pl.lit(variant).alias("Variant"),
        pl.col("num_rows").mean().alias("avg_num_rows"),
        pl.col("num_rows").median().alias("median_num_rows"),
        pl.col("num_rows").max().alias("max_num_rows"),
        pl.col("num_rows").min().alias("min_num_rows"),
        pl.col("num_cols").max().alias("max_num_cols"), 
        pl.col("num_cols").min().alias("min_num_cols"), 
        pl.col("file_size").mean().alias("avg_file_size")/1e6,
        pl.col("table_name").count().alias("num_tables"),
        pl.lit(total_size).alias("total_size")/1e6,
        pl.col("subj_avg").mean(),
        pl.col("subj_median").mean(),
    )   
    avg_df.append(avg_stats)
     

wordnet_big


100%|██████████| 4080/4080 [00:26<00:00, 154.51it/s]


binary


100%|██████████| 67/67 [00:08<00:00,  7.49it/s]

/storage/store/work/rcappuzz/yago3-dl/binary/yago_binary_startedOnDate.parquet
/storage/store/work/rcappuzz/yago3-dl/binary/yago_binary_hasNeighbor.parquet





In [41]:
stat_variants = pl.concat(avg_df)

In [42]:
stat_variants.select(
    pl.col("Variant"),
    pl.col("num_tables").alias("# Tables"),
    pl.col("avg_num_rows").alias("Avg. rows"),
    pl.col("median_num_rows").alias("Med. rows"),
    pl.col("avg_file_size").alias("Avg. size"),
    pl.col("total_size").alias("Total size"),
)

Variant,# Tables,Avg. rows,Med. rows,Avg. size,Total size
str,u32,f64,f64,f64,f64
"""wordnet_big""",4080,16031.73848,164.0,0.135622,553.339654
"""binary""",65,322672.030769,44340.0,4.520355,293.824463


In [34]:
spd = stat_variants.drop(["Total size", "Max. # rows", 	"Max. # columns"]).to_pandas()

SchemaFieldNotFoundError: Total size

In [None]:
spd.style.format(precision=2, subset=["Avg. file size (MB)", "Total size (MB)"]).format(subset=["Avg. # rows"], precision=0).to_latex("tab.tex", hrules=True)

In [54]:
spd.style.to_latex("tab.tex", hrules=True)

# Base table stats

In [7]:
data_dir = Path("data/metadata/queries")
tabs_mdata_paths = list(data_dir.glob("*.json"))

In [8]:
stats_list = []
for tab_mdata_pth in tabs_mdata_paths:
    with open(tab_mdata_pth, "r") as fp:
        tab_mdata = json.load(fp)
        tab_path = tab_mdata["full_path"]
        tab = pl.read_parquet(tab_path).unique()
        cat_feat = 0
        num_feat = 0
        for col, typ in tab.schema.items():
            if typ == pl.Utf8:
                cat_feat += 1
            else:
                num_feat += 1
        stats = {
            "tab_name": tab_mdata["df_name"],
            # "tab_source": "add",
            "num_rows": tab.shape[0],
            # "num_cols": tab.shape[1],
            "cat_feat": cat_feat,
            "num_feat": num_feat,
            # "target": "add"
        }
        stats_list.append(stats)

In [9]:
base_table_stats = pl.from_dicts(stats_list)
base_table_stats = base_table_stats.select(
    pl.col("tab_name").alias("Table name"),
    pl.col("num_rows").alias("Rows"),
    pl.col("cat_feat").alias("$C$"),
    pl.col("num_feat").alias("$N$"),
)

In [11]:
base_table_stats.to_pandas().style.to_latex("base_tables.tex", hrules=True)

In [10]:
base_table_stats

Table name,Rows,$C$,$N$
str,i64,i64,i64
"""presidential-r…",22084,12,1
"""company-employ…",3107,8,2
"""movies-prepare…",3826,14,5
"""us-accidents-p…",20000,23,28
"""us-accidents-p…",5222,10,3
"""presidential-r…",21289,6,1
