In [1]:
cd ~/work/benchmark-join-suggestions/

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
import polars as pl
import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

In [3]:
# Root folder of the table metadata; the statistics loop below looks for one
# sub-folder per variant underneath it (path is relative to the working directory).
metadata_dir = Path("data/metadata")

In [4]:
# NOTE(review): the statistics cell that follows iterates `for variant in variants`,
# which rebinds this name — this assignment looks like a leftover from
# single-variant exploration; confirm whether it is still needed.
variant = "binary"

In [5]:
# Summarise the tables of every variant: row/column counts, on-disk size and how many
# rows share the same "subject" value. One single-row summary frame per variant is
# appended to `avg_df`.
avg_df = []
variants = ["binary", "wordnet", "full"]
for variant in variants:
    print(variant)
    # List the metadata files once: the progress-bar total comes from the list, so the
    # directory tree is not globbed a second time just to count the files.
    mdata_files = list(Path(metadata_dir, variant).glob("**/*.json"))
    total_files = len(mdata_files)
    total_size = 0
    list_dicts = []
    for f in tqdm(mdata_files, total=total_files):
        # Only the JSON load needs the handle; close it before reading the parquet file.
        with open(f, "r") as fp:
            mdata = json.load(fp)

        table_path = Path(mdata["full_path"])
        file_size = table_path.stat().st_size
        # total_size counts every referenced file, including the single-row tables
        # that are skipped just below.
        total_size += file_size

        table = pl.read_parquet(table_path)
        # Single-row tables are reported and excluded from the statistics. The check
        # comes before the group-by so no work is spent on a table that is discarded.
        if table.shape[0] == 1:
            print(table_path)
            continue

        # Number of rows per distinct subject, computed once and reused for both
        # the mean and the median.
        subj_counts = table.groupby("subject").count()["count"]
        subj_avg = subj_counts.mean()
        subj_median = subj_counts.median()

        dd = {
            "table_name": table_path.stem,
            "num_rows": table.shape[0],
            "num_cols": table.shape[1],
            "file_size": file_size,
            "subj_avg": subj_avg,
            "subj_median": subj_median,
        }
        list_dicts.append(dd)

    # Fail with an explicit message instead of an opaque polars error when a variant
    # folder is missing or contains no usable table.
    if not list_dicts:
        raise ValueError(
            f"No usable tables found for variant {variant!r} under {metadata_dir}"
        )

    df_stat = pl.from_dicts(list_dicts)
    # File sizes are divided by 1e6, i.e. bytes -> megabytes.
    avg_stats = df_stat.select(
        pl.lit(variant).alias("Variant"),
        pl.col("num_rows").mean().alias("avg_num_rows"),
        pl.col("num_rows").max().alias("max_num_rows"),
        pl.col("num_rows").min().alias("min_num_rows"),
        pl.col("num_cols").max().alias("max_num_cols"),
        pl.col("num_cols").min().alias("min_num_cols"),
        (pl.col("file_size").mean() / 1e6).alias("avg_file_size"),
        pl.col("table_name").count().alias("num_tables"),
        (pl.lit(total_size) / 1e6).alias("total_size"),
        pl.col("subj_avg").mean(),
        pl.col("subj_median").mean(),
    )
    avg_df.append(avg_stats)
     

binary


100%|██████████| 67/67 [00:08<00:00,  7.92it/s]


/storage/store/work/rcappuzz/yago3-dl/binary/yago_binary_startedOnDate.parquet
/storage/store/work/rcappuzz/yago3-dl/binary/yago_binary_hasNeighbor.parquet
wordnet


 24%|██▍       | 125/513 [00:04<00:03, 129.29it/s]

/storage/store/work/rcappuzz/yago3-dl/wordnet/subtabs/yago_wordnet_company/company_hasBudget_hasLatitude.parquet
/storage/store/work/rcappuzz/yago3-dl/wordnet/subtabs/yago_wordnet_company/company_hasBudget_hasLongitude.parquet


100%|██████████| 513/513 [00:06<00:00, 79.39it/s] 


full


 12%|█▏        | 68/580 [00:08<00:17, 28.52it/s]

/storage/store/work/rcappuzz/yago3-dl/binary/yago_binary_startedOnDate.parquet
/storage/store/work/rcappuzz/yago3-dl/binary/yago_binary_hasNeighbor.parquet


 34%|███▍      | 196/580 [00:12<00:03, 109.98it/s]

/storage/store/work/rcappuzz/yago3-dl/wordnet/subtabs/yago_wordnet_company/company_hasBudget_hasLatitude.parquet
/storage/store/work/rcappuzz/yago3-dl/wordnet/subtabs/yago_wordnet_company/company_hasBudget_hasLongitude.parquet


100%|██████████| 580/580 [00:14<00:00, 39.94it/s] 


In [6]:
# Stack the single-row per-variant summaries collected in `avg_df` into one frame.
stat_variants = pl.concat(avg_df)

In [7]:
# Display the summary table (one row per variant).
stat_variants

Variant,avg_num_rows,max_num_rows,min_num_rows,max_num_cols,min_num_cols,avg_file_size,num_tables,total_size,subj_avg,subj_median
str,f64,i64,i64,i64,i64,f64,u32,f64,f64,f64
"""binary""",322672.030769,2776052,5,2,2,4.520355,65,293.824463,2.614726,1.661538
"""wordnet""",31838.266145,4254254,3,15,4,0.296781,511,151.657423,7.960086,3.210372
"""full""",64658.048611,4254254,3,15,2,0.7734,576,445.481886,7.356877,3.03559


In [None]:
# NOTE(review): the output stored under this cell uses presentation column names
# ("Avg. # rows", "Total size (MB)", ...) that the visible cells never assign to
# `stat_variants`; it appears to come from an earlier kernel state — re-run or remove.
stat_variants

Variant,Avg. # rows,Avg. file size (MB),Number of tables,Total size,Max. # rows,Max. # columns,subj_avg,subj_median,Total size (MB)
str,f64,f64,u32,i32,i64,i64,f64,f64,f64
"""binary""",313040.059701,4.38544,67,293824463,2776052,2,2.566525,1.641791,293.824463
"""wordnet""",31714.14425,0.295629,513,151657423,4254254,15,7.932951,3.201754,151.657423
"""full""",64212.137931,0.768072,580,445481886,4254254,15,7.313036,3.021552,445.481886


In [None]:
# Build the pandas frame that is exported as the LaTeX summary table.
# `stat_variants` carries snake_case column names (avg_num_rows, avg_file_size, ...),
# so the presentation names are assigned here explicitly; the min/max columns are
# left out of the exported table. `avg_file_size` and `total_size` are already
# expressed in MB (bytes / 1e6) when `stat_variants` is built.
spd = stat_variants.select(
    pl.col("Variant"),
    pl.col("avg_num_rows").alias("Avg. # rows"),
    pl.col("avg_file_size").alias("Avg. file size (MB)"),
    pl.col("num_tables").alias("Number of tables"),
    pl.col("subj_avg"),
    pl.col("subj_median"),
    pl.col("total_size").alias("Total size (MB)"),
).to_pandas()

In [None]:
# Export the summary table to "tab.tex": size columns with two decimals,
# the average row count with none.
size_columns = ["Avg. file size (MB)", "Total size (MB)"]
summary_styler = spd.style.format(precision=2, subset=size_columns)
summary_styler = summary_styler.format(subset=["Avg. # rows"], precision=0)
summary_styler.to_latex("tab.tex", hrules=True)

In [54]:
# NOTE(review): this writes to the same "tab.tex" as the formatted export in the
# previous cell, so running both in order leaves the unformatted table on disk —
# confirm which of the two exports is the intended one.
spd.style.to_latex("tab.tex", hrules=True)

# Base table stats

In [10]:
# Locate the metadata JSON files that describe the base tables.
data_dir = Path("data/metadata/queries")
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting keeps the
# processing order (and therefore the row order of the exported table) reproducible.
tabs_mdata_paths = sorted(data_dir.glob("*.json"))

In [27]:
# Collect one record per base table: name, shape and feature-type counts.
stats_list = []
for tab_mdata_pth in tabs_mdata_paths:
    with open(tab_mdata_pth, "r") as fp:
        tab_mdata = json.load(fp)

    tab_path = tab_mdata["full_path"]
    tab = pl.read_parquet(tab_path)
    n_rows, n_cols = tab.shape

    # String (Utf8) columns are counted as categorical features; every other dtype
    # is counted as numerical.
    cat_feat = sum(1 for typ in tab.schema.values() if typ == pl.Utf8)
    num_feat = len(tab.schema) - cat_feat

    stats_list.append(
        {
            "tab_name": tab_mdata["df_name"],
            # NOTE(review): the literal "add" values look like placeholders to be
            # filled in by hand later — confirm.
            "tab_source": "add",
            "num_rows": n_rows,
            "num_cols": n_cols,
            "cat_feat": cat_feat,
            "num_feat": num_feat,
            "target": "add",
        }
    )

In [28]:
# Turn the per-table records in `stats_list` into a polars DataFrame (one row per table).
base_table_stats = pl.from_dicts(stats_list)

In [30]:
# Convert to pandas and write the table to "base_tables.tex" through the pandas
# Styler, with horizontal rules enabled.
base_table_stats.to_pandas().style.to_latex("base_tables.tex", hrules=True)