In [1]:
# Switch the working directory first: the data-lake paths defined further down
# are relative ("data/...") and are resolved against the current directory.
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [20]:
import polars as pl
from pathlib import Path
import polars.selectors as cs
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [3]:
# Placeholder locations, one per data lake. Each is the current directory
# (Path() and Path(".") are the same path).
# NOTE(review): none of these names is referenced by the cells visible in this
# notebook, which iterate over `path_list` instead; confirm before removing.
path_wordnet = Path(".")
path_wordnet_vldb = Path(".")
path_wordnet_vldb_wide = Path(".")
path_wordnet_vldb_3 = Path(".")
path_binary = Path(".")
path_open_data = Path(".")

In [4]:
# Root directory of every data lake to profile, relative to the working
# directory. The last path component is later used as the data lake's name.
path_list = [
    Path(raw_dir)
    for raw_dir in (
        "data/yadl/wordnet_full/",
        "data/yadl/wordnet_vldb/",
        "data/yadl/wordnet_vldb_wide/",
        "data/yadl/wordnet_vldb_3/",
        "data/yadl/binary_update/",
        "data/open_data_us/",
    )
]

In [5]:
def table_profile(table_path, data_lake):
    """Read one parquet table and return its profile as a flat dict.

    Args:
        table_path: ``pathlib.Path`` of the parquet file. Its stem is
            reported as the table name.
        data_lake: Label copied unchanged into the ``data_lake`` field.

    Returns:
        A dict with the keys ``data_lake``, ``table``, ``num_attr`` (number
        of numeric columns), ``cat_attr`` (number of all other columns),
        ``n_rows``, ``n_cols`` and ``avg_null`` (mean per-column null count
        divided by the number of rows; ``0.0`` when the table has no rows).
    """
    df = pl.read_parquet(table_path)
    n_rows = len(df)

    n_num = df.select(cs.numeric()).shape[1]
    c_num = df.select(~cs.numeric()).shape[1]

    if n_rows > 0:
        # null_count() is a single row holding one count per column; its
        # horizontal mean is therefore the average null count per column.
        avg_null = df.null_count().mean_horizontal().item() / n_rows
    else:
        # Keep this a float like the branch above so the "avg_null" field has
        # one consistent type across all the records that get combined later.
        avg_null = 0.0

    return {
        "data_lake": data_lake,
        "table": table_path.stem,
        "num_attr": n_num,
        "cat_attr": c_num,
        "n_rows": n_rows,
        "n_cols": len(df.columns),
        "avg_null": avg_null,
    }

In [23]:
def get_stats(df: pl.DataFrame):
    """Aggregate the per-table profiles of one data lake into a summary row.

    Args:
        df: One row per table, with the columns ``data_lake``, ``n_tables``,
            ``n_rows``, ``n_cols``, ``num_attr``, ``cat_attr`` and
            ``avg_null``.

    Returns:
        A ``pl.DataFrame`` holding the data lake name, the table count, the
        total number of rows and of columns, and the mean and median of every
        per-table measure.
    """
    return df.select(
        pl.col("data_lake").first(),
        pl.col("n_tables").first(),
        pl.col("n_rows").sum().alias("tot_rows"),
        # This used to be aliased "tot_rows" as well, which gave the selection
        # two output columns with the same name and mislabelled the column total.
        pl.col("n_cols").sum().alias("tot_cols"),
        pl.col("num_attr").mean().alias("mean_num_attr"),
        pl.col("num_attr").median().alias("median_num_attr"),
        pl.col("cat_attr").mean().alias("mean_cat_attr"),
        pl.col("cat_attr").median().alias("median_cat_attr"),
        pl.col("n_rows").mean().alias("mean_n_rows"),
        pl.col("n_rows").median().alias("median_n_rows"),
        pl.col("n_cols").mean().alias("mean_n_cols"),
        pl.col("n_cols").median().alias("median_n_cols"),
        pl.col("avg_null").mean().alias("mean_avg_null"),
        pl.col("avg_null").median().alias("median_avg_null"),
    )

In [12]:
# Accumulator for the summary frames: the profiling loop appends one
# get_stats() result per data lake to this list.
stats = []

In [19]:
# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every data lake's summary row.
stats = []
for path in path_list:
    # List the parquet files once; the list feeds both the worker pool and the
    # progress-bar total instead of walking the directory tree twice.
    tables = list(path.glob("**/*.parquet"))

    # Profile the tables in parallel; each worker returns one profile dict.
    profiles = Parallel(n_jobs=8, verbose=0)(
        delayed(table_profile)(tab, path.stem)
        for tab in tqdm(
            tables,
            total=len(tables),
            position=0,
            leave=False,
            desc=path.stem,
        )
    )

    # One row per table, tagged with the number of tables in this data lake.
    df = pl.from_dicts(profiles).with_columns(pl.lit(len(profiles)).alias("n_tables"))
    stats.append(get_stats(df))
df_stats = pl.concat(stats)

                                                                          

In [10]:
# Stack the per-data-lake summary frames into a single table. This repeats the
# final statement of the profiling cell, so it only matters if `stats` was
# changed after that cell ran.
df_stats = pl.concat(stats)

In [21]:
# Display the combined summary: one row per data lake.
df_stats

data_lake,n_tables,mean_num_attr,median_num_attr,mean_cat_attr,median_cat_attr,mean_n_rows,median_n_rows,mean_n_cols,median_n_cols,mean_avg_null,median_avg_null
str,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""wordnet_full""",30072,0.389698,0.0,2.775805,3.0,22343.919826,927.0,3.165503,3.0,0.309895,0.331624
"""wordnet_vldb""",869,1.761795,2.0,6.433832,4.0,9220.859609,74.0,8.195627,6.0,0.444155,0.497727
"""wordnet_vldb_w…",50922,3.615058,3.0,9.071344,6.0,24007.617709,1706.0,12.686403,10.0,0.631575,0.669716
"""wordnet_vldb_3…",3162,3.5,3.0,8.6926,6.0,23228.833966,1602.0,12.1926,10.0,0.603582,0.663548
"""binary_update""",70,0.3,0.0,1.7,2.0,287134.328571,40407.5,2.0,2.0,4e-06,0.0
"""open_data_us""",5591,11.097836,3.0,12.759256,7.0,17124.504561,1000.0,23.857092,14.0,0.094168,0.010989


In [28]:
# Flip the summary so each data lake becomes a column (named after its
# "data_lake" value) and each statistic a row, then render it through a pandas
# Styler with two decimal places.
(
    df_stats
    .transpose(include_header=True, column_names="data_lake")
    .to_pandas()
    .style
    .format(precision=2)
)

Unnamed: 0,column,wordnet_full,wordnet_vldb,wordnet_vldb_wide,wordnet_vldb_3,binary_update,open_data_us
0,n_tables,30072.0,869.0,50922.0,3162.0,70.0,5591.0
1,mean_num_attr,0.39,1.76,3.62,3.5,0.3,11.1
2,median_num_attr,0.0,2.0,3.0,3.0,0.0,3.0
3,mean_cat_attr,2.78,6.43,9.07,8.69,1.7,12.76
4,median_cat_attr,3.0,4.0,6.0,6.0,2.0,7.0
5,mean_n_rows,22343.92,9220.86,24007.62,23228.83,287134.33,17124.5
6,median_n_rows,927.0,74.0,1706.0,1602.0,40407.5,1000.0
7,mean_n_cols,3.17,8.2,12.69,12.19,2.0,23.86
8,median_n_cols,3.0,6.0,10.0,10.0,2.0,14.0
9,mean_avg_null,0.31,0.44,0.63,0.6,0.0,0.09


In [29]:
# Save the same transposed layout (statistics as rows, data lakes as columns)
# to a CSV file in the working directory.
(
    df_stats
    .transpose(include_header=True, column_names="data_lake")
    .write_csv("stats_data_lakes.csv")
)