In [1]:
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [43]:
import polars as pl
from pathlib import Path
import polars.selectors as cs
from tqdm import tqdm

In [7]:
path_wordnet = Path("data/yadl/wordnet_full/")
path_wordnet_vldb = Path("data/yadl/wordnet_vldb/")
path_wordnet_vldb_wide = Path("data/yadl/wordnet_vldb_wide/")
path_binary = Path("data/yadl/binary_update/")
path_open_data = Path("data/open_data_us/")

In [29]:
def table_profile(table_path, data_lake):
    df = pl.read_parquet(table_path)
    n_num = df.select(cs.numeric()).shape[1]
    c_num = df.select(~cs.numeric()).shape[1]
    if len(df)> 0:
        avg_null = df.null_count().mean_horizontal().item()/len(df)
        avg_null = 0
        # avg_null = df.null_count().mean_horizontal().item()/len(df)
    else:
        avg_null = 0
    d = {"data_lake": data_lake, "table": table_path.stem, "num_attr": n_num, "cat_attr": c_num, "n_rows":len(df), "n_cols": len(df.columns), "avg_null":avg_null}

    return d

In [30]:
def get_stats(df: pl.DataFrame):
    return df.select(
        pl.col("data_lake").first(),
        pl.col("n_tables").first(),
        pl.col("num_attr").mean().alias("mean_num_attr"),
        pl.col("num_attr").median().alias("median_num_attr"),
        pl.col("cat_attr").mean().alias("mean_cat_attr"),
        pl.col("cat_attr").median().alias("median_cat_attr"),
        pl.col("n_rows").mean().alias("mean_n_rows"),
        pl.col("n_rows").median().alias("median_n_rows"),
        pl.col("n_cols").mean().alias("mean_n_cols"),
        pl.col("n_cols").median().alias("median_n_cols"),
        pl.col("avg_null").mean().alias("mean_avg_null"),
        pl.col("avg_null").median().alias("median_avg_null"),
    )

In [31]:
stats =[ ]

In [41]:
profiles = []
for tab in path_wordnet_vldb.glob('**/*.parquet'):
    d= table_profile(tab, "vldb")
    profiles.append(d)
df = pl.from_dicts(profiles).with_columns(pl.lit(len(profiles)).alias("n_tables"))
stats.append(get_stats(df))

50112

In [44]:
profiles = []
for tab in tqdm(path_wordnet_vldb_wide.glob('**/*.parquet'), total=sum( 1 for _ in path_wordnet_vldb_wide.glob('**/*.parquet'))):
    d= table_profile(tab, "vldb")
    profiles.append(d)
df = pl.from_dicts(profiles).with_columns(pl.lit(len(profiles)).alias("n_tables"))
stats.append(get_stats(df))

100%|██████████| 50112/50112 [05:24<00:00, 154.57it/s] 


In [17]:
profiles = []
for tab in path_binary.glob('**/*.parquet'):
    d= table_profile(tab, "binary")
    profiles.append(d)
df = pl.from_dicts(profiles)
stats.append(get_stats(df))

In [48]:
profiles = []
for tab in path_open_data.glob('**/*.parquet'):
    d= table_profile(tab, "open_data_us")
    profiles.append(d)
df = pl.from_dicts(profiles).with_columns(pl.lit(len(profiles)).alias("n_tables"))
stats.append(get_stats(df))

In [52]:
profiles = []
for tab in path_wordnet.glob('**/*.parquet'):
    d= table_profile(tab, "wordnet")
    profiles.append(d)
df = pl.from_dicts(profiles).with_columns(pl.lit(len(profiles)).alias("n_tables"))
stats.append(get_stats(df))

In [53]:
df_stats = pl.concat(stats)

In [58]:
df_stats

data_lake,n_tables,mean_num_attr,median_num_attr,mean_cat_attr,median_cat_attr,mean_n_rows,median_n_rows,mean_n_cols,median_n_cols,mean_avg_null,median_avg_null
str,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""vldb""",869,1.761795,2.0,6.433832,4.0,9220.859609,74.0,8.195627,6.0,0.0,0.0
"""vldb""",869,1.761795,2.0,6.433832,4.0,9220.859609,74.0,8.195627,6.0,0.0,0.0
"""vldb""",50112,3.59842,3.0,9.14278,7.0,24236.18728,1712.0,12.7412,11.0,0.0,0.0
"""open_data_us""",5591,11.097836,3.0,12.759256,7.0,17124.504561,1000.0,23.857092,14.0,0.0,0.0
"""wordnet""",30072,0.389698,0.0,2.775805,3.0,22343.919826,927.0,3.165503,3.0,0.0,0.0


In [60]:
# df_stats.slice(1).to_pandas().transpose().style.format(precision=2)

Unnamed: 0,0,1,2,3
data_lake,vldb,vldb,open_data_us,wordnet
n_tables,869,50112,5591,30072
mean_num_attr,1.76,3.60,11.10,0.39
median_num_attr,2.00,3.00,3.00,0.00
mean_cat_attr,6.43,9.14,12.76,2.78
median_cat_attr,4.00,7.00,7.00,3.00
mean_n_rows,9220.86,24236.19,17124.50,22343.92
median_n_rows,74.00,1712.00,1000.00,927.00
mean_n_cols,8.20,12.74,23.86,3.17
median_n_cols,6.00,11.00,14.00,3.00


In [51]:
df_stats.to_pandas().transpose().style.format(precision=2)

Unnamed: 0,0,1,2,3
data_lake,vldb,vldb,vldb,open_data_us
n_tables,869,869,50112,5591
mean_num_attr,1.76,1.76,3.60,11.10
median_num_attr,2.00,2.00,3.00,3.00
mean_cat_attr,6.43,6.43,9.14,12.76
median_cat_attr,4.00,4.00,7.00,7.00
mean_n_rows,9220.86,9220.86,24236.19,17124.50
median_n_rows,74.00,74.00,1712.00,1000.00
mean_n_cols,8.20,8.20,12.74,23.86
median_n_cols,6.00,6.00,11.00,14.00


In [52]:
df_stats.with_columns(pl.Series(["binary", "open_data", "wordnet"]).alias("data_lake")).transpose(include_header=True, column_names="data_lake").write_csv("stats_data_lakes.csv")