In [1]:
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
import polars as pl
from pathlib import Path
import polars.selectors as cs

In [3]:
path_wordnet = Path("data/yadl/wordnet_full/")
path_binary = Path("data/yadl/binary_update/")
path_open_data = Path("data/open_data_us/")

In [4]:
def table_profile(table_path):
    df = pl.read_parquet(table_path)
    n_num = df.select(cs.numeric()).shape[1]
    c_num = df.select(~cs.numeric()).shape[1]
    if len(df)> 0:
        avg_null = df.null_count().mean_horizontal().item()/len(df)
    else:
        avg_null = 0
    d = {"table": table_path.stem, "num_attr": n_num, "cat_attr": c_num, "n_rows":len(df), "n_cols": len(df.columns), "avg_null":avg_null}

    return d

In [5]:
def get_stats(df: pl.DataFrame):
    return df.select(
        pl.lit("binary").alias("data_lake"),
        pl.col("num_attr").mean().alias("mean_num_attr"),
        pl.col("num_attr").median().alias("median_num_attr"),
        pl.col("cat_attr").mean().alias("mean_cat_attr"),
        pl.col("cat_attr").median().alias("median_cat_attr"),
        pl.col("n_rows").mean().alias("mean_n_rows"),
        pl.col("n_rows").median().alias("median_n_rows"),
        pl.col("n_cols").mean().alias("mean_n_cols"),
        pl.col("n_cols").median().alias("median_n_cols"),
        pl.col("avg_null").mean().alias("mean_avg_null"),
        pl.col("avg_null").median().alias("median_avg_null"),
    )

In [6]:
stats =[ ]

In [7]:
profiles = []
for tab in path_binary.glob('**/*.parquet'):
    d= table_profile(tab)
    profiles.append(d)
df = pl.from_dicts(profiles)
stats.append(get_stats(df))

In [9]:
profiles = []
for tab in path_open_data.glob('**/*.parquet'):
    d= table_profile(tab)
    profiles.append(d)
df = pl.from_dicts(profiles)
stats.append(get_stats(df))

KeyboardInterrupt: 

In [10]:
profiles = []
for tab in path_wordnet.glob('**/*.parquet'):
    d= table_profile(tab)
    profiles.append(d)
df = pl.from_dicts(profiles)
stats.append(get_stats(df))

In [11]:
df_stats = pl.concat(stats)

In [12]:
df_stats

data_lake,mean_num_attr,median_num_attr,mean_cat_attr,median_cat_attr,mean_n_rows,median_n_rows,mean_n_cols,median_n_cols,mean_avg_null,median_avg_null
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""binary""",0.3,0.0,1.7,2.0,287134.328571,40407.5,2.0,2.0,4e-06,0.0
"""binary""",0.389698,0.0,2.775805,3.0,22343.919826,927.0,3.165503,3.0,0.309895,0.331624


In [52]:
df_stats.with_columns(pl.Series(["binary", "open_data", "wordnet"]).alias("data_lake")).transpose(include_header=True, column_names="data_lake").write_csv("stats_data_lakes.csv")