In [5]:
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [6]:
import polars as pl
from pathlib import Path
import polars.selectors as cs
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [7]:
path_list = list(
    map(
        Path,
        [
            # "data/yadl/binary_update/",
            # "data/yadl/wordnet_vldb/",
            # "data/yadl/wordnet_full/",
            # "data/yadl/wordnet_vldb_3/",
            # "data/yadl/wordnet_vldb_10/",
            # "data/yadl/wordnet_vldb_50/",
            "data/open_data_us/",
        ],
    )
)

In [8]:
def table_profile(table_path, data_lake):
    df = pl.read_parquet(table_path)
    n_num = df.select(cs.numeric()).shape[1]
    c_num = df.select(~cs.numeric()).shape[1]
    if len(df) > 0:
        avg_null = df.null_count().mean_horizontal().item() / len(df)
        # avg_null = df.null_count().mean_horizontal().item()/len(df)
    else:
        avg_null = 0
    d = {
        "data_lake": data_lake,
        "table": table_path.stem,
        "num_attr": n_num,
        "cat_attr": c_num,
        "n_rows": len(df),
        "n_cols": len(df.columns),
        "avg_null": avg_null,
    }

    return d

In [9]:
def get_stats(df: pl.DataFrame):
    return df.select(
        pl.col("data_lake").first(),
        pl.col("n_tables").first(),
        pl.col("n_rows").sum().alias("tot_rows"),
        pl.col("n_cols").sum().alias("tot_cols"),
        pl.col("n_cols").mean().alias("mean_n_cols"),
        pl.col("n_cols").median().alias("median_n_cols"),
        pl.col("n_rows").mean().alias("mean_n_rows"),
        pl.col("n_rows").median().alias("median_n_rows"),
        pl.col("num_attr").mean().alias("mean_num_attr"),
        pl.col("num_attr").median().alias("median_num_attr"),
        pl.col("cat_attr").mean().alias("mean_cat_attr"),
        pl.col("cat_attr").median().alias("median_cat_attr"),
        pl.col("avg_null").mean().alias("mean_avg_null"),
        pl.col("avg_null").median().alias("median_avg_null"),
    )

In [10]:
stats = []

In [15]:
for path in path_list:
    profiles = []
    # for tab in path.glob("**/*.parquet"):

    profiles = Parallel(n_jobs=8, verbose=0)(
        delayed(table_profile)(tab, path.stem)
        for tab in tqdm(
            path.glob("**/*.parquet"),
            total=sum(1 for _ in path.glob("**/*.parquet")),
            position=0,
            leave=False,
            desc=path.stem,
        )
    )

    # for tab in tqdm(
    #     path.glob("**/*.parquet"), total=sum(1 for _ in path.glob("**/*.parquet")), position=0, leave=False, desc=path.stem
    # ):

    #     d = table_profile(tab, path.stem)
    #     profiles.append(d)
    df = pl.from_dicts(profiles).with_columns(pl.lit(len(profiles)).alias("n_tables"))
    stats.append(get_stats(df))
df_stats = pl.concat(stats)

binary_update:   0%|          | 0/70 [00:00<?, ?it/s]

wordnet_vldb:   0%|          | 0/869 [00:00<?, ?it/s]

wordnet_full:   0%|          | 0/30072 [00:00<?, ?it/s]

wordnet_vldb_3:   0%|          | 0/3162 [00:00<?, ?it/s]

wordnet_vldb_10:   0%|          | 0/10059 [00:00<?, ?it/s]

wordnet_vldb_50:   0%|          | 0/47223 [00:00<?, ?it/s]

open_data_us:   0%|          | 0/5591 [00:00<?, ?it/s]

In [16]:
df_stats = pl.concat(stats)

In [18]:
df_stats.transpose(include_header=True, column_names="data_lake").to_pandas().style.format(precision=2)

Unnamed: 0,column,binary_update,wordnet_vldb,wordnet_full,wordnet_vldb_3,wordnet_vldb_10,wordnet_vldb_50,open_data_us
0,n_tables,70.0,869.0,30072.0,3162.0,10059.0,47223.0,5591.0
1,tot_rows,20099403.0,8012927.0,671926357.0,73449573.0,241564466.0,1200891362.0,95743105.0
2,tot_cols,140.0,7122.0,95193.0,38553.0,126999.0,623685.0,133385.0
3,mean_n_cols,2.0,8.2,3.17,12.19,12.63,13.21,23.86
4,median_n_cols,2.0,6.0,3.0,10.0,10.0,11.0,14.0
5,mean_n_rows,287134.33,9220.86,22343.92,23228.83,24014.76,25430.22,17124.5
6,median_n_rows,40407.5,74.0,927.0,1602.0,1698.0,1767.0,1000.0
7,mean_num_attr,0.3,1.76,0.39,3.5,3.54,3.59,11.1
8,median_num_attr,0.0,2.0,0.0,3.0,3.0,3.0,3.0
9,mean_cat_attr,1.7,6.43,2.78,8.69,9.08,9.61,12.76


In [19]:
df_stats.transpose(include_header=True, column_names="data_lake").write_csv(
    "stats_data_lakes.csv"
)