In [1]:
# Change the working directory so that the `src` package import and the relative
# `data/...` path used in the following cells resolve.
# NOTE(review): machine-specific absolute path, presumably the repository root —
# adjust when running on another machine.
%cd /home/soda/rcappuzz/work/prepare-data-lakes

/home/soda/rcappuzz/work/prepare-data-lakes


In [2]:
import src.yago.utils as utils

In [3]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from tqdm import tqdm
import os
import polars.selectors as cs
import random

from sklearn.utils import murmurhash3_32

# Global seaborn look: "paper" plotting context on a white grid background.
sns.set_theme(context="paper", style="whitegrid")

In [4]:
# Let polars print up to 150 characters of a string cell before truncating it.
cfg = pl.Config()
pl.Config.set_fmt_str_lengths(150)

polars.config.Config

In [5]:
# NOTE(review): `max_fields` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
max_fields = 2
# Directory whose `*.parquet` files are profiled below. The path is relative, so
# it resolves against the current working directory.
base_path = Path("data/yago3-dl/wordnet_vldb")

In [6]:
# Profile every parquet table found in `base_path`: one record per file with its
# shape, the number of numeric / non-numeric columns and its on-disk size.
list_stats = []
for path in base_path.glob("*.parquet"):
    this_df = pl.read_parquet(path)
    n_num = this_df.select(cs.numeric()).width
    # Every column is either numeric or not, so the non-numeric count is the rest.
    c_num = this_df.width - n_num
    record = {
        "rows": this_df.height,
        "cols": this_df.width,
        "n_num": n_num,
        "c_num": c_num,
        "size": path.stat().st_size,
    }
    list_stats.append(record)
df_stats = pl.from_dicts(list_stats)

In [7]:
# Tables with more than 8 columns (same threshold as the default `min_cols` of
# `estimate_size`).
df_stats.filter(pl.col("cols").gt(8))

rows,cols,n_num,c_num,size
i64,i64,i64,i64,i64
1924,12,7,5,44718
2004,9,2,7,51348
17979,12,5,7,347412
64883,15,5,10,1849044
4276,10,4,6,57275
782,10,4,6,20690
8536,15,4,11,141852
2446,21,2,19,65263
67,14,0,14,6151
173390,34,17,17,1142554


In [8]:
# Number of cells per table; `estimate_size` needs it to derive the average
# size of a single cell.
df_stats = df_stats.with_columns(cells=pl.col("rows") * pl.col("cols"))

In [9]:
# Median, mean and 80th percentile of rows / columns / cells over the tables
# with more than 8 columns, plus the number of such tables.
df_stats.filter(pl.col("cols") > 8).select(
    row_median=pl.col("rows").median(),
    col_median=pl.col("cols").median(),
    cell_median=pl.col("cells").median(),
    row_mean=pl.col("rows").mean(),
    row_qle=pl.col("rows").quantile(0.80),
    col_mean=pl.col("cols").mean(),
    col_qle=pl.col("cols").quantile(0.80),
    cell_mean=pl.col("cells").mean(),
    cell_qle=pl.col("cells").quantile(0.80),
    len=pl.len(),
)

row_median,col_median,cell_median,row_mean,row_qle,col_mean,col_qle,cell_mean,cell_qle,len
f64,f64,f64,f64,f64,f64,f64,f64,f64,u32
1602.0,14.0,21380.0,28057.108303,9644.0,15.945848,22.0,969297.974729,171990.0,277


In [10]:
# Display the per-table statistics, including the derived `cells` column.
df_stats

rows,cols,n_num,c_num,size,cells
i64,i64,i64,i64,i64,i64
528,6,2,4,12954,3168
1924,12,7,5,44718,23088
7,5,2,3,1752,35
1540,8,5,3,35791,12320
8,4,2,2,1458,32
256,5,1,4,6165,1280
2004,9,2,7,51348,18036
1,2,0,2,769,2
6,3,0,3,1196,18
146,5,0,5,4037,730


In [11]:
def _total_expected_size(tables, exp_cols, row_growth, min_rows):
    """Sum the expected size of everything generated from ``tables``.

    Args:
        tables: Polars frame with at least the columns ``rows``, ``size`` and
            ``cells`` (plus whatever ``exp_cols`` refers to).
        exp_cols: Polars expression giving the expected number of generated
            columns for each table.
        row_growth: Factor by which the original row count is multiplied to
            obtain the expected number of generated rows.
        min_rows: Tables whose expected row count is not strictly greater than
            this value are discarded.

    Returns:
        The summed expected size, in the unit of the ``size`` column.
    """
    return (
        tables.with_columns(
            exp_rows=pl.col("rows") * row_growth,
            exp_cols=exp_cols,
        )
        # keep only the tables that are expected to have enough rows
        .filter(pl.col("exp_rows") > min_rows)
        .with_columns(
            # expected number of cells = expected rows x expected columns
            exp_cells=pl.col("exp_rows") * pl.col("exp_cols"),
            # average size of one cell = known size / known number of cells
            cell_size=pl.col("size") / pl.col("cells"),
        )
        # expected size = average cell size x expected number of cells
        .with_columns(exp_size=pl.col("cell_size") * pl.col("exp_cells"))
        .select(pl.col("exp_size").sum())
        .item()
    )


def estimate_size(
    df_stats,
    resample_rows,
    resample_columns,
    row_sample_frac=0.7,
    min_rows=100,
    min_cols=8,
):
    """Estimate the total size of the tables produced by resampling.

    The estimate is the sum of two parts: one computed over all columns of
    every sufficiently wide table, and one computed over the numeric columns of
    the wide tables that have at least two of them. Both parts use the same
    ``min_cols`` / ``min_rows`` thresholds.

    Args:
        df_stats: Polars frame with one row per source table and the columns
            ``rows``, ``cols``, ``n_num``, ``size`` and ``cells``.
        resample_rows: Number of row resamplings; every resampling adds
            ``row_sample_frac`` times the original rows to the expected rows.
        resample_columns: Number of column resamplings; multiplies the
            expected number of columns.
        row_sample_frac: Fraction of the original rows contributed by each row
            resampling.
        min_rows: Only tables whose *expected* row count is strictly greater
            than this value are counted.
        min_cols: Only tables with strictly more columns than this value are
            counted.

    Returns:
        The estimated total size, in the unit of the ``size`` column.
    """
    # expected rows = original rows + sample fraction x number of row resamplings
    row_growth = 1 + resample_rows * row_sample_frac

    # select only the tables with enough columns
    wide_tables = df_stats.filter(pl.col("cols") > min_cols)

    # All columns: average of `cols` and `cols - 2` (i.e. `cols - 1`) per column
    # resampling — presumably the bounds of the sampled width, TODO confirm.
    exp_cols_all = ((pl.col("cols") + pl.col("cols") - 2) / 2) * resample_columns
    tot_size_all = _total_expected_size(
        wide_tables, exp_cols_all, row_growth, min_rows
    )

    # Numeric columns only: same computation, restricted to the tables that
    # have at least 2 numerical columns.
    min_sample_size = (
        pl.when(pl.col("n_num") > 2).then(pl.col("n_num") - 2).otherwise(2)
    )
    max_sample_size = pl.col("n_num")
    exp_cols_num = (max_sample_size + min_sample_size) / 2 * resample_columns
    tot_size_num = _total_expected_size(
        wide_tables.filter(pl.col("n_num") >= 2),
        exp_cols_num,
        row_growth,
        min_rows,
    )

    return tot_size_all + tot_size_num

In [12]:
# Thresholds used for the table *count* below.
# NOTE(review): these are not forwarded to `estimate_size`, which therefore runs
# with its own defaults (min_rows=100, applied to the expected row count, and
# min_cols=8) — confirm whether the two filters are meant to agree.
min_rows = 50
min_columns = 8
resample_rows = 2

# The table counts do not depend on the number of column resamplings, so they
# are computed once instead of on every loop iteration.
eligible_tables = df_stats.filter(
    (pl.col("rows") > min_rows) & (pl.col("cols") > min_columns)
)
n_all_tables = len(eligible_tables)
# NOTE(review): `estimate_size` keeps tables with n_num >= 2 while this count
# uses n_num > 2 — confirm which one is intended.
n_num_tables = len(eligible_tables.filter(pl.col("n_num") > 2))

for rc in [1, 3, 5, 10, 30, 50, 100]:
    tot_size = estimate_size(df_stats, resample_rows, rc)

    # every source table yields `rc` subtables, each in (resample_rows + 1) versions
    tot_tables = (n_num_tables + n_all_tables) * (resample_rows + 1) * rc

    print(f"##### Number of subtables: {rc} - Resamplings by subtable: {resample_rows} ")
    print(f"Approximate size: {tot_size/1e9:.2f} GB")
    print(f"Approximate number of tables: {tot_tables}")

##### Number of subtables: 1 - Resamplings by subtable: 2 
Approximate size: 0.51 GB
Approximate number of tables: 1176
##### Number of subtables: 3 - Resamplings by subtable: 2 
Approximate size: 1.52 GB
Approximate number of tables: 3528
##### Number of subtables: 5 - Resamplings by subtable: 2 
Approximate size: 2.53 GB
Approximate number of tables: 5880
##### Number of subtables: 10 - Resamplings by subtable: 2 
Approximate size: 5.06 GB
Approximate number of tables: 11760
##### Number of subtables: 30 - Resamplings by subtable: 2 
Approximate size: 15.17 GB
Approximate number of tables: 35280
##### Number of subtables: 50 - Resamplings by subtable: 2 
Approximate size: 25.28 GB
Approximate number of tables: 58800
##### Number of subtables: 100 - Resamplings by subtable: 2 
Approximate size: 50.56 GB
Approximate number of tables: 117600
