Filtering bad candidates
===
In this notebook I will be filtering candidate datasets to remove those that appear to be improperly matched according to different metrics.

I will start with two metrics.
 - Join column cardinality: if the cardinality of the join column is lower than a certain threshold (relative to the size of the dataset), the candidate will be dropped.
 - Left join size: if the size of the table that would be the result of a left join is larger than a certain threshold, then the candidate will be dropped. 

In [8]:
# Move to the project root so that the relative paths used in later cells
# resolve against it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
%cd /home/soda/rcappuzz/work/study-data-lakes/

/home/soda/rcappuzz/work/study-data-lakes


In [9]:
%load_ext autoreload
%autoreload 2
import pandas as pd 
import polars as pl
import json
from pathlib import Path
from src.utils import profile_dataset
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Locate the `a-d3m_small` folder, relative to the current working directory.
working_folder = Path(".")
root_data_folder = working_folder / "data"
data_folder = root_data_folder / "soda-data-lake/a-d3m/a-d3m_small"
# Stop right away if the folder is not present.
assert data_folder.exists()

## Prepare the setup for a single dataset
1. Read the original dataset
2. Read the candidates
3. Compute the metrics

In [11]:
# Profile every entry found in `data_folder` with the polars engine and gather
# the per-dataset results into a single frame.
dataset_list = list(data_folder.iterdir())

# Collect the partial frames and concatenate them once at the end: calling
# pd.concat inside the loop copies all previously accumulated rows again on
# every iteration.
info_frames = []
for dataset_path in tqdm(dataset_list, total=len(dataset_list)):
    df_info = profile_dataset(dataset_path=dataset_path, engine="polars")
    info_frames.append(df_info)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains nothing to profile.
df_info_overall = pd.concat(info_frames) if info_frames else pd.DataFrame()

100%|██████████| 8/8 [00:24<00:00,  3.06s/it]


In [13]:
# Discard every row that contains at least one missing value, then write the
# remaining rows to a CSV file without the index column.
df_info_clean = df_info_overall.dropna(axis="index", how="any")
df_info_clean.to_csv("info_a-d3m_small.csv", index=False)

In [195]:
# Display the rows whose `scale_factor` is strictly greater than 1.
scale_above_one = df_info_clean["scale_factor"] > 1
df_info_clean.loc[scale_above_one]

Unnamed: 0,ds_name,candidate_name,left_on,right_on,merged_rows,scale_factor,left_unique_keys,right_unique_keys,left_rows,left_cols
16,LL0_200_pbc,datamart.zenodo.6974483.3d9901cd-8a55-48ee-b2f...,[Z14],[p14],161442.0,386.22488,147.0,80.0,418.0,20.0
17,LL0_200_pbc,datamart.socrata.data-usaid-gov.mm3i-pcnd,[Z17],[b17],93585.0,223.88756,5.0,8.0,418.0,20.0
18,LL0_200_pbc,datamart.socrata.data-usaid-gov.ibtf-2a34,[Z17],[b17],70992.0,169.837321,5.0,8.0,418.0,20.0
19,LL0_200_pbc,datamart.socrata.data-usaid-gov.4gye-9wpi,[Z17],[q17],3960.0,9.473684,5.0,17.0,418.0,20.0
0,LL1_736_population_spawn_MIN_METADATA,datamart.socrata.www-data-act-gov-au.c5h9-9bh7,[day],[daly],614041.0,20.293509,320.0,4318.0,30258.0,5.0
3,LL1_736_population_spawn_MIN_METADATA,datamart.zenodo.4243547.ece93f17-c7b0-48a2-b05...,[day],[Day],929658.0,30.72437,320.0,305.0,30258.0,5.0
4,LL1_736_population_spawn_MIN_METADATA,datamart.socrata.www-data-act-gov-au.65z9-5mfa,[day],[daly],102992.0,3.403794,320.0,549.0,30258.0,5.0
9,LL1_736_population_spawn_MIN_METADATA,datamart.zenodo.6621762.6876a8dd-25e0-4c4b-918...,[day],[day],1879351.0,62.11088,320.0,295.0,30258.0,5.0
11,LL1_736_population_spawn_MIN_METADATA,datamart.zenodo.5021480.47623c54-e1a8-4716-b80...,[day],[day],34893.0,1.153183,320.0,238.0,30258.0,5.0
19,LL1_736_population_spawn_MIN_METADATA,datamart.socrata.opendata-utah-gov.7iu3-rv9s,[day],[May],30741.0,1.015963,320.0,47.0,30258.0,5.0


## Profiling all datasets

In [15]:
# Point `data_folder` at the `a-d3m_full` folder, relative to the current
# working directory.
working_folder = Path(".")
root_data_folder = working_folder.joinpath("data")
data_folder = root_data_folder.joinpath("soda-data-lake/a-d3m/a-d3m_full")
# Stop right away if the folder is not present.
assert data_folder.exists()

In [16]:
# Profile every entry found in `data_folder` with the polars engine and gather
# the per-dataset results into a single frame.
dataset_list = list(data_folder.iterdir())

# Collect the partial frames and concatenate them once at the end: calling
# pd.concat inside the loop copies all previously accumulated rows again on
# every iteration, which adds up over a long list of datasets.
info_frames = []
for dataset_path in tqdm(dataset_list, total=len(dataset_list)):
    df_info = profile_dataset(dataset_path=dataset_path, engine="polars")
    info_frames.append(df_info)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains nothing to profile.
df_info_overall = pd.concat(info_frames) if info_frames else pd.DataFrame()

  1%|          | 2/391 [00:04<11:20,  1.75s/it]

In [None]:
# Discard every row that contains at least one missing value, then write the
# remaining rows to a CSV file without the index column.
df_info_clean = df_info_overall.dropna(axis="index", how="any")
df_info_clean.to_csv("info_a-d3m_full.csv", index=False)

## Timing different engines

In [None]:
%%timeit
dataset_list = list(data_folder.iterdir())

df_info_overall = pd.DataFrame()

for dataset_path in tqdm(dataset_list, total=len(dataset_list)):
    df_info = profile_dataset(dataset_path=dataset_path, engine="polars")
    df_info_overall = pd.concat([df_info_overall, df_info])

In [None]:
%%timeit
dataset_list = list(data_folder.iterdir())

df_info_overall = pd.DataFrame()

for dataset_path in tqdm(dataset_list, total=len(dataset_list)):
    df_info = profile_dataset(dataset_path=dataset_path, engine="pandas")
    df_info_overall = pd.concat([df_info_overall, df_info])

### Testing different grouping methods

In [63]:
# Columns used as grouping keys in the timing cells that follow.
key_cols = ["acceptedScientificName_gbifP", "country_rapid"]

# NOTE(review): `right_table_path` is not assigned in any visible cell of this
# notebook; it must be defined before this cell can run on a fresh kernel.

# Parse the CSV eagerly once and derive the pandas copy from the polars frame
# instead of reading the same file a second time.
# infer_schema_length=0 disables schema inference, so columns are read as
# strings.
df_pl = pl.read_csv(right_table_path, infer_schema_length=0)
df_pd = df_pl.to_pandas()

# Lazy variant: nothing is read here; the file is scanned when the lazy query
# is collected.
df_pl_lazy = pl.scan_csv(right_table_path, infer_schema_length=0)


In [83]:
%%timeit
# Eager polars frame: keep only the key columns, group by them and count.
df_pl.select(pl.col(key_cols)).groupby(key_cols).count()

11.9 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [84]:
%%timeit
# pandas frame: keep only the key columns, group by them and take the size of
# each group.
df_pd[key_cols].groupby(key_cols).size()

13.3 ms ± 31.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [85]:
%%timeit
# Lazy polars query built on `df_pl_lazy`: group by the key columns, count,
# then collect.
# NOTE(review): `df_pl_lazy` comes from `pl.scan_csv`, so this timing
# presumably includes reading the CSV file, unlike the in-memory variants;
# confirm before comparing the numbers.
df_pl_lazy.groupby(by=key_cols).count().collect()

669 ms ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [86]:
%%timeit
# In-memory polars frame converted to a lazy query with `.lazy()`: group by
# the key columns, count, then collect.
df_pl.lazy().groupby(by=key_cols).count().collect()

11.6 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### More