In [91]:

import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import polars as pl
from pathlib import Path
import numpy as np
import re
import src.yago.utils as utils
import src.plotting.plotting_utils as plotting_utils
import os


In [3]:
%cd /home/soda/rcappuzz/work/prepare-data-lakes/

/home/soda/rcappuzz/work/prepare-data-lakes


In [69]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
pattern = r"(yago_seltab_<)([a-zA-Z_]+)[_0-9]*>"
replacement = r"\g<2>"
def clean_keys(type_name):
    return re.sub(pattern, replacement, type_name)

In [7]:
re.sub(pattern, replacement, "yago_seltab_<wikicat_Living_people>")

'wikicat_Living_people'

# Profile tables

## Profile Type tables

In [8]:
src_path = Path("data/yago3-dl/seltab/")

In [9]:
tab_paths = list(src_path.iterdir())
first_tab = pl.read_parquet(tab_paths[0])

In [13]:
tab_paths

[PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_administrative_district.parquet'),
 PosixPath('data/yago3-dl/seltab/csv'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_military_unit.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_organization.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_government.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_artist.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_agency.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wikicat_Musical_groups_from_London.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_country.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_company.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_airport.parquet'),
 PosixPath('data/yago3-dl/seltab/yago_seltab_wordnet_person.parquet'),
 PosixPath('data/yago3-dl/seltab/csv_cleaned')]

In [14]:
tabs = {}
for tpath in tab_paths:
    if tpath.is_dir():
        continue
    tname = tpath.stem
    tab = pl.read_parquet(tpath)
    tabs[tname] = tab


In [15]:
list(tabs.keys())

['yago_seltab_wordnet_administrative_district',
 'yago_seltab_wordnet_military_unit',
 'yago_seltab_wordnet_organization',
 'yago_seltab_wordnet_government',
 'yago_seltab_wordnet_artist',
 'yago_seltab_wordnet_agency',
 'yago_seltab_wikicat_Musical_groups_from_London',
 'yago_seltab_wordnet_country',
 'yago_seltab_wordnet_company',
 'yago_seltab_wordnet_airport',
 'yago_seltab_wordnet_person']

# Slicing tables

In [17]:
from itertools import combinations

In [228]:
def explode_table(tgt_table, table_name, comb_size=2, min_occurrences=100):
    dir_path = Path("data/yago3-dl/seltab/subtabs/", table_name)
    os.makedirs(dir_path, exist_ok=True)
    # Ignore columns `type` and `subject` 
    target_columns = tgt_table.columns[2:]
    coords_dict = {}
    
    # Counting the number of non-null occurrences for each combination of size `comb_size`
    for comb in combinations(target_columns, comb_size):
        tt=tgt_table.select(
                pl.all(pl.col(comb).is_not_null())
            ).sum().item()
        coords_dict[comb] = tt

    df_coord = pd.DataFrame().from_dict(coords_dict, orient='index', columns=["count"])
    df_coord = df_coord.reset_index()

    # selecting only combinations with more than `min_occurrences` occs
    rich_combs = df_coord[df_coord["count"]>=min_occurrences]

    # For each comb, write a new parquet file. 
    for _, comb in rich_combs.iterrows():
        sel_col = ["type", "subject"] + list(comb["index"])
        res = tgt_table.filter(
            pl.all(pl.col(comb["index"]).is_not_null())
        ).select(pl.col(sel_col)).unique()
        
        filename = "_".join(table_name.split("_")[2:]) + "_" + "_".join(comb["index"])
        dest_path = Path(dir_path, filename + ".parquet")
        print(dest_path)
        res.write_parquet(dest_path)
        break

In [229]:
tab_name = "yago_seltab_wordnet_person"
test_table = tabs[tab_name]

In [230]:
explode_table(tab, tab_name)

data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_actedIn_wasBornIn.parquet


In [216]:
for tab_name, tab in tabs.items():
    explode_table(tab, tab_name)

data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasCapital_hasWebsite.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasCapital_hasExpenses.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasCapital_hasMotto.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasCapital_owns.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasWebsite_hasExpenses.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasWebsite_hasMotto.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_administrative_district/wordnet_administrative_district_hasWebsite_wasDestroyedOnDate.parquet
data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_admini

In [158]:
# Considering all columns except for type and subject
target_columns = test_table.columns[2:]

In [124]:
# Evaluating all pairs of columns %
coords_dict = {}
for pair in combinations(target_columns, 2):
    c1, c2= pair
    tt=test_table.select(
        ((pl.col(c1).is_not_null()) & 
         (pl.col(c2).is_not_null()) 
         ).sum(),
    ).item()
    coords_dict[pair] = tt

In [125]:
df_coord = pd.DataFrame().from_dict(coords_dict, orient='index', columns=["count"])
df_coord = df_coord.reset_index()
df_coord[["left", "right"]]= df_coord["index"].to_list()
df_coord = df_coord[["left", "right", "count"]]

In [126]:
ranking = df_coord.sort_values("count", ascending=False)
ranking[ranking["count"]>100]

Unnamed: 0,left,right,count
15,wasBornIn,wasBornOnDate,2320275
25,playsFor,wasBornOnDate,1174108
12,wasBornIn,playsFor,1107410
4,actedIn,wasBornOnDate,396136
0,actedIn,wasBornIn,386205
56,wasBornOnDate,graduatedFrom,250470
22,wasBornIn,graduatedFrom,225093
54,wasBornOnDate,hasWeight,63108
11,actedIn,graduatedFrom,59635
20,wasBornIn,hasWeight,50257


In [None]:
plotting_utils.plot_pairwise_heatmap(df_coord, lognorm=True)

In [128]:
# Evaluating all triplets
coords_dict = {}
for comb in combinations(target_columns, 3):
    c1, c2, c3 = comb
    tt=test_table.select(
        ((pl.col(c1).is_not_null()) & 
         (pl.col(c2).is_not_null()) &
         (pl.col(c3).is_not_null()) 
         ).sum(),
    ).item()
    coords_dict[comb] = tt
df_coord = pd.DataFrame().from_dict(coords_dict, orient='index', columns=["count"])
df_coord = df_coord.reset_index()
rich_triplets = df_coord[df_coord["count"]>=100]

In [154]:
for idx,triplet in rich_triplets.iterrows():
    sel_col = ["type", "subject"] + list(triplet["index"])
    res = test_table.filter(
        pl.all(pl.col(triplet["index"]).is_not_null())
    ).select(pl.col(sel_col)).unique()


In [162]:
import os

In [165]:
os.makedirs("data/yago3-dl/seltab/subtabs", exist_ok=True)

In [239]:
list(Path("data/yago3-dl/seltab/subtabs/").glob("**/*.parquet"))

[PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_actedIn_wasBornIn.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_actedIn_playsFor.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_actedIn_edited.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_actedIn_wasBornOnDate.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_actedIn_graduatedFrom.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_wasBornIn_playsFor.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_wasBornIn_edited.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_wasBornIn_wasBornOnDate.parquet'),
 PosixPath('data/yago3-dl/seltab/subtabs/yago_seltab_wordnet_person/wordnet_person_wasBornIn_p