In [43]:
# Switch to the project root so that `src.yago.utils` can be imported and the
# relative `data/...` paths used later in this notebook resolve.
# NOTE(review): this is an absolute, user-specific path; adjust it when running elsewhere.
%cd /home/soda/rcappuzz/work/prepare-data-lakes

/home/soda/rcappuzz/work/prepare-data-lakes


In [2]:
import src.yago.utils as utils

In [3]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from tqdm import tqdm
import os
from itertools import combinations

# Plot styling for the whole notebook: "paper" context on a white grid.
sns.set_theme(context="paper", style="whitegrid")

In [4]:
# Raise the string display width to 150 characters when polars renders frames
# (the subject/type identifiers shown in this notebook are long).
cfg = pl.Config()
cfg.set_fmt_str_lengths(150)

polars.config.Config

In [5]:
# Locations of the two YAGO3 parquet dumps.
yago_path = Path("/storage/store3/work/jstojano/yago3/")
facts1_path = yago_path / "facts_parquet/yago_updated_2022_part1"
facts2_path = yago_path / "facts_parquet/yago_updated_2022_part2"

# Type assertions (subject -> rdf:type -> class).
fname = "yagoTypes"
yagotypes_path = facts1_path / f"{fname}.tsv.parquet"
df_types = utils.import_from_yago(yagotypes_path)

# Plain facts: only the categorical object column is kept.
fname = "yagoFacts"
yagofacts_path = facts2_path / f"{fname}.tsv.parquet"
yagofacts = utils.import_from_yago(yagofacts_path).drop("num_object")

# Literal facts: use the numeric object when there is one, otherwise the
# categorical one, and store the result in `cat_object`.
fname = "yagoLiteralFacts"
yagoliteralfacts_path = facts2_path / f"{fname}.tsv.parquet"
yagoliteralfacts = (
    utils.import_from_yago(yagoliteralfacts_path)
    .with_columns(
        pl.when(pl.col("num_object").is_not_null())
        .then(pl.col("num_object"))
        .otherwise(pl.col("cat_object"))
        .alias("cat_object")
    )
    .drop("num_object")
)

# Date facts: keep the text before the first "^^", parse it leniently as a
# datetime (unparseable values become null), reduce it to a date and store it
# back as a string. Rows whose date could not be parsed are dropped.
fname = "yagoDateFacts"
yagodatefacts_path = facts2_path / f"{fname}.tsv.parquet"
yagodatefacts = (
    utils.import_from_yago(yagodatefacts_path)
    .with_columns(
        pl.col("cat_object")
        .str.split("^^")
        .list.first()
        .str.to_datetime(strict=False)
        .dt.date()
        .cast(pl.Utf8)
        .alias("cat_object")
    )
    .drop_nulls("cat_object")
    .drop("num_object")
)

# One long table with every fact; the `id` column is not needed downstream.
df_facts = pl.concat([yagofacts, yagoliteralfacts, yagodatefacts]).drop("id")

In [6]:
# Wikipedia labels table from the second YAGO dump.
wikilabels_path = (
    yago_path / "facts_parquet/yago_updated_2022_part2/wikipediaLabels.tsv.parquet"
)
wikipedia_labels = utils.import_from_yago(wikilabels_path)

# Binary tables

In [None]:
# One example row for every distinct WordNet type.
(
    df_types
    .filter(pl.col("cat_object").str.starts_with("<wordnet_"))
    .unique(subset="cat_object")
)

id,subject,predicate,cat_object,num_object
str,str,str,str,f64
"""<id_JI8AsQ!PaG_KCM_EAgj4NkKcn>""","""<Samsun–Ceyhan_pipeline>""","""rdf:type""","""<wordnet_grapevine_107223635>""",
"""<id_Fzy5lex?CQ_KCM_OPKOJq?nv?>""","""<Integrated_Authority_File>""","""rdf:type""","""<wordnet_identifier_107270601>""",
"""<id_sPDFWJYYVp_KCM_3GUpyK13nw>""","""<Bangladesh_Institute_of_Development_Studies>""","""rdf:type""","""<wordnet_agency_108337324>""",
"""<id_?bKQnNnU9A_KCM_a!jbWAuqtm>""","""<Weekend_Wogan>""","""rdf:type""","""<wordnet_radio_106277135>""",
"""<id_c0zIlzNkI1_KCM_XkTmKAWM0l>""","""<The_World's_Work>""","""rdf:type""","""<wordnet_magazine_106595351>""",
"""<id_76vA?Xuq2P_KCM_U3WpDsXdJX>""","""<Royal_Belgium_Yachting_Federation>""","""rdf:type""","""<wordnet_federation_108303504>""",
"""<id_veJ9evHLQ4_KCM_2Kde5F!SKd>""","""<\u0022Ode-to-Napoleon\u0022_hexachord>""","""rdf:type""","""<wordnet_chord_113874927>""",
"""<id_Bcc9DygMmG_KCM_Rd6HR5MqSL>""","""<Fishing_industry_in_South_Korea>""","""rdf:type""","""<wordnet_fishery_103350880>""",
"""<id_CwoHivXDu0_KCM_j94lM4e10S>""","""<Rote_Zora_(group)>""","""rdf:type""","""<wordnet_terrorist_organization_108392137>""",
"""<id_K52wdYzc6s_KCM_EMM6ynnqog>""","""<Zorlu_Energy_Wind_Power_Project>""","""rdf:type""","""<wordnet_wind_farm_104586761>""",


In [9]:
# Quick check of the bracket-stripping regex on a sample predicate name.
pattern = re.compile(r"<{1}([a-zA-Z0-9]+)>{1}")
m = pattern.sub(r"\1", "<isLeaderOf>")
m

'isLeaderOf'

In [10]:
# Output folder for the per-predicate two-column tables written by the next cell.
# NOTE(review): `dest_path` is assigned a different folder further down in this
# notebook; re-run this cell right before the binary export.
dest_path = Path("data/yago3-dl/binary_update")

In [37]:
# Write one two-column ("binary") table per predicate: the subject and the
# object values of that predicate, with the object column named after the
# predicate (angle brackets removed).

# Make sure the output folder exists before the first write.
dest_path.mkdir(parents=True, exist_ok=True)

# Grouping by a *list* of columns makes the group key a tuple, so `[0]` works
# the same way as in the other group_by loops of this notebook and does not
# depend on how the installed polars version represents single-column keys.
for group_key, group in df_facts.group_by(["predicate"]):
    col_name = group_key[0].replace("<", "").replace(">", "")

    # Store the objects as floats only when *every* non-null value parses as a
    # number: a non-strict cast turns unparseable values into nulls, so an
    # unchanged null count means nothing was lost. Otherwise keep the strings.
    # This replaces a try/except around a strict cast and therefore does not
    # rely on which exception class polars raises for a failed cast.
    objects = group["cat_object"]
    objects_as_float = objects.cast(pl.Float64, strict=False)
    if objects_as_float.null_count() == objects.null_count():
        objects = objects_as_float

    new_df = pl.DataFrame({"subject": group["subject"], col_name: objects})
    df_name = f"binary-{col_name}.parquet"
    new_df.write_parquet(Path(dest_path, df_name))

# Full tables

In [7]:
# (subject, type) pairs restricted to WordNet types.
subjects_with_wordnet = df_types.filter(
    pl.col("cat_object").str.starts_with("<wordnet_")
).select(
    pl.col("subject"),
    pl.col("cat_object").alias("type"),
)
# Number of distinct WordNet types (used as the progress-bar total later on).
n_groups = subjects_with_wordnet.unique(subset="type").height

In [8]:
def clean_string(string_to_clean):
    """Strip the angle brackets from every ``<name>`` token in a string.

    A token is ``<`` followed by one or more ASCII letters, digits or
    underscores and a closing ``>``; it is replaced by the bare name.
    Anything that does not match that shape is left untouched.

    Parameters
    ----------
    string_to_clean : str
        Text such as ``"<isLeaderOf>"``.

    Returns
    -------
    str
        The text with matching tokens unwrapped, e.g. ``"isLeaderOf"``.
    """
    return re.sub(r"<([a-zA-Z0-9_]+)>", r"\1", string_to_clean)

In [9]:
# Maximum number of object values kept per (subject, predicate) pair when the
# wide tables are built; each kept value becomes its own column.
max_fields = 2
# Output folder for the wide per-type tables.
# NOTE(review): this reuses the name `dest_path` from the binary-table section,
# so whichever of the two cells ran last decides where files are written.
dest_path = Path("data/yago3-dl/wordnet_full")

In [200]:
# Build one wide table per WordNet type: one row per subject of that type and,
# for every predicate, up to `max_fields` columns named `<predicate>_<i>`
# holding the first object values found for that subject.

# Make sure the output folder exists before the first write.
dest_path.mkdir(parents=True, exist_ok=True)

# Grouping by a *list* of columns makes the group key a tuple, so `[0]` is
# valid regardless of how the installed polars version represents
# single-column group keys.
for type_key, this_df in tqdm(
    subjects_with_wordnet.group_by(["type"]), total=n_groups
):
    clean_type = clean_string(type_key[0])
    joined_df = df_facts.join(this_df.select(pl.col("subject", "type")), on="subject")
    if len(joined_df) == 0:
        continue

    # Start from the list of subjects and left-join one block of columns per predicate.
    base_df = joined_df.select(pl.col("subject").unique()).lazy()
    for predicate_key, grp in joined_df.group_by(["predicate"]):
        this_predicate = clean_string(predicate_key[0])
        grp = (
            grp.group_by("subject")
            .agg(pl.col("cat_object"))
            .select(
                pl.col("subject"),
                pl.col("cat_object")
                .list.head(max_fields)
                .list.to_struct(
                    # Size the struct from the widest list. The default
                    # ("first_non_null") sizes it from the first row only, which
                    # silently drops the second value of every subject whenever
                    # the first subject happens to have a single value.
                    n_field_strategy="max_width",
                    fields=[f"{this_predicate}_{_i}" for _i in range(max_fields)],
                )
                .alias(this_predicate),
            )
            .unnest(this_predicate)
        )

        # Convert a value column to float only when every non-null value parses
        # as a number (a non-strict cast that adds no nulls lost nothing).
        # The join key `subject` is left alone.
        for col in grp.columns:
            if col == "subject":
                continue
            as_float = grp[col].cast(pl.Float64, strict=False)
            if as_float.null_count() == grp[col].null_count():
                grp = grp.with_columns(as_float)

        base_df = base_df.join(grp.lazy(), on="subject", how="left")

    # The subject column is named after the type in the output table.
    base_df = base_df.rename({"subject": clean_type})
    df_name = f"wordnet_full-{clean_type}.parquet"
    base_df.collect().write_parquet(Path(dest_path, df_name))

100%|██████████| 1015/1015 [02:43<00:00,  6.20it/s]


In [298]:
# Number of WordNet types per subject, subjects with the most types first.
(
    subjects_with_wordnet
    .group_by("subject")
    .agg(pl.count())
    .sort("count", descending=True)
)

subject,count
str,u32
"""<NMS_Elisabeta>""",5
"""<NMS_Amiral_Murgescu>""",5
"""<Turtle_ship>""",5
"""<Japanese_aircraft_carrier_Akagi>""",5
"""<Panokseon>""",5
"""<Russian_submarine_Nerpa_(K-152)>""",5
"""<HMS_Cadiz_(D79)>""",5
"""<NMS_Grivița>""",5
"""<NMS_Vedenia>""",5
"""<Sinking_of_MV_Nyerere>""",5


## Build subtables

In [223]:
# Folder holding the wide per-type parquet tables that are split into subtables below.
# NOTE(review): the wide tables are written to "data/yago3-dl/wordnet_full" earlier
# in this notebook, while this points at "data/yadl/wordnet_full" -- confirm the
# folder was renamed/moved in between.
base_path = Path("data/yadl/wordnet_full")

In [259]:
# Number of value columns combined into each subtable.
comb_size = 2
# A column combination is kept only when strictly more than this many rows
# have a non-null value in at least one of its columns.
min_occurrences = 100

In [292]:
# Number of parquet files directly inside `base_path` (progress-bar total).
total_ = len(list(base_path.glob("*.parquet")))

In [294]:
# For every wide table, write one subtable per combination of `comb_size`
# value columns that is populated often enough.
for pth in tqdm(base_path.glob("*.parquet"), total=total_):
    table_name = pth.stem
    # Subtables of a table go into a folder named after it.
    new_dir = base_path / table_name
    new_dir.mkdir(parents=True, exist_ok=True)

    tgt_table = pl.read_parquet(pth)
    if tgt_table.height == 0:
        continue

    # The first column identifies the row; the remaining ones are candidates.
    key_columns = tgt_table.columns[:1]
    target_columns = tgt_table.columns[1:]

    # Keep the combinations for which more than `min_occurrences` rows have a
    # non-null value in at least one of the combined columns.
    frequent_combinations = [
        comb
        for comb in combinations(target_columns, comb_size)
        if tgt_table.select(
            pl.any_horizontal(pl.col(comb).is_not_null()).sum()
        ).item()
        > min_occurrences
    ]

    # Write one parquet file per surviving combination, restricted to the rows
    # that have at least one non-null value in the combined columns.
    for comb in frequent_combinations:
        new_df = tgt_table.select(key_columns + list(comb)).filter(
            pl.any_horizontal(pl.col(comb).is_not_null())
        )
        fname = f"{table_name}-{'-'.join(comb)}.parquet"
        new_df.write_parquet(new_dir / fname)

100%|██████████| 1015/1015 [11:28<00:00,  1.47it/s] 


## Focus on US Counties

In [None]:
# Subject / WordNet-type pairs.
(
    df_types
    .filter(pl.col("cat_object").str.starts_with("<wordnet_"))
    .select("subject", "cat_object")
)

In [16]:
# Population facts of every subject typed as a WordNet county.
(
    df_facts.lazy()
    .join(
        df_types.lazy().filter(
            pl.col("cat_object")
            .str.to_lowercase()
            .str.contains("wordnet_county_108546183")
        ),
        on="subject",
    )
    .filter(pl.col("predicate") == "<hasNumberOfPeople>")
    .collect()
)

subject,predicate,cat_object,id,predicate_right,cat_object_right,num_object
str,str,str,str,str,str,f64
"""<Cooper_County,_Missouri>""","""<hasNumberOfPeople>""","""16670.0""","""<id_33BVL1atVr_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Franklin_County,_Maine>""","""<hasNumberOfPeople>""","""29456.0""","""<id_SbYltDUWQ6_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Scotts_Bluff_County,_Nebraska>""","""<hasNumberOfPeople>""","""36970.0""","""<id_b!oto0iRRh_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<York_County,_Virginia>""","""<hasNumberOfPeople>""","""65464.0""","""<id_TZj7GNgwsS_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Lake_County,_Oregon>""","""<hasNumberOfPeople>""","""7895.0""","""<id_Kh2okdGjIT_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Osage_County,_Kansas>""","""<hasNumberOfPeople>""","""15766.0""","""<id_DN6UQi10A9_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Kusilvak_Census_Area,_Alaska>""","""<hasNumberOfPeople>""","""7459.0""","""<id_JE7Awgvj4r_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Roger_Mills_County,_Oklahoma>""","""<hasNumberOfPeople>""","""3647.0""","""<id_tZVgoHYCRI_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Rock_County,_Nebraska>""","""<hasNumberOfPeople>""","""1526.0""","""<id_?Uhzp9rAd2_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",
"""<Albany_County,_Wyoming>""","""<hasNumberOfPeople>""","""36299.0""","""<id_6rtttWHiis_KCM_RDiAOfRZhu>""","""rdf:type""","""<wordnet_county_108546183>""",


In [17]:
# US elections table; its `col_to_embed` column is matched against YAGO
# subjects in the cells below.
df_elections = pl.read_parquet("data/us_elections-depleted-yadl.parquet")

In [23]:
# One row per distinct county identifier in the elections table.
df_elections.unique(subset="col_to_embed")

target,party,col_to_embed
f64,str,str
2.641474,"""DEMOCRAT""","""<Adams_County,_Iowa>"""
4.161578,"""DEMOCRAT""","""<Grayson_County,_Texas>"""
4.122805,"""DEMOCRAT""","""<Frederick_County,_Maryland>"""
2.190332,"""DEMOCRAT""","""<Hodgeman_County,_Kansas>"""
4.148387,"""DEMOCRAT""","""<Isabella_County,_Michigan>"""
3.009876,"""DEMOCRAT""","""<Calhoun_County,_Iowa>"""
4.07214,"""DEMOCRAT""","""<Goodhue_County,_Minnesota>"""
3.510545,"""DEMOCRAT""","""<Marquette_County,_Wisconsin>"""
2.666518,"""DEMOCRAT""","""<Norton,_Virginia>"""
4.741301,"""DEMOCRAT""","""<Oklahoma_County,_Oklahoma>"""


In [24]:
# Population facts for the counties that appear in the elections table, with
# the population also available as a float in `target`.
#
# Changes with respect to the previous version of this cell:
# * the misspelled `.drop("cat_objeect")` is gone: it never removed anything
#   (the `cat_object` column is still dropped explicitly further down) and
#   newer polars versions raise on unknown column names;
# * a semi join keeps the matching facts without first multiplying them by the
#   number of election rows per county, so `target`/`party` never need to be
#   dropped. The resulting set of rows and columns is unchanged.
df_population = (
    df_facts.lazy()
    .filter(pl.col("predicate") == "<hasNumberOfPeople>")
    .join(
        df_elections.lazy().select("col_to_embed"),
        left_on="subject",
        right_on="col_to_embed",
        how="semi",
    )
    .with_columns(pl.col("cat_object").cast(pl.Float64).alias("target"))
    # The facts table itself may contain repeated rows.
    .unique()
    .collect()
)

In [28]:
# Subjects that have more than one population value, with all their values.
(
    df_population.join(
        df_population.group_by("subject")
        .agg(pl.count())
        .filter(pl.col("count") > 1),
        on="subject",
    )
    .sort("subject")
)

subject,predicate,cat_object,target,count
str,str,str,f64,u32
"""<Acadia_Parish,_Louisiana>""","""<hasNumberOfPeople>""","""61773.0""",61773.0,2
"""<Acadia_Parish,_Louisiana>""","""<hasNumberOfPeople>""","""57576.0""",57576.0,2
"""<Accomack_County,_Virginia>""","""<hasNumberOfPeople>""","""33413.0""",33413.0,2
"""<Accomack_County,_Virginia>""","""<hasNumberOfPeople>""","""33164.0""",33164.0,2
"""<Adair_County,_Iowa>""","""<hasNumberOfPeople>""","""7496.0""",7496.0,2
"""<Adair_County,_Iowa>""","""<hasNumberOfPeople>""","""76822.0""",76822.0,2
"""<Adair_County,_Missouri>""","""<hasNumberOfPeople>""","""24977.0""",24977.0,2
"""<Adair_County,_Missouri>""","""<hasNumberOfPeople>""","""25314.0""",25314.0,2
"""<Adair_County,_Oklahoma>""","""<hasNumberOfPeople>""","""19495.0""",19495.0,2
"""<Adair_County,_Oklahoma>""","""<hasNumberOfPeople>""","""22286.0""",22286.0,2


In [47]:
# One row per county: rows are sorted in descending order before keeping the
# first row of each subject, then the target is replaced by its base-10 log
# and the key column is renamed to match the elections table.
df_1 = (
    df_population.sort(by=["subject", "target"], descending=True)
    .unique(subset=["subject"], keep="first")
    .drop("cat_object", "predicate")
    .rename({"subject": "col_to_embed"})
    .with_columns(pl.col("target").log10())
)

In [48]:
# Preview the per-county log10 population table.
df_1

col_to_embed,target
str,f64
"""<Grayson_County,_Virginia>""",4.191255
"""<Montgomery_County,_Kentucky>""",4.423229
"""<Smith_County,_Tennessee>""",4.248268
"""<Henderson_County,_Illinois>""",3.914502
"""<Victoria_County,_Texas>""",4.938485
"""<Eaton_County,_Michigan>""",5.01559
"""<Drew_County,_Arkansas>""",4.267383
"""<Richland_Parish,_Louisiana>""",4.321826
"""<Stone_County,_Mississippi>""",4.250078
"""<Bayfield_County,_Wisconsin>""",4.210051


In [45]:
# Save the per-county population table.
# NOTE(review): this is written to the current working directory, whereas the
# elections table is read from "data/" -- confirm the intended location.
df_1.write_parquet("us_county_population-depleted-yadl.parquet")