In [2]:
%cd ..

/home/soda/rcappuzz/work/prepare-data-lakes


In [3]:
%load_ext autoreload
%autoreload 2
import src.yago.utils as utils

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import polars as pl
from pathlib import Path
import numpy as np
from tqdm import tqdm

In [5]:
yago_path = Path("/storage/store3/work/jstojano/yago3/")
facts1_path = Path(yago_path, "facts_parquet/yago_updated_2022_part1")
facts2_path = Path(yago_path, "facts_parquet/yago_updated_2022_part2")

## Reading yago triplets

In [6]:
fname = "yagoFacts"
yagofacts_path = Path(facts2_path, f"{fname}.tsv.parquet")
yagofacts = utils.import_from_yago(yagofacts_path, engine="polars")

In [7]:
fname = "yagoTypes"
yagotypes_path = Path(facts1_path, f"{fname}.tsv.parquet")
yagotypes = utils.import_from_yago(yagotypes_path, engine="polars")


## Some profiling

### Predicates

In [8]:
unique_facts = utils.find_unique_predicates(yagofacts)
print(unique_facts)

shape: (37, 1)
┌───────────────────────┐
│ predicate             │
│ ---                   │
│ str                   │
╞═══════════════════════╡
│ <playsFor>            │
│ <hasOfficialLanguage> │
│ <wasBornIn>           │
│ <isKnownFor>          │
│ …                     │
│ <graduatedFrom>       │
│ <hasWonPrize>         │
│ <exports>             │
│ <isAffiliatedTo>      │
└───────────────────────┘


These predicates will be used as `attributes` in the wide-form version of YAGO. 

In [9]:
count_facts=utils.count_occurrences_by_columns(yagofacts, "predicate")
print(count_facts)

shape: (37, 2)
┌───────────────────────┬─────────┐
│ predicate             ┆ count   │
│ ---                   ┆ ---     │
│ str                   ┆ u32     │
╞═══════════════════════╪═════════╡
│ <isLocatedIn>         ┆ 3289399 │
│ <isCitizenOf>         ┆ 2390826 │
│ <hasGender>           ┆ 2016273 │
│ <isAffiliatedTo>      ┆ 1569882 │
│ …                     ┆ …       │
│ <hasOfficialLanguage> ┆ 989     │
│ <exports>             ┆ 714     │
│ <hasNeighbor>         ┆ 598     │
│ <imports>             ┆ 481     │
└───────────────────────┴─────────┘


Selecting only the top 10 facts to work with. 

In [10]:
top10facts = count_facts.head(10)

### Types

In [11]:
unique_types = utils.count_occurrences_by_columns(yagotypes, "cat_object")

In [15]:
top10types= unique_types.head(10)

While looking at entity types, count the number of types each entity has and select the first for each of them. 

In [16]:
(yagotypes.lazy().groupby(
    "subject"
).agg(
    [
        pl.first("cat_object"),
        pl.count()
    ]
).sort("count", descending=True).collect())

subject,cat_object,count
str,str,u32
"""<First_Geneva_…","""<wikicat_Treat…",244
"""<Third_Geneva_…","""<wikicat_Treat…",243
"""<Second_Geneva…","""<wikicat_Treat…",242
"""<Fourth_Geneva…","""<wikicat_Treat…",241
"""<Treaty_of_Ber…","""<wikicat_Treat…",240
"""<Convention_fo…","""<wikicat_Treat…",231
"""<Hague_Hijacki…","""<wikicat_Treat…",229
"""<Montreal_Prot…","""<wikicat_Treat…",229
"""<Vienna_Conven…","""<wikicat_Treat…",228
"""<Supplementary…","""<wikicat_Treat…",228


In [15]:
most_frequent_types=(yagotypes.lazy().filter(
    pl.col("cat_object").is_in(top10types["cat_object"])
).groupby("subject").agg(
    pl.first()
).select(pl.col("subject")).collect())

In [16]:
yagotypes.lazy().filter(
    pl.col("cat_object").is_in(top10types["cat_object"])
).select(
    pl.col("subject").unique()
).collect()

subject
str
"""<Panoptic_Mode..."
"""<Fei_Shi_(Thre..."
"""<Novačka>"""
"""<Aron_Mkungilw..."
"""<Duwarka>"""
"""<Saint-David,_..."
"""<Morrisson_(si..."
"""<2010–11_Greek..."
"""<Yolanda_López..."
"""<Billy_Thomson..."


## Filter facts to include only frequent types






















Here I am filtering the subjects to keep only those whose type is in `most_frequent_types`. 

In [17]:
yagofacts_frequenttypes=(yagofacts.lazy().filter(
    pl.col("subject").is_in(most_frequent_types["subject"])
).collect())

In [18]:
pair_tab_list = []
groups = yagofacts_frequenttypes.groupby("predicate")
for group_name, group in groups:
    print(group_name)
    pair_tab=group.select(
    [    pl.col("subject").alias("subject"),
        pl.col("cat_object").alias(group_name),
    ])
    pair_tab_list.append(pair_tab)

<hasWonPrize>
<isLocatedIn>
<hasOfficialLanguage>
<imports>
<graduatedFrom>
<playsFor>
<actedIn>
<wasBornIn>
<hasCurrency>
<happenedIn>
<isCitizenOf>
<exports>
<hasChild>
<isLeaderOf>
<hasCapital>
<directed>
<wroteMusicFor>
<diedIn>
<created>
<isConnectedTo>
<participatedIn>
<worksAt>
<hasGender>
<dealsWith>
<hasAcademicAdvisor>
<hasWebsite>
<isMarriedTo>
<livesIn>
<owns>
<isAffiliatedTo>
<isPoliticianOf>
<isKnownFor>
<edited>
<influences>
<hasMusicalRole>
<isInterestedIn>


Here I am executing a self-join on `yagofacts_frequenttypes` to generate pairs of `predicate`s: these are the cases in 
which one `subject` has multiple `predicate`s, and these predicates co-occur. 

The reason why I am extracting these pairs is because this ensures that I can build tables by joining on the `subject`. 

In [19]:
cooccurring_predicates=(yagofacts_frequenttypes.lazy().join(
    yagofacts_frequenttypes.lazy(), left_on="subject",
    right_on="subject", how="left"
).select(
    [
        pl.col("predicate"),
        pl.col("predicate_right")
    ]
).groupby("predicate_right").agg(
    pl.first("predicate")
).collect())

In [20]:
cooccurring_predicates.head(10)

predicate_right,predicate
str,str
"""<wasBornIn>""","""<isLeaderOf>"""
"""<livesIn>""","""<isLeaderOf>"""
"""<isLeaderOf>""","""<isLeaderOf>"""
"""<imports>""","""<owns>"""
"""<isMarriedTo>""","""<isLeaderOf>"""
"""<isKnownFor>""","""<isLeaderOf>"""
"""<hasChild>""","""<isLeaderOf>"""
"""<exports>""","""<owns>"""
"""<hasWebsite>""","""<isLeaderOf>"""
"""<hasCurrency>""","""<owns>"""


## Plotting co-occurring pairs

Here I am trying to build a histogram that shows the pairs of columns that appear together the most, and those that never
co-occur. First off, I am looking for "at least one co-occurrence". As I'll show later, this is not very insightful 
because there is a huge variance in the co-occurrence frequency. 

In [21]:
sample_cooccurring=(yagofacts_frequenttypes.lazy().join(
    yagofacts_frequenttypes.lazy(), left_on="subject",
    right_on="subject", how="left"
).select(
    [
        pl.col("predicate"),
        pl.col("predicate_right")
    ]
).groupby(
    [
        "predicate",
        "predicate_right"
    ]
).all().collect())

In [22]:
sample_cooccurring

predicate,predicate_right
str,str
"""<imports>""","""<hasCurrency>"""
"""<isAffiliatedT...","""<diedIn>"""
"""<directed>""","""<participatedI..."
"""<graduatedFrom...","""<isKnownFor>"""
"""<hasWebsite>""","""<hasWonPrize>"""
"""<hasMusicalRol...","""<playsFor>"""
"""<hasWonPrize>""","""<isAffiliatedT..."
"""<isKnownFor>""","""<isAffiliatedT..."
"""<isPoliticianO...","""<isCitizenOf>"""
"""<livesIn>""","""<hasWonPrize>"""


In [24]:
cooccurring_predicates=(yagofacts_frequenttypes.lazy().join(
    yagofacts_frequenttypes.lazy(), left_on="subject",
    right_on="subject", how="left"
).select(
    [
        pl.col("predicate"),
        pl.col("predicate_right")
    ]
).collect())

The size of this self-join is pretty large.

In [25]:
cooccurring_predicates.shape

(152441548, 2)

Then, I am counting the number of occurrences of each pair of predicates. 

In [26]:
count_cooccurring_predicates=(cooccurring_predicates.lazy().groupby(
    ["predicate","predicate_right"]
).agg(
    pl.count()
).sort("count", descending=True).collect())

As the plot above attests, there is a huge difference in the number of occurrences of each pair: the most frequent pairs
are found millions of times, the least frequent pairs appear as few as once. 

To account for this, the heatmap has a log-normalized color bar.

## Join predicates with types

In [38]:
types_predicates=(yagotypes.lazy().filter(
        pl.col("cat_object").is_in(top10types["cat_object"])
    ).join(
        yagofacts.lazy(),
        left_on="subject",
        right_on="subject",
        how="left"
    ).select(
        pl.col("subject"),
        pl.col("cat_object").alias("type"),
        pl.col("predicate_right").alias("predicate")
    ).unique(
        ).drop_nulls(
            ).select(
                [
                    pl.col("type"),
                    pl.col("predicate")
                ]
            ).groupby(
                [
                    pl.col("type"),
                    pl.col("predicate")
                ]
            ).agg(
                [
                    pl.count()
                ]
            ).sort("count", descending=True).collect())

## Building tables

In [43]:
groups = yagotypes.lazy().join(
    top10types.lazy().select(pl.col("cat_object")),
    on="cat_object",
    how="inner"
).groupby(
    pl.col("cat_object")
).all().select(
    [
        pl.col("cat_object"), 
        pl.col("subject")
    ]
    ).collect()

In [44]:
tabs_by_type = {}
for tup in groups.iter_rows():
    type_str, values = tup
    tab = pl.DataFrame(
        {
        "type": [type_str]*len(values),
        "subject": values
        },
    )
    tabs_by_type[type_str]=tab
    print(type_str)

<wordnet_movie_106613686>
<wordnet_person_100007846>
<wordnet_album_106591815>
<wordnet_administrative_district_108491826>
<wordnet_artist_109812338>
<wordnet_company_108058098>
<wordnet_event_100029378>
<wordnet_officeholder_110371450>
<wordnet_season_115239579>
<wikicat_Living_people>


In [45]:
tabs_by_type[type_str]

type,subject
str,str
"""<wikicat_Livin...","""<Slavko_Aleksi..."
"""<wikicat_Livin...","""<Taras_Senkiv_..."
"""<wikicat_Livin...","""<Victor_Shaka>..."
"""<wikicat_Livin...","""<Jessica_Yelli..."
"""<wikicat_Livin...","""<Eugen_Tomac>"""
"""<wikicat_Livin...","""<Settimio_Todi..."
"""<wikicat_Livin...","""<Stephen_Edwar..."
"""<wikicat_Livin...","""<John_Aitpilla..."
"""<wikicat_Livin...","""<Typhoon_(rapp..."
"""<wikicat_Livin...","""<Akbar_Ahmad>"""


In [46]:
groups_predicates = yagofacts.groupby("predicate")

In [52]:
def convert_df(df: pl.DataFrame, predicate):
    return df.select(
        pl.col("subject"),
        pl.col("cat_object").alias(predicate)
    ).lazy()

In [98]:
full_tables_by_type = {}
for type_str, tab in tqdm(tabs_by_type.items(), total=len(tabs_by_type)):
    full_tables_by_type[type_str] = tab.clone()
    tqdm.write(type_str)
    for pred_name, pred_group in yagofacts.groupby("predicate"):
        if pred_name in G.neighbors(type_str) and pred_name in count_facts["predicate"]:
        # if pred_name in G.neighbors(type_str) and pred_name in count_facts[:15]["predicate"]:
            transformed_tab = convert_df(pred_group, pred_name)
            full_tables_by_type[type_str]= full_tables_by_type[type_str].lazy().join(
                transformed_tab.lazy(),
                on="subject",
                how="left"
            )
    full_tables_by_type[type_str].collect()

  0%|          | 0/10 [00:00<?, ?it/s]

<wordnet_movie_106613686>


 10%|█         | 1/10 [00:01<00:15,  1.70s/it]

<wordnet_person_100007846>


: 

: 

### Saving tables

In [None]:
dest_path = Path("data/yago3-dl")

In [None]:
dest_path.exists()

True

In [None]:
for type_str, tab in full_tables_by_type.items():
    fname = f"yago_typetab_{type_str}.parquet"
    tab.collect().write_parquet(Path(dest_path, fname))