In [1]:
# Move the kernel's working directory one level up; the `src.yago.utils` import
# and the relative `data/...` path used later are resolved from there.
%cd ..

/home/soda/rcappuzz/work/prepare-data-lakes


In [2]:
import polars as pl
import pandas as pd
from pathlib import Path
import src.yago.utils as utils
import numpy as np
import datetime

In [3]:
# Show up to 30 characters of each string cell when polars prints a table.
cfg = pl.Config()
cfg.set_fmt_str_lengths(30)

polars.config.Config

In [4]:
# Locations of the YAGO3 parquet dumps.
yago_path = Path("/storage/store3/work/jstojano/yago3/")
facts1_path = yago_path / "facts_parquet/yago_updated_2022_part1"
facts2_path = yago_path / "facts_parquet/yago_updated_2022_part2"

# yagoTypes (part 1).
fname = "yagoTypes"
yagotypes_path = facts1_path / f"{fname}.tsv.parquet"
df_types = utils.import_from_yago(yagotypes_path)

# yagoFacts (part 2): only the categorical object column is kept.
fname = "yagoFacts"
yagofacts_path = facts2_path / f"{fname}.tsv.parquet"
yagofacts = utils.import_from_yago(yagofacts_path).drop("num_object")

# yagoLiteralFacts (part 2): prefer the numeric object when present, otherwise
# fall back to the categorical one; the result is stored in `cat_object`.
fname = "yagoLiteralFacts"
yagoliteralfacts_path = facts2_path / f"{fname}.tsv.parquet"
literal_object = (
    pl.when(pl.col("num_object").is_not_null())
    .then(pl.col("num_object"))
    .otherwise(pl.col("cat_object"))
    .alias("cat_object")
)
yagoliteralfacts = (
    utils.import_from_yago(yagoliteralfacts_path)
    .with_columns(literal_object)
    .drop("num_object")
)

# yagoDateFacts (part 2): keep the text before "^^", parse it leniently as a
# datetime, reduce it to a date and store it back as a string. Rows whose value
# does not parse become null and are dropped.
fname = "yagoDateFacts"
yagodatefacts_path = facts2_path / f"{fname}.tsv.parquet"
date_object = (
    pl.col("cat_object")
    .str.split("^^")
    .list.first()
    .str.to_datetime(strict=False)
    .dt.date()
    .cast(pl.Utf8)
    .alias("cat_object")
)
yagodatefacts = (
    utils.import_from_yago(yagodatefacts_path)
    .with_columns(date_object)
    .drop_nulls("cat_object")
    .drop("num_object")
)

# Stack the three fact tables and discard the fact identifier.
df_facts = pl.concat([yagofacts, yagoliteralfacts, yagodatefacts]).drop("id")

In [5]:
# Preview the stacked facts table (subject, predicate, cat_object).
df_facts

subject,predicate,cat_object
str,str,str
"""<Elizabeth_II>""","""<isLeaderOf>""","""<Royal_Numismatic_Society>"""
"""<Andrew_Harvey_(politician)>""","""<isLeaderOf>""","""<New_Denmark,_New_Brunswick>"""
"""<Andranik>""","""<isLeaderOf>""","""<Armenian_fedayi>"""
"""<Yakubu_Oseni>""","""<isLeaderOf>""","""<Kogi_State>"""
"""<Wan_Saiful_Wan_Jan>""","""<isLeaderOf>""","""<Perbadanan_Tabung_Pendidikan…"
"""<Kurt_Diebner>""","""<isLeaderOf>""","""<German_nuclear_weapons_progr…"
"""<Achyuta_Samanta>""","""<isLeaderOf>""","""<Kandhamal_district>"""
"""<Elisabetta_Belloni>""","""<isLeaderOf>""","""<Dipartimento_delle_Informazi…"
"""<Mike_Harris_Jr.>""","""<isLeaderOf>""","""<Wellesley,_Ontario>"""
"""<Jenny_Manson>""","""<isLeaderOf>""","""<Jewish_Voice_for_Labour>"""


In [6]:
# wikipediaLabels sits in part 2 of the dump; build its path from `facts2_path`
# like the other part-2 tables instead of repeating the directory literal.
path_labels = Path(facts2_path, "wikipediaLabels.tsv.parquet")
df_labels = utils.import_from_yago(path_labels)

In [7]:
# Subjects that occur in at least one fact.
fact_subjects = df_facts.get_column("subject")

filtered_labels = (
    df_labels.lazy()
    .filter(pl.col("subject").is_in(fact_subjects))
    # Restrict to English labels BEFORE deduplicating. `unique` keeps a single
    # row per (subject, predicate); deduplicating first could keep a
    # non-English row, and the subject would then be lost by the "@eng" filter
    # applied in the following cell.
    .filter(pl.col("cat_object").str.ends_with("@eng"))
    # keep="first" with maintain_order=True makes the retained label
    # reproducible across runs (the default keep="any" is arbitrary).
    .unique(["subject", "predicate"], keep="first", maintain_order=True)
    .collect()
)

In [8]:
# Keep English labels and remove the trailing "@eng" language tag.
# `strip_suffix` removes only that tag, so a label that itself contains "@"
# is preserved in full (splitting on "@" would cut it at the first "@").
# NOTE: this cell overwrites its own input; re-run the previous cell before
# running it a second time, otherwise the filter finds no "@eng" rows.
filtered_labels = filtered_labels.filter(
    pl.col("cat_object").str.ends_with("@eng")
).with_columns(pl.col("cat_object").str.strip_suffix("@eng"))

In [9]:
# Inspect the English labels kept for subjects that appear in df_facts.
filtered_labels

id,subject,predicate,cat_object,num_object
str,str,str,str,f64
"""<id_XMlLz7H!Er_SBd_eh?ORlGiPb…","""<Eddie_Kaye_Thomas>""","""rdfs:label""","""Eddie Kaye Thomas""",
"""<id_9majCmBIA0_SBd_1LqivY8vPd…","""<Lev_Vlassenko>""","""rdfs:label""","""Lev Vlassenko""",
"""<id_271Se8XZQk_SBd_mi4EI2kla2…","""<Göran_Lagerberg>""","""rdfs:label""","""Goeran Lagerberg""",
"""<id_oqBvksmjY8_SBd_oqBvksmjY8…","""<Castletownshend>""","""rdfs:label""","""Castletownshend""",
"""<id_DSazflcOli_UVh_AGniDKuJUh…","""<Lawrence_Hubert>""","""skos:prefLabel""","""Lawrence Hubert""",
"""<id_TfoRu6C!AS_SBd_f2LjgEN89E…","""<Ellen_Hardin_Walworth>""","""rdfs:label""","""Ellen Hardin Walworth""",
"""<id_DTKbRWR5pe_SBd_iILr9wb02x…","""<Anna_Wangenheim>""","""rdfs:label""","""Anna Wangenheim""",
"""<id_NH?65?dDHb_UVh_NH?65?dDHb…","""<Mbegha>""","""skos:prefLabel""","""Mbegha""",
"""<id_UxGCF5bY4t_SBd_4CojpqYAEm…","""<Harry_Brockhouse>""","""rdfs:label""","""Harry Brockhouse""",
"""<id_JI8AsQ!PaG_UVh_aX0OdtGsKp…","""<Samsun–Ceyhan_pipeline>""","""skos:prefLabel""","""Samsun–Ceyhan pipeline""",


# Movies

In [10]:
# Movie metadata CSV, addressed relative to the current working directory.
movies_csv = "data/base_tables/the-movies-dataset/movies_metadata.csv"

# Schema is inferred from the first 10000 rows; parse errors are tolerated
# (presumably because some rows do not fit the inferred schema — TODO confirm).
df_movies = pl.read_csv(movies_csv, infer_schema_length=10000, ignore_errors=True)

In [11]:
# Identifier of the WordNet class whose name mentions "movie".
is_wordnet_movie = pl.col("cat_object").str.starts_with("<wordnet_") & pl.col(
    "cat_object"
).str.contains("movie")

wordnet_movie_type = (
    df_types.lazy()
    .filter(is_wordnet_movie)
    .unique("cat_object")
    .collect()
    .get_column("cat_object")
    .item()  # fails unless exactly one distinct class matches
)

In [12]:
# Rows of df_types whose type equals the WordNet movie class selected above.
facts_movies = df_types.filter(pl.col("cat_object") == wordnet_movie_type)

In [13]:
# Attach labels to movie subjects. Both frames carry a `cat_object` column, so
# the one coming from `filtered_labels` arrives as `cat_object_right`.
movie_label_pairs = (
    facts_movies.lazy()
    .join(filtered_labels.lazy(), on="subject")
    .select("subject", pl.col("cat_object_right").alias("label"))
    .unique()
)
movies_labels = movie_label_pairs.collect()

In [14]:
# Inspect the distinct (subject, label) pairs for subjects typed as movies.
movies_labels

subject,label
str,str
"""<Français_Pour_une_Nuit>""","""Francais Pour une Nuit"""
"""<Return_to_Boggy_Creek>""","""Return to Boggy Creek"""
"""<The_Wrong_Man>""","""The Wrong Man"""
"""<Ghajini_(2008_film)>""","""Ghajini"""
"""<We_Monsters>""","""We Monsters"""
"""<Confessions_of_a_Police_Capt…","""Confessions of a Police Capta…"
"""<Elisabeth_und_der_Narr>""","""Elisabeth und der Narr"""
"""<The_Fighting_Kentuckian>""","""The Fighting Kentuckian"""
"""<Saptapadi_(1981_film)>""","""Saptapadi (1981 film)"""
"""<One-Two,_Soldiers_Were_Going…","""One-Two"""


In [15]:
# Link dataset rows to YAGO subjects by exact equality between the dataset's
# `original_title` and the YAGO label.
title_matches = df_movies.lazy().join(
    movies_labels.lazy(),
    left_on="original_title",
    right_on="label",
)
filtered_movies = title_matches.collect()

In [16]:
import ast


def clean_genres(ll):
    """Return the name of the first entry of a stringified list of dicts.

    Parameters
    ----------
    ll : str or None
        Python-literal text such as ``"[{'id': 80, 'name': 'Crime'}]"``.

    Returns
    -------
    The ``"name"`` value of the first entry, or ``""`` when the input is not a
    string, cannot be parsed as a Python literal, is an empty list, or its
    first entry has no ``"name"`` key.
    """
    # Missing values (None / NaN) have nothing to parse.
    if not isinstance(ll, str):
        return ""
    try:
        # Parsing happens inside the ``try`` so malformed text yields "" instead
        # of aborting the whole column transformation.
        entries = ast.literal_eval(ll)
        return entries[0]["name"]
    except (ValueError, SyntaxError, IndexError, KeyError, TypeError):
        return ""

In [17]:
# Inspect the dataset rows that matched a YAGO movie label (with their `subject`).
filtered_movies

adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,subject
bool,str,i64,str,str,i64,str,str,str,str,f64,str,str,str,str,i64,f64,str,str,str,str,bool,f64,i64,str
false,,1200000,"""[{'id': 80, 'name': 'Crime'},…",,22527,"""tt0051207""","""en""","""The Wrong Man""","""True story of an innocent man…",8.56081,"""/o3L64eO7RegwjTFlp5a872aZvjq.…","""[{'name': 'Warner Bros.', 'id…","""[{'iso_3166_1': 'US', 'name':…","""1956-12-22""",2000000,105.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Somewhere...somewhere there m…","""The Wrong Man""",false,6.9,105,"""<The_Wrong_Man>"""
false,,0,"""[{'id': 10749, 'name': 'Roman…",,41666,"""tt0108597""","""en""","""The Wrong Man""","""US merchant sailor Alex Walke…",1.972377,"""/wmWLfgo2WWVB3WHkGYgmQhE4neZ.…","""[{'name': 'Viacom Productions…","""[{'iso_3166_1': 'US', 'name':…","""1993-09-05""",0,104.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Accused of a crime he didn't …","""The Wrong Man""",false,4.8,5,"""<The_Wrong_Man>"""
false,,9100000,"""[{'id': 53, 'name': 'Thriller…","""http://www.rememberghajini.co…",14070,"""tt1166100""","""hi""","""Ghajini""","""Sanjay a rich tycoon sufferin…",4.23908,"""/qACwdCtoaatv8qge2sSWlYpz3yN.…","""[{'name': 'Geetha Arts', 'id'…","""[{'iso_3166_1': 'IN', 'name':…","""2008-12-25""",76000000,183.0,"""[{'iso_639_1': 'hi', 'name': …","""Released""",,"""Ghajini""",false,6.9,84,"""<Ghajini_(2008_film)>"""
false,,3200000,"""[{'id': 28, 'name': 'Action'}…",,69636,"""tt0449951""","""ta""","""Ghajini""","""The film's story is around a …",0.802284,"""/1QYjtWpKyNTX83RE8C4DRKYW6b6.…","""[{'name': 'Sri Saravanaa Crea…","""[{'iso_3166_1': 'IN', 'name':…","""2005-10-14""",0,150.0,"""[{'iso_639_1': 'ta', 'name': …","""Released""",,"""Ghajini""",false,6.6,13,"""<Ghajini_(2008_film)>"""
false,,0,"""[{'id': 27, 'name': 'Horror'}…",,28438,"""tt0036104""","""en""","""The Leopard Man""","""When a leopard escapes during…",1.494525,"""/AgypGd78CZ5KEe7N3vYRFOxlDR8.…","""[{'name': 'RKO Radio Pictures…","""[{'iso_3166_1': 'US', 'name':…","""1943-05-08""",0,66.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Woman alone the victims of st…","""The Leopard Man""",false,6.4,18,"""<The_Leopard_Man>"""
false,,0,"""[{'id': 18, 'name': 'Drama'},…",,108639,"""tt0460724""","""en""","""Blush""","""In 2004 Wim Vandekeybus shot …",0.122913,"""/4PoDgbDwjUuTJqu816MM0EznXZu.…","""[{'name': 'CCCP', 'id': 2852}…","""[{'iso_3166_1': 'FR', 'name':…","""2005-12-07""",0,55.0,"""[{'iso_639_1': 'en', 'name': …","""Released""",,"""Blush""",false,4.0,1,"""<Blush_(2019_film)>"""
false,,0,"""[{'id': 99, 'name': 'Document…",,434336,"""tt4456270""","""en""","""The Sunshine Makers""","""The story of Nicholas Sand an…",0.787708,"""/cr9Nddno9yoqjFgIfpbEZoHDmiv.…","""[]""","""[]""","""2015-11-01""",0,101.0,"""[]""","""Released""",,"""The Sunshine Makers""",false,7.4,5,"""<The_Sunshine_Makers_(1935_fi…"
false,,0,"""[]""",,44666,"""tt0114583""","""it""","""Surprise""","""While his lover is sleeping, …",0.001252,"""/tfYsCJd9lsZjfS2LVqqHoxw1QqF.…","""[]""","""[]""","""1996-02-21""",0,6.0,"""[]""","""Released""",,"""Surprise""",false,0.0,0,"""<Surprise_(2015_film)>"""
false,,0,"""[{'id': 28, 'name': 'Action'}…",,1647,"""tt0292506""","""en""","""The Recruit""","""A brilliant CIA trainee must …",11.485473,"""/hQuFWLIKUwWUWYhk6KHsBOKKeHc.…","""[{'name': 'Birnbaum / Barber …","""[{'iso_3166_1': 'US', 'name':…","""2003-01-31""",101191884,115.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Trust. Betrayal. Deception. I…","""The Recruit""",false,6.2,545,"""<The_Recruit>"""
false,,0,"""[{'id': 35, 'name': 'Comedy'}…",,53734,"""tt0305583""","""en""","""A Foreign Affair""","""Two brothers need household h…",0.85986,"""/osdQcxKpF6ZVeOLTle5JUIcT66k.…","""[]""","""[]""","""2003-01-21""",0,82.0,"""[]""","""Released""","""They're not looking for love,…","""A Foreign Affair""",false,5.8,4,"""<A_Foreign_Affair>"""


In [18]:
def fun(row: pl.Series):
    """Extract the ``name`` of the first entry from each stringified list.

    Every element of ``row`` is parsed with ``ast.literal_eval``. The first
    item of the resulting list is taken, its struct fields are renamed
    positionally to ``["id", "name"]``, and the ``name`` field is returned.
    """
    parsed = row.map_elements(ast.literal_eval)
    first_entry = parsed.list[0]
    as_id_name = first_entry.struct.rename_fields(["id", "name"])
    return as_id_name.struct.field("name")

In [19]:
# Separate handle for the cleaning steps below; `filtered_movies` keeps the raw
# join result.
copied = filtered_movies.clone()

In [20]:
# Inspect the copy before the nested columns are flattened.
copied

adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,subject
bool,str,i64,str,str,i64,str,str,str,str,f64,str,str,str,str,i64,f64,str,str,str,str,bool,f64,i64,str
false,,1200000,"""[{'id': 80, 'name': 'Crime'},…",,22527,"""tt0051207""","""en""","""The Wrong Man""","""True story of an innocent man…",8.56081,"""/o3L64eO7RegwjTFlp5a872aZvjq.…","""[{'name': 'Warner Bros.', 'id…","""[{'iso_3166_1': 'US', 'name':…","""1956-12-22""",2000000,105.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Somewhere...somewhere there m…","""The Wrong Man""",false,6.9,105,"""<The_Wrong_Man>"""
false,,0,"""[{'id': 10749, 'name': 'Roman…",,41666,"""tt0108597""","""en""","""The Wrong Man""","""US merchant sailor Alex Walke…",1.972377,"""/wmWLfgo2WWVB3WHkGYgmQhE4neZ.…","""[{'name': 'Viacom Productions…","""[{'iso_3166_1': 'US', 'name':…","""1993-09-05""",0,104.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Accused of a crime he didn't …","""The Wrong Man""",false,4.8,5,"""<The_Wrong_Man>"""
false,,9100000,"""[{'id': 53, 'name': 'Thriller…","""http://www.rememberghajini.co…",14070,"""tt1166100""","""hi""","""Ghajini""","""Sanjay a rich tycoon sufferin…",4.23908,"""/qACwdCtoaatv8qge2sSWlYpz3yN.…","""[{'name': 'Geetha Arts', 'id'…","""[{'iso_3166_1': 'IN', 'name':…","""2008-12-25""",76000000,183.0,"""[{'iso_639_1': 'hi', 'name': …","""Released""",,"""Ghajini""",false,6.9,84,"""<Ghajini_(2008_film)>"""
false,,3200000,"""[{'id': 28, 'name': 'Action'}…",,69636,"""tt0449951""","""ta""","""Ghajini""","""The film's story is around a …",0.802284,"""/1QYjtWpKyNTX83RE8C4DRKYW6b6.…","""[{'name': 'Sri Saravanaa Crea…","""[{'iso_3166_1': 'IN', 'name':…","""2005-10-14""",0,150.0,"""[{'iso_639_1': 'ta', 'name': …","""Released""",,"""Ghajini""",false,6.6,13,"""<Ghajini_(2008_film)>"""
false,,0,"""[{'id': 27, 'name': 'Horror'}…",,28438,"""tt0036104""","""en""","""The Leopard Man""","""When a leopard escapes during…",1.494525,"""/AgypGd78CZ5KEe7N3vYRFOxlDR8.…","""[{'name': 'RKO Radio Pictures…","""[{'iso_3166_1': 'US', 'name':…","""1943-05-08""",0,66.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Woman alone the victims of st…","""The Leopard Man""",false,6.4,18,"""<The_Leopard_Man>"""
false,,0,"""[{'id': 18, 'name': 'Drama'},…",,108639,"""tt0460724""","""en""","""Blush""","""In 2004 Wim Vandekeybus shot …",0.122913,"""/4PoDgbDwjUuTJqu816MM0EznXZu.…","""[{'name': 'CCCP', 'id': 2852}…","""[{'iso_3166_1': 'FR', 'name':…","""2005-12-07""",0,55.0,"""[{'iso_639_1': 'en', 'name': …","""Released""",,"""Blush""",false,4.0,1,"""<Blush_(2019_film)>"""
false,,0,"""[{'id': 99, 'name': 'Document…",,434336,"""tt4456270""","""en""","""The Sunshine Makers""","""The story of Nicholas Sand an…",0.787708,"""/cr9Nddno9yoqjFgIfpbEZoHDmiv.…","""[]""","""[]""","""2015-11-01""",0,101.0,"""[]""","""Released""",,"""The Sunshine Makers""",false,7.4,5,"""<The_Sunshine_Makers_(1935_fi…"
false,,0,"""[]""",,44666,"""tt0114583""","""it""","""Surprise""","""While his lover is sleeping, …",0.001252,"""/tfYsCJd9lsZjfS2LVqqHoxw1QqF.…","""[]""","""[]""","""1996-02-21""",0,6.0,"""[]""","""Released""",,"""Surprise""",false,0.0,0,"""<Surprise_(2015_film)>"""
false,,0,"""[{'id': 28, 'name': 'Action'}…",,1647,"""tt0292506""","""en""","""The Recruit""","""A brilliant CIA trainee must …",11.485473,"""/hQuFWLIKUwWUWYhk6KHsBOKKeHc.…","""[{'name': 'Birnbaum / Barber …","""[{'iso_3166_1': 'US', 'name':…","""2003-01-31""",101191884,115.0,"""[{'iso_639_1': 'en', 'name': …","""Released""","""Trust. Betrayal. Deception. I…","""The Recruit""",false,6.2,545,"""<The_Recruit>"""
false,,0,"""[{'id': 35, 'name': 'Comedy'}…",,53734,"""tt0305583""","""en""","""A Foreign Affair""","""Two brothers need household h…",0.85986,"""/osdQcxKpF6ZVeOLTle5JUIcT66k.…","""[]""","""[]""","""2003-01-21""",0,82.0,"""[]""","""Released""","""They're not looking for love,…","""A Foreign Affair""",false,5.8,4,"""<A_Foreign_Affair>"""


In [21]:
# Replace each stringified list-of-dicts column with the `name` of its first
# entry. Struct fields are renamed by position, so the list passed to
# `rename_fields` has to follow the key order of the dicts in that column.
copied = copied.with_columns(
    pl.col(["genres", "spoken_languages", "production_countries"])
    .map_elements(ast.literal_eval)
    .list[0]
    .struct.rename_fields(["id", "name"])
    .struct.field("name")
    # `struct.field` names its output "name"; restore each input column's own
    # name. `name.keep()` replaces the deprecated `map_alias(lambda x: x)`.
    .name.keep()
).with_columns(
    # production_companies dicts list `name` before `id`.
    pl.col("production_companies")
    .map_elements(ast.literal_eval)
    .list[0]
    .struct.rename_fields(["name", "id"])
    .struct.field("name")
    .alias("production_companies")
)

  .map_alias(lambda x: x)


In [22]:
# Columns retained for the final table, in output order.
kept_columns = [
    "adult",
    "budget",
    "genres",
    "original_language",
    "original_title",
    "popularity",
    "production_companies",
    "production_countries",
    "release_date",
    "revenue",
    "runtime",
    "spoken_languages",
    "status",
    "title",
    "vote_average",
    "vote_count",
    "subject",
]

# `vote_average` becomes the prediction target and the YAGO subject becomes the
# column to embed.
new_names = {"vote_average": "target", "subject": "col_to_embed"}

copied = copied.select(kept_columns).rename(new_names)
# Rows without a target value are discarded.
copied = copied.drop_nulls("target")

In [23]:
# yagoLabels sits in part 1 of the dump. `facts1_path` is already bound by the
# loading cell at the top of the notebook, so the path constants are not
# rebuilt here from the same literals.
fname = "yagoLabels"
yagolabels_path = Path(facts1_path, f"{fname}.tsv.parquet")
df_yagolabels = utils.import_from_yago(yagolabels_path)

In [24]:
# Reminder of the (subject, label) layout before joining with yagoLabels.
movies_labels.head()

subject,label
str,str
"""<Français_Pour_une_Nuit>""","""Francais Pour une Nuit"""
"""<Return_to_Boggy_Creek>""","""Return to Boggy Creek"""
"""<The_Wrong_Man>""","""The Wrong Man"""
"""<Ghajini_(2008_film)>""","""Ghajini"""
"""<We_Monsters>""","""We Monsters"""


In [27]:
# Keep movie (subject, label) pairs that have a `skos:prefLabel` row in
# yagoLabels and whose label does not end in "film)".
keep_pair = (pl.col("predicate") == "skos:prefLabel") & ~pl.col(
    "label"
).str.ends_with("film)")

collected_labels = (
    movies_labels.lazy()
    .join(df_yagolabels.lazy(), on="subject")
    .filter(keep_pair)
    .select("subject", "label")
    .unique()
    .collect()
)

In [29]:
# Count, per dataset title, how many rows match a collected label; ascending
# order puts the unambiguous titles (count == 1) first.
(
    df_movies.lazy()
    .join(collected_labels.lazy(), left_on="original_title", right_on="label")
    .group_by("title")
    # `pl.len()` is the replacement for the deprecated `pl.count()`.
    .agg(pl.len().alias("count"))
    .sort("count")
    .collect()
)

  .group_by("title").agg(pl.count().alias("count")).sort("count")


title,count
str,u32
"""So Evil, So Young""",1
"""Special""",1
"""Hombre""",1
"""Catfish""",1
"""Ratcatcher""",1
"""The Million Dollar Duck""",1
"""Morris from America""",1
"""Portrait of Jason""",1
"""Death Warrant""",1
"""Let Him Have It""",1
