In [1]:
cd ~/bench

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
import pandas as pd
import polars as pl
from pathlib import Path

In [3]:
# Directories that the cells below read from and write to.
raw_data_path, yadl_data_path, od_data_path = (
    Path(location)
    for location in (
        "data/source_tables/raw",
        "data/source_tables/yadl",
        "data/source_tables/open_data_us",
    )
)

# Movies improved

In [14]:
import ast


def clean_genres(ll):
    """Return the name of the first genre encoded in ``ll``.

    Parameters
    ----------
    ll : str
        Python-literal text describing a list of dicts, each with a
        ``"name"`` key, e.g. ``"[{'id': 16, 'name': 'Animation'}]"``.

    Returns
    -------
    str
        The ``"name"`` of the first entry, or ``""`` when the value cannot
        be evaluated as a literal, is not subscriptable, or is an empty list.
    """
    try:
        genres = ast.literal_eval(ll)
    except ValueError:
        # Raised for non-literal input, including non-string cells such as
        # NaN/None; handled like the other clean_* helpers do.
        return ""
    try:
        return genres[0]["name"]
    except (IndexError, TypeError):
        # Empty list, or a literal that is not a list of dicts.
        return ""


def clean_production_companies(ll):
    """Return the name of the first production company encoded in ``ll``.

    Parameters
    ----------
    ll : str
        Python-literal text describing a list of dicts, each with a
        ``"name"`` key.

    Returns
    -------
    str
        The ``"name"`` of the first entry, or ``""`` when the value cannot
        be parsed, is not subscriptable, or is an empty list.
    """
    try:
        companies = ast.literal_eval(ll)
    except ValueError:
        return ""
    except SyntaxError:
        # Show the offending value for inspection, then fall back to the
        # empty string. Without this return the parsed value would be
        # unbound below and the lookup would raise UnboundLocalError.
        print(ll)
        return ""
    try:
        return companies[0]["name"]
    except (IndexError, TypeError):
        # Empty list, or a literal that is not a list of dicts.
        return ""


def clean_production_country(ll):
    """Return the ``"iso_3166_1"`` value of the first country encoded in ``ll``.

    ``ll`` is evaluated with ``ast.literal_eval``. The empty string is
    returned when evaluation raises ``ValueError``, when the result is an
    empty list, or when the result cannot be subscripted.
    """
    try:
        countries = ast.literal_eval(ll)
    except ValueError:
        return ""

    try:
        first_code = countries[0]["iso_3166_1"]
    except (IndexError, TypeError):
        return ""
    else:
        return first_code


def clean_spoken_language(ll):
    """Return the ``"name"`` of the first spoken language encoded in ``ll``.

    ``ll`` is evaluated with ``ast.literal_eval``. The empty string is
    returned when evaluation raises ``ValueError``, when the result is an
    empty list, or when the result cannot be subscripted.
    """
    try:
        languages = ast.literal_eval(ll)
    except ValueError:
        return ""

    try:
        first_name = languages[0]["name"]
    except (IndexError, TypeError):
        return ""
    else:
        return first_name

In [15]:
# Read the movie revenues table with polars, then hand it to pandas for the
# column clean-up in the following cells.
df = (
    pl.read_parquet("data/source_tables/movie_revenues.parquet")
    .to_pandas()
)

In [16]:
# Columns that are not kept in the output table.
columns_to_discard = [
    "belongs_to_collection",
    "homepage",
    "imdb_id",
    "overview",
    "tagline",
    "poster_path",
    "release_date",
]
df = df.drop(columns=columns_to_discard)

# Reduce each list-of-dicts column to a single value taken from its first
# entry, using the matching clean_* helper.
for column, cleaner in (
    ("genres", clean_genres),
    ("production_companies", clean_production_companies),
    ("production_countries", clean_production_country),
    ("spoken_languages", clean_spoken_language),
):
    df[column] = df[column].apply(cleaner)

In [17]:
# Keep the "yago3_col_to_embed" column as the single key column
# ("col_to_embed") and discard the other two entity columns.
df = df.drop(columns=["yago4_col_to_embed", "raw_entities"]).rename(
    columns={"yago3_col_to_embed": "col_to_embed"}
)

# Round-trip through polars to remove the rows whose key is null.
df = pl.from_pandas(df).drop_nulls(subset="col_to_embed").to_pandas()

# Wrap every key in angle brackets, e.g. Toy_Story -> <Toy_Story>.
df = df.assign(col_to_embed="<" + df["col_to_embed"] + ">")

In [18]:
# Sanity check: (rows, columns) of the movies table after the clean-up above.
df.shape

(7397, 20)

In [19]:
# Show the cleaned movies table (the rendering is truncated by pandas).
df

Unnamed: 0,adult,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,year,col_to_embed,target
0,False,30000000,Animation,862,en,Toy Story,21.946943,Pixar Animation Studios,US,373554033.0,81.0,English,Released,Toy Story,False,7.7,5415.0,1995,<Toy_Story>,8.572353
1,False,65000000,Adventure,8844,en,Jumanji,17.015539,TriStar Pictures,US,262797249.0,104.0,English,Released,Jumanji,False,6.9,2413.0,1995,<Jumanji>,8.419621
2,False,16000000,Comedy,31357,en,Waiting to Exhale,3.859495,Twentieth Century Fox Film Corporation,US,81452156.0,127.0,English,Released,Waiting to Exhale,False,6.1,34.0,1995,<Waiting_to_Exhale>,7.910903
3,False,0,Comedy,11862,en,Father of the Bride Part II,8.387519,Sandollar Productions,US,76578911.0,106.0,English,Released,Father of the Bride Part II,False,5.7,173.0,1995,<Father_of_the_Bride_Part_II>,7.884109
4,False,60000000,Action,949,en,Heat,17.924927,Regency Enterprises,US,187436818.0,170.0,English,Released,Heat,False,7.7,1886.0,1995,<Heat_(1995_film)>,8.272855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7392,False,750000,Crime,280422,ru,Все и сразу,0.201582,Кинокомпания «Lunapark»,RU,3.0,0.0,Pусский,Released,All at Once,False,6.0,4.0,2014,<All_at_Once_(film)>,0.477121
7393,False,0,Drama,240789,ru,Чудо,0.436028,Central Partnership,RU,50656.0,110.0,Pусский,Released,The Miracle,False,6.3,3.0,2009,<The_Miracle_(2009_film)>,4.704631
7394,False,800000,Comedy,62757,en,Dikari,0.903061,,,1328612.0,100.0,Pусский,Released,Savages,False,5.8,6.0,2006,<Savages_(2006_film)>,6.123398
7395,False,2000000,Romance,63281,en,Про любоff,0.121844,Profit,RU,1268793.0,107.0,Pусский,Released,Pro Lyuboff,False,4.0,3.0,2010,<Pro_Lyuboff_(2010_film)>,6.103391


In [25]:
# Persist the cleaned movies table in the YADL folder.
df.to_parquet(yadl_data_path / "movies_large-yadl.parquet")

In [30]:
# Reload the movies table saved above, this time as a polars frame.
df_yadl = pl.read_parquet(yadl_data_path / "movies_large-yadl.parquet")

# "Depleted" YADL variant: only the target and the key column.
df_yadl.select("target", "col_to_embed").write_parquet(
    yadl_data_path / "movies_large-yadl-depleted.parquet"
)

# Vote variant: "vote_average" takes the place of "target" and is then
# removed as a separate column.
df_vote = df_yadl.with_columns(pl.col("vote_average").alias("target")).drop(
    "vote_average"
)
df_vote.write_parquet(yadl_data_path / "movies_vote_large-yadl.parquet")
df_vote.select("target", "col_to_embed").write_parquet(
    yadl_data_path / "movies_vote_large-yadl-depleted.parquet"
)

# Depleted open-data variants: "original_title" plus the target.
df_yadl.select("original_title", "target").write_parquet(
    od_data_path / "movies_large-depleted-open_data.parquet"
)
df_vote.select("original_title", "target").write_parquet(
    od_data_path / "movies_vote_large-depleted-open_data.parquet"
)

In [31]:
# Sanity check: the vote variant has one column fewer than the movies table
# because "vote_average" was moved into "target".
df_vote.shape

(7397, 19)

In [32]:
# Show the folder the YADL files above were written to.
yadl_data_path

PosixPath('data/source_tables/yadl')

# US Accidents improved

In [4]:
# Load the US accidents table as a polars frame (this rebinds `df`, which
# held the movies table in the previous section).
df = pl.read_parquet("data/source_tables/us_accidents.parquet")

In [5]:
# Split "raw_entities" on "," into a struct whose fields are named
# "County" and "State"; the struct is unnested into real columns below.
# NOTE(review): the split does not strip whitespace — confirm the values
# do not carry a leading space after the comma.
county_state = (
    pl.col("raw_entities")
    .str.split(",")
    .list.to_struct()
    .struct.rename_fields(["County", "State"])
    .alias("County")
)

# Key column wrapped in angle brackets.
bracketed_key = ("<" + pl.col("col_to_embed") + ">").alias("col_to_embed")

df = (
    df.with_columns(county_state)
    .unnest("County")
    .drop(["raw_entities", "yago4_col_to_embed"])
    .rename({"yago3_col_to_embed": "col_to_embed"})
    .with_columns(bracketed_key)
)

In [6]:
# Restrict to the rows of year 2021; "Year" is constant afterwards, so it is dropped.
df_2021 = df.filter(pl.col("Year") == 2021).drop("Year")

In [39]:
# Full 2021 table, written to both output folders.
df_2021.write_parquet(od_data_path / "us_accidents_2021-open_data.parquet")
df_2021.write_parquet(yadl_data_path / "us_accidents_2021-yadl.parquet")

# Depleted variants: County/State (open data) or the key column (YADL),
# each together with the target.
df_2021.select(["County", "State", "target"]).write_parquet(
    od_data_path / "us_accidents_2021-depleted-open_data_County.parquet"
)
df_2021.select(["col_to_embed", "target"]).write_parquet(
    yadl_data_path / "us_accidents_2021-yadl-depleted.parquet"
)

In [40]:
# Full multi-year table, written to both output folders.
df.write_parquet(od_data_path / "us_accidents_large-open_data.parquet")
df.write_parquet(yadl_data_path / "us_accidents_large-yadl.parquet")

# Depleted variants; "Year" is kept here because it still varies.
df.select(["County", "State", "Year", "target"]).write_parquet(
    od_data_path / "us_accidents_large-depleted-open_data_County.parquet"
)
df.select(["Year", "col_to_embed", "target"]).write_parquet(
    yadl_data_path / "us_accidents_large-yadl-depleted.parquet"
)

# Company employees

In [8]:
# Copy the raw company employees table, unchanged, into the open-data folder.
df = pl.read_parquet(raw_data_path / "company_employees.parquet")
df.write_parquet(od_data_path / "company_employees-open_data.parquet")

In [10]:
# Depleted YADL variant, built from the YADL file read here.
# NOTE(review): this file name ends in "-depleted-yadl", whereas the other
# datasets in this notebook use "-yadl-depleted" — confirm which one
# downstream code expects before renaming anything.
df_yadl = pl.read_parquet(yadl_data_path / "company_employees-yadl.parquet")
df_yadl.select("target", "col_to_embed").write_parquet(
    yadl_data_path / "company_employees-depleted-yadl.parquet"
)

# Depleted open-data variant, built from the raw table loaded in the previous cell.
df.select("name", "target").write_parquet(
    od_data_path / "company_employees-depleted-open_data.parquet"
)

# Housing prices

In [13]:
df = pl.read_parquet(yadl_data_path / "housing_prices-yadl.parquet")

# NOTE(review): this writes back to the very file that was just read —
# confirm that the rewrite is intentional.
df.write_parquet(yadl_data_path / "housing_prices-yadl.parquet")

# NOTE(review): a "-yadl" file (still containing "col_to_embed") is placed
# in the raw folder — confirm that this is the intended destination.
df.write_parquet(raw_data_path / "housing_prices-yadl.parquet")

# Open-data copy: same table without the key column.
df.drop(["col_to_embed"]).write_parquet(
    od_data_path / "housing_prices-open_data.parquet"
)

In [14]:
# Show the housing prices table that was just written out.
df

RegionID,SizeRank,City,RegionType,StateName,Code,Metro,County,State,target,col_to_embed
i64,i64,str,str,str,str,str,str,str,f64,str
6181,0,"""New York""","""city""","""NY""","""NY""","""New York-Newar…","""Queens County""","""New York""",5.854972,"""<New_York_City…"
17222,43,"""Buffalo""","""city""","""NY""","""NY""","""Buffalo-Cheekt…","""Erie County""","""New York""",5.334042,"""<Buffalo,_New_…"
832063,45,"""Rochester""","""city""","""NY""","""NY""","""Rochester, NY""","""Monroe County""","""New York""",5.322101,"""<Rochester,_Ne…"
34937,145,"""Yonkers""","""city""","""NY""","""NY""","""New York-Newar…","""Westchester Co…","""New York""",5.779882,"""<Yonkers,_New_…"
7353,149,"""Syracuse""","""city""","""NY""","""NY""","""Syracuse, NY""","""Onondaga Count…","""New York""",5.233641,"""<Syracuse,_New…"
40779,229,"""Schenectady""","""city""","""NY""","""NY""","""Albany-Schenec…","""Schenectady Co…","""New York""",5.435997,"""<Schenectady,_…"
37074,246,"""Albany""","""city""","""NY""","""NY""","""Albany-Schenec…","""Albany County""","""New York""",5.455378,"""<Albany,_New_Y…"
34819,698,"""White Plains""","""city""","""NY""","""NY""","""New York-Newar…","""Westchester Co…","""New York""",5.825207,"""<White_Plains,…"
831538,699,"""Binghamton""","""city""","""NY""","""NY""","""Binghamton, NY…","""Broome County""","""New York""",5.200401,"""<Binghamton,_N…"
26114,705,"""New Rochelle""","""city""","""NY""","""NY""","""New York-Newar…","""Westchester Co…","""New York""",5.908062,"""<New_Rochelle,…"


# US Elections

In [7]:
df = pl.read_parquet(yadl_data_path / "us_elections-yadl.parquet")

# Keep only the rows whose party is "DEMOCRAT".
df_dem = df.filter(pl.col("party") == "DEMOCRAT")

# Full tables: with the key column (YADL) and without it (open data).
df_dem.write_parquet(yadl_data_path / "us_elections_dem-yadl.parquet")
df_dem.drop(["col_to_embed"]).write_parquet(
    od_data_path / "us_elections_dem-open_data.parquet"
)

# Depleted variants: the target plus a single identifying column.
df_dem.select("target", "col_to_embed").write_parquet(
    yadl_data_path / "us_elections_dem-yadl-depleted.parquet"
)
df_dem.select("target", "county_name").write_parquet(
    od_data_path / "us_elections_dem-open_data-depleted.parquet"
)