In [1]:
cd ..

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
import featuretools as ft
from woodwork.logical_types import Categorical, Double
import pandas as pd
import polars as pl
import pickle

In [3]:
es = ft.EntitySet()

In [4]:
candidates = pickle.load(open("generated_candidates_movies-prepared.pickle", "rb"))["minhash"]
hash_, cand = candidates.popitem()

In [5]:
source_md = vars(cand)["source_metadata"]
candidate_md = vars(cand)["candidate_metadata"]

In [6]:
source_md

{'full_path': '/storage/store/work/rcappuzz/ken_datasets/the-movies-dataset/movies-prepared.parquet',
 'hash': '310b7e82655f2beaba69e8b482d43b65',
 'df_name': 'movies-prepared',
 'source_dl': 'queries',
 'license': '',
 'path_metadata': '/storage/store/work/rcappuzz/metadata/queries/310b7e82655f2beaba69e8b482d43b65.json'}

In [7]:
def get_logical_types(df):
    num_types = df.select_dtypes("number").columns
    cat_types = [_ for _ in df.columns if _ not in num_types]
    logical_types = { col: Categorical for col in cat_types }
    logical_types.update({ col: Double for col in num_types })    
    return logical_types

In [8]:
source_table = pd.read_parquet(source_md["full_path"]).drop_duplicates("col_to_embed")
source_types = get_logical_types(source_table)
candidate_table = pd.read_parquet(candidate_md["full_path"]).reset_index(names=["index"])
candidate_types = get_logical_types(candidate_table)


In [9]:
es = es.add_dataframe(
    dataframe_name=source_md["df_name"],
    dataframe=source_table,
    index="col_to_embed",
    logical_types=source_types
)

In [10]:
es = es.add_dataframe(
    dataframe_name=candidate_md["df_name"],
    dataframe=candidate_table,
    index="index",
    logical_types=candidate_types

)

In [11]:
es = es.add_relationship(source_md["df_name"], cand.left_on[0], candidate_md["df_name"],cand.right_on[0])

In [12]:
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name=source_md["df_name"])


In [13]:
feature_defs

[<Feature: adult>,
 <Feature: budget>,
 <Feature: genres>,
 <Feature: id>,
 <Feature: original_language>,
 <Feature: original_title>,
 <Feature: popularity>,
 <Feature: production_companies>,
 <Feature: production_countries>,
 <Feature: release_date>,
 <Feature: runtime>,
 <Feature: spoken_languages>,
 <Feature: status>,
 <Feature: title>,
 <Feature: video>,
 <Feature: vote_average>,
 <Feature: vote_count>,
 <Feature: target>,
 <Feature: COUNT(yago_binary_isLocatedIn)>,
 <Feature: MODE(yago_binary_isLocatedIn.isLocatedIn)>,
 <Feature: NUM_UNIQUE(yago_binary_isLocatedIn.isLocatedIn)>]

In [20]:
cc = [c for c in source_table.columns if c in feature_matrix.columns]

In [31]:
cat_cols = feature_matrix.select_dtypes(exclude="number").columns
num_cols = feature_matrix.select_dtypes("number").columns
# pl.from_pandas(.astype("str"))

In [43]:
df = feature_matrix.copy()

In [44]:
new_df = df.copy()
cat_cols = df.select_dtypes(exclude="number").columns
num_cols = df.select_dtypes("number").columns
for col in cat_cols:
    new_df[col] = new_df[col].astype(str)
for col in num_cols:
    new_df[col] = new_df[col].astype(float)



In [45]:
pl.from_pandas(new_df.reset_index())

col_to_embed,adult,budget,genres,id,original_language,original_title,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,title,video,vote_average,vote_count,target,COUNT(yago_binary_isLocatedIn),MODE(yago_binary_isLocatedIn.isLocatedIn),NUM_UNIQUE(yago_binary_isLocatedIn.isLocatedIn)
cat,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,f64,f64,f64,f64,str,f64
"""<Toy_Story>""","""False""","""30000000""","""Animation""","""862""","""en""","""Toy Story""","""21.946943""","""Pixar Animatio…","""US""","""1995""",81.0,"""English""","""Released""","""Toy Story""","""False""",7.7,5415.0,8.572353,2.0,"""<United_States…",1.0
"""<Jumanji>""","""False""","""65000000""","""Adventure""","""8844""","""en""","""Jumanji""","""17.015539""","""TriStar Pictur…","""US""","""1995""",104.0,"""English""","""Released""","""Jumanji""","""False""",6.9,2413.0,8.419621,1.0,"""<United_States…",1.0
"""<Heat_(1995_fi…","""False""","""60000000""","""Action""","""949""","""en""","""Heat""","""17.924927""","""Regency Enterp…","""US""","""1995""",170.0,"""English""","""Released""","""Heat""","""False""",7.7,1886.0,8.272855,2.0,"""<United_States…",1.0
"""<Sudden_Death_…","""False""","""35000000""","""Action""","""9091""","""en""","""Sudden Death""","""5.23158""","""Universal Pict…","""US""","""1995""",106.0,"""English""","""Released""","""Sudden Death""","""False""",5.5,174.0,7.80855,2.0,"""<United_States…",1.0
"""<Balto_(film)>…","""False""","""0""","""Family""","""21032""","""en""","""Balto""","""12.140733""","""Universal Pict…","""US""","""1995""",78.0,"""English""","""Released""","""Balto""","""False""",7.1,423.0,7.054932,2.0,"""<United_States…",1.0
"""<Nixon_(film)>…","""False""","""44000000""","""History""","""10858""","""en""","""Nixon""","""5.092""","""Hollywood Pict…","""US""","""1995""",192.0,"""English""","""Released""","""Nixon""","""False""",7.1,72.0,7.136142,2.0,"""<United_States…",1.0
"""<Cutthroat_Isl…","""False""","""98000000""","""Action""","""1408""","""en""","""Cutthroat Isla…","""7.284477""","""Le Studio Cana…","""FR""","""1995""",119.0,"""English""","""Released""","""Cutthroat Isla…","""False""",5.7,137.0,7.000752,0.0,"""nan""",
"""<Casino_(1995_…","""False""","""52000000""","""Drama""","""524""","""en""","""Casino""","""10.137389""","""Universal Pict…","""FR""","""1995""",178.0,"""English""","""Released""","""Casino""","""False""",7.8,1343.0,8.064879,2.0,"""<United_States…",1.0
"""<Four_Rooms>""","""False""","""4000000""","""Crime""","""5""","""en""","""Four Rooms""","""9.026586""","""Miramax Films""","""US""","""1995""",98.0,"""English""","""Released""","""Four Rooms""","""False""",6.5,539.0,6.633468,2.0,"""<United_States…",1.0
"""<Money_Train>""","""False""","""60000000""","""Action""","""11517""","""en""","""Money Train""","""7.337906""","""Columbia Pictu…","""US""","""1995""",103.0,"""English""","""Released""","""Money Train""","""False""",5.4,224.0,7.549385,1.0,"""<United_States…",1.0


In [48]:
for col in new_df.columns:
    try:
        pl.from_pandas(new_df.drop(col, axis=1))
        print(f"{col} does not fail, dtype {new_df[col].dtype}")
    except pl.ComputeError:
        # print(f"{col} fails, dtype {feature_matrix[col].dtype}")
        pass

adult does not fail, dtype object
budget does not fail, dtype object
genres does not fail, dtype object
id does not fail, dtype object
original_language does not fail, dtype object
original_title does not fail, dtype object
popularity does not fail, dtype object
production_companies does not fail, dtype object
production_countries does not fail, dtype object
release_date does not fail, dtype object
runtime does not fail, dtype float64
spoken_languages does not fail, dtype object
status does not fail, dtype object
title does not fail, dtype object
video does not fail, dtype object
vote_average does not fail, dtype float64
vote_count does not fail, dtype float64
target does not fail, dtype float64
COUNT(yago_binary_isLocatedIn) does not fail, dtype float64
MODE(yago_binary_isLocatedIn.isLocatedIn) does not fail, dtype object
NUM_UNIQUE(yago_binary_isLocatedIn.isLocatedIn) does not fail, dtype float64
