In [1]:
cd ..

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
import featuretools as ft
from woodwork.logical_types import Categorical, Double
import pandas as pd
import polars as pl
import pickle

In [26]:
# Start from a fresh EntitySet; the source and candidate dataframes are
# registered on it by later cells.
es = ft.EntitySet()

In [27]:
# Load the pickled join candidates and keep the entry stored under "minhash".
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("generated_candidates_presidential-results-prepared.pickle", "rb") as fp:
    candidates = pickle.load(fp)["minhash"]

# Take a single candidate to experiment with. popitem() also removes the pair
# from `candidates` (presumably a dict keyed by hash -- confirm).
hash_, cand = candidates.popitem()

In [28]:
# Pull the metadata of both sides of the candidate join out of the
# candidate object's attribute dictionary.
source_md, candidate_md = (
    vars(cand)[key] for key in ("source_metadata", "candidate_metadata")
)

In [29]:
# Display the metadata of the source table.
source_md

{'full_path': '/storage/store/work/rcappuzz/ken_datasets/presidential-results/presidential-results-prepared.parquet',
 'hash': 'afe785ccde50f3f1473fea96a39911c7',
 'df_name': 'presidential-results-prepared',
 'source_dl': 'queries',
 'license': '',
 'path_metadata': '/storage/store/work/rcappuzz/metadata/queries/afe785ccde50f3f1473fea96a39911c7.json'}

In [30]:
# Display the metadata of the candidate table.
candidate_md

{'full_path': '/storage/store/work/rcappuzz/yago3-dl/wordnet_big/yagowordnet_wordnet_mountain/mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.parquet',
 'hash': '47047eac539c3ed1f2682160f5101961',
 'df_name': 'mountain_wasCreatedOnDate_isLocatedIn_hasLatitude',
 'source_dl': 'yago3-dl',
 'license': '',
 'path_metadata': '/storage/store/work/rcappuzz/metadata/wordnet_big/47047eac539c3ed1f2682160f5101961.json'}

In [31]:
def get_logical_types(df):
    """Build a column -> logical type mapping for ``df``.

    Columns selected by ``df.select_dtypes("number")`` are mapped to
    ``Double``; every other column is mapped to ``Categorical``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns should be typed.

    Returns
    -------
    dict
        Mapping from column name to logical type. Non-numeric columns come
        first (in table order), followed by the numeric columns.
    """
    numeric_columns = df.select_dtypes("number").columns

    logical_types = {}
    # Non-numeric columns are inserted first so that the key order matches
    # "categorical, then numeric".
    for name in df.columns:
        if name not in numeric_columns:
            logical_types[name] = Categorical
    for name in numeric_columns:
        logical_types[name] = Double
    return logical_types

In [44]:
# Read the source table from the path recorded in its metadata.
source_table = pd.read_parquet(source_md["full_path"])

# Deduplicate "col_to_embed" into a separate table so that `source_table`
# keeps all of its rows. Author's note: DFS fails if this column is not
# deduplicated, so this step is required.
dedup_table = source_table.drop_duplicates(subset="col_to_embed")
source_types = get_logical_types(dedup_table)

# Read the candidate table and expose its index as a regular "index" column.
candidate_table = pd.read_parquet(candidate_md["full_path"])
candidate_table = candidate_table.reset_index(names=["index"])
candidate_types = get_logical_types(candidate_table)


In [33]:
# Register the deduplicated source table, indexed by "col_to_embed".
es = es.add_dataframe(
    dataframe=dedup_table,
    dataframe_name=source_md["df_name"],
    logical_types=source_types,
    index="col_to_embed",
)

In [34]:
# Register the candidate table, indexed by its "index" column.
es = es.add_dataframe(
    dataframe=candidate_table,
    dataframe_name=candidate_md["df_name"],
    logical_types=candidate_types,
    index="index",
)

In [35]:
# Link source (parent) and candidate (child) through the first join key of
# each side; any additional keys in left_on/right_on are not used here.
es = es.add_relationship(
    parent_dataframe_name=source_md["df_name"],
    parent_column_name=cand.left_on[0],
    child_dataframe_name=candidate_md["df_name"],
    child_column_name=cand.right_on[0],
)

In [36]:
# Run Deep Feature Synthesis with the source table as the target dataframe.
feature_matrix, feature_defs = ft.dfs(
    target_dataframe_name=source_md["df_name"],
    entityset=es,
)


In [37]:
# Display the feature definitions returned by ft.dfs.
feature_defs

[<Feature: year>,
 <Feature: state>,
 <Feature: state_po>,
 <Feature: county_name>,
 <Feature: county_fips>,
 <Feature: office>,
 <Feature: candidate>,
 <Feature: party>,
 <Feature: totalvotes>,
 <Feature: version>,
 <Feature: mode>,
 <Feature: target>,
 <Feature: COUNT(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude)>,
 <Feature: MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.hasLatitude)>,
 <Feature: MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.subject)>,
 <Feature: MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.type)>,
 <Feature: MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.wasCreatedOnDate)>,
 <Feature: NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.hasLatitude)>,
 <Feature: NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.subject)>,
 <Feature: NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.type)>,
 <Feature: NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.wasCreatedOnDate)>]

In [38]:
# Columns of the deduplicated source table that are also present in the
# feature matrix, in source-table order.
cc = list(
    filter(lambda name: name in feature_matrix.columns, dedup_table.columns)
)

In [39]:
# Split the feature matrix columns into numeric and non-numeric groups.
num_cols = feature_matrix.select_dtypes(include="number").columns
cat_cols = feature_matrix.select_dtypes(exclude="number").columns

In [40]:
# Work on a copy so that `feature_matrix` itself is left unmodified.
df = feature_matrix.copy()

In [41]:
# Cast every non-numeric column to str and every numeric column to float,
# producing a new frame and leaving `df` untouched.
cat_cols = df.select_dtypes(exclude="number").columns
num_cols = df.select_dtypes(include="number").columns

target_dtypes = {name: str for name in cat_cols}
target_dtypes.update({name: float for name in num_cols})
new_df = df.astype(target_dtypes)



In [43]:
# Convert to polars; reset_index() turns the index into a regular column
# before the conversion.
new_table = pl.from_pandas(new_df.reset_index())

In [21]:
# Diagnostic: drop one column at a time and retry the polars conversion.
# A column is reported only when the conversion succeeds without it;
# conversions that still raise pl.ComputeError are skipped silently.
for col in new_df.columns:
    try:
        pl.from_pandas(new_df.drop(columns=col))
    except pl.ComputeError:
        continue
    print(f"{col} does not fail, dtype {new_df[col].dtype}")

year does not fail, dtype object
state does not fail, dtype object
state_po does not fail, dtype object
county_name does not fail, dtype object
county_fips does not fail, dtype object
office does not fail, dtype object
candidate does not fail, dtype object
party does not fail, dtype object
totalvotes does not fail, dtype object
version does not fail, dtype object
mode does not fail, dtype object
target does not fail, dtype float64
COUNT(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude) does not fail, dtype float64
MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.hasLatitude) does not fail, dtype object
MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.subject) does not fail, dtype object
MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.type) does not fail, dtype object
MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.wasCreatedOnDate) does not fail, dtype object
NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.hasLatitude) does not fail, dtype float6

In [65]:
# Keep only the columns of `new_df` that the source table does not already
# have, then left-join them back onto the full (non-deduplicated) source
# table via "col_to_embed", which reset_index() exposes as a column.
feat_columns = [col for col in new_df.columns if col not in source_table.columns]
augmented_table = pd.merge(
    left=source_table,
    right=new_df[feat_columns].reset_index(),
    on="col_to_embed",
    how="left",
)

In [66]:
# Display the source table augmented with the generated features.
augmented_table

Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,totalvotes,version,...,target,COUNT(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude),MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.hasLatitude),MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.subject),MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.type),MODE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.wasCreatedOnDate),NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.hasLatitude),NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.subject),NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.type),NUM_UNIQUE(mountain_wasCreatedOnDate_isLocatedIn_hasLatitude.wasCreatedOnDate)
0,2020,Alabama,AL,Autauga,01001,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,27770,20220315,...,3.875293,0.0,,,,,,,,
1,2020,Alabama,AL,Autauga,01001,US PRESIDENT,OTHER,OTHER,27770,20220315,...,2.633468,0.0,,,,,,,,
2,2020,Alabama,AL,Autauga,01001,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,27770,20220315,...,4.297520,0.0,,,,,,,,
3,2020,Alabama,AL,Baldwin,01003,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,109679,20220315,...,4.390564,0.0,,,,,,,,
4,2020,Alabama,AL,Baldwin,01003,US PRESIDENT,OTHER,OTHER,109679,20220315,...,3.192567,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22079,2020,Wyoming,WY,Washakie,56043,US PRESIDENT,DONALD J TRUMP,REPUBLICAN,4032,20220315,...,3.511349,0.0,,,,,,,,
22080,2020,Wyoming,WY,Weston,56045,US PRESIDENT,JOSEPH R BIDEN JR,DEMOCRAT,3560,20220315,...,2.557507,0.0,,,,,,,,
22081,2020,Wyoming,WY,Weston,56045,US PRESIDENT,JO JORGENSEN,LIBERTARIAN,3560,20220315,...,1.672098,0.0,,,,,,,,
22082,2020,Wyoming,WY,Weston,56045,US PRESIDENT,OTHER,OTHER,3560,20220315,...,1.681241,0.0,,,,,,,,
