In this notebook we measure the exact overlap between a base table and
the candidate tables. The goal is to check whether MinHash is a good proxy for this 
metric on "small" data lakes. 

Note that this approach does not scale (that's what MinHash and Lazo are for, 
after all!). 

However, our use case should be small enough for this to not be a problem. 

In [62]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
cd ..

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [3]:
import polars as pl
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
import json
import src.methods.profiling as jp
import numpy as np
import sklearn.metrics as metrics

In [4]:
# Metadata for the selected case is read from <metadata root>/<case name>.
mdata_root_dir = Path("data/metadata/")
yadl_case = "binary"
mdata_path = mdata_root_dir / yadl_case

# Base table: its "col_to_embed" column is the query column used in the cells below.
base_table_path = "data/source_tables/movies-yadl-ax.parquet"
df_base = pl.read_parquet(base_table_path)



In [5]:
from src.data_structures.indices import MinHashIndex
import pickle
# Read the pickled index dictionary first, then hand it to a fresh MinHashIndex.
# NOTE(review): pickle.load can execute arbitrary code - only load index files
# that were produced by this project.
with open("data/metadata/_indices/binary/minhash_index.pickle", "rb") as index_file:
    input_dict = pickle.load(index_file)

index = MinHashIndex()
index.load_index(index_dict=input_dict)


In [6]:
# Exact overlap between the query column and every column of every candidate table.
# `overlap_dict` maps (candidate table hash, candidate column name) -> measured value.
overlap_dict = {}
for mdata_file in mdata_path.glob("*.json"):
    with open(mdata_file) as handle:
        mdata = json.load(handle)
    cnd_path, cnd_hash = mdata["full_path"], mdata["hash"]
    df_cnd = pl.read_parquet(cnd_path)
    for col in df_cnd.columns:
        overlap_dict[(cnd_hash, col)] = jp.measure_containment(
            df_base, df_cnd, left_on=["col_to_embed"], right_on=[col]
        )

In [7]:
# Build the (hash, col, overlap) dataframe straight from the dictionary.
# Keys are (hash, col) tuples, so they are split into two plain columns here
# instead of going through a list column and `list.to_struct()`, which has to
# infer the number of struct fields from the data.
df_overlap = pl.from_dict(
    {
        "hash": [key_hash for key_hash, _ in overlap_dict],
        "col": [key_col for _, key_col in overlap_dict],
        "overlap": list(overlap_dict.values()),
    }
)


In [8]:
# Ground-truth label: `mask_true` is 1 when the overlap is at least `threshold`, else 0
threshold = 0.1
is_true_match = pl.col("overlap") >= threshold
df_true = df_overlap.with_columns(
    pl.when(is_true_match).then(1).otherwise(0).alias("mask_true")
)

In [9]:
# %%timeit
# overlap_list = []
# for f in mdata_path.glob("*.json"):
#     with open(f) as fp:
#         mdata = json.load(fp)
#         cnd_path = mdata["full_path"]
#         cnd_hash = mdata["hash"]
#         df_cnd = pl.read_parquet(cnd_path)
#         for col in df_cnd.columns:
#             pair = (cnd_hash, col)
#             cont = jp.measure_containment_join(df_base, df_cnd, left_on=["col_to_embed"], right_on=[col])
#             overlap_list.append((pair, cont))

In [10]:
# Ask the MinHash index for candidates, using the values of the query column
query_values = df_base["col_to_embed"].to_list()
query_result = index.query_index(query_values)

In [11]:
# Turn the query result rows into a dataframe with the same key columns as `df_true`.
# The first three entries of each row are read as (hash, col, score).
pred_fields = ("hash", "col", "score")
ll = [[entry[pos] for entry in query_result] for pos in range(len(pred_fields))]
df_pred = pl.from_dict(dict(zip(pred_fields, ll)))

In [12]:
# Left-join the MinHash candidates onto the ground truth so that every row of
# `df_true` is kept; rows the index did not return have a null `score`.
# `mask_pred` is 1 when a score is present, else 0.
was_returned = pl.col("score").is_not_null()
combined = df_true.join(df_pred, on=["hash", "col"], how="left").with_columns(
    pl.when(was_returned).then(1).otherwise(0).alias("mask_pred")
)

### Measuring the F1 Score
In this scenario:
- `TP`: Candidate columns with `overlap` >= `threshold` that MinHash returns
- `FP`: Candidate columns with `overlap` < `threshold` that MinHash nevertheless returns
- `FN`: Candidate columns with `overlap` >= `threshold` that MinHash misses
- `TN`: Candidate columns with `overlap` < `threshold` that MinHash does not return

In [13]:
# F1 score of the MinHash predictions (`mask_pred`) against the exact-overlap labels (`mask_true`)
c_df = combined.select(["mask_true", "mask_pred"]).to_pandas()
metrics.f1_score(y_true=c_df["mask_true"], y_pred=c_df["mask_pred"])

0.9333333333333333

# Working with joblib

In [14]:
def evaluate_one_table(fpath, df_base):
    """Measure the containment of the base column in every column of one candidate table.

    Parameters
    ----------
    fpath : path-like
        JSON metadata file of the candidate table. It must provide the keys
        ``"full_path"`` (parquet file to read) and ``"hash"``.
    df_base : pl.DataFrame
        Base table; its ``"col_to_embed"`` column is passed as the left side.

    Returns
    -------
    dict
        Maps ``(hash, column_name)`` to the value returned by
        ``jp.measure_containment`` for that candidate column.
    """
    # Only the JSON parsing needs the file handle: release it before the
    # parquet read and the per-column containment computations.
    with open(fpath) as fp:
        mdata = json.load(fp)

    cnd_path = mdata["full_path"]
    cnd_hash = mdata["hash"]
    df_cnd = pl.read_parquet(cnd_path)
    return {
        (cnd_hash, col): jp.measure_containment(
            df_base, df_cnd, left_on=["col_to_embed"], right_on=[col]
        )
        for col in df_cnd.columns
    }

In [15]:
from joblib import Parallel, delayed
from tqdm import tqdm


In [16]:
yadl_case = "wordnet"
mdata_path = Path(mdata_root_dir, yadl_case)

# List the metadata files once: the same list feeds the workers and lets tqdm
# derive its total, instead of globbing the directory twice.
mdata_files = list(mdata_path.glob("*.json"))

# Measure the overlap for every candidate table in parallel with joblib.
# `r` holds one {(hash, col): overlap} dictionary per metadata file.
r = Parallel(n_jobs=8, verbose=0)(
    delayed(evaluate_one_table)(fpath, df_base)
    for fpath in tqdm(mdata_files, position=0, leave=False)
)


                                                    

In [17]:
# Merge the per-table dictionaries returned by the parallel run into one mapping
# of (hash, col) -> overlap.
overlap_dict = {key: val for result in r for key, val in result.items()}

# Build the (hash, col, overlap) dataframe straight from the dictionary: the
# (hash, col) keys are split into two plain columns instead of going through a
# list column and `list.to_struct()`, which has to infer the number of struct
# fields from the data.
df_overlap = pl.from_dict(
    {
        "hash": [key_hash for key_hash, _ in overlap_dict],
        "col": [key_col for _, key_col in overlap_dict],
        "overlap": list(overlap_dict.values()),
    }
)


In [18]:
# Ground-truth label: `mask_true` is 1 when the overlap is at least `threshold`, else 0
threshold = 0.1
is_true_match = pl.col("overlap") >= threshold
df_true = df_overlap.with_columns(
    pl.when(is_true_match).then(1).otherwise(0).alias("mask_true")
)

In [19]:
# Load the pickled MinHash index stored under `_indices/wordnet_big` and query it
# with the values of the base table's query column.
# NOTE(review): the exact overlaps above are computed from `data/metadata/wordnet`
# (yadl_case = "wordnet"), while this index comes from `wordnet_big` - confirm
# that both describe the same set of candidate tables.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted index files.
with open("data/metadata/_indices/wordnet_big/minhash_index.pickle", "rb") as index_file:
    input_dict = pickle.load(index_file)

index = MinHashIndex()
index.load_index(index_dict=input_dict)

query_values = df_base["col_to_embed"].to_list()
query_result = index.query_index(query_values)

In [20]:
# Turn the query result rows into a dataframe with the same key columns as `df_true`.
# The first three entries of each row are read as (hash, col, score).
pred_fields = ("hash", "col", "score")
ll = [[entry[pos] for entry in query_result] for pos in range(len(pred_fields))]
df_pred = pl.from_dict(dict(zip(pred_fields, ll)))

In [21]:
# Left-join the MinHash candidates onto the ground truth so that every row of
# `df_true` is kept; rows the index did not return have a null `score`.
# `mask_pred` is 1 when a score is present, else 0.
was_returned = pl.col("score").is_not_null()
combined = df_true.join(df_pred, on=["hash", "col"], how="left").with_columns(
    pl.when(was_returned).then(1).otherwise(0).alias("mask_pred")
)

In [22]:
# Keep only the two label columns and convert them to pandas for scikit-learn
c_df = combined.select(["mask_true", "mask_pred"]).to_pandas()

# F1 score of the MinHash predictions against the exact-overlap labels
f1 = metrics.f1_score(y_true=c_df["mask_true"], y_pred=c_df["mask_pred"])
print(f"F1 score: {f1:.3f}")

F1 score: 0.588


In [23]:
# Show every row that is positive according to the ground truth or to MinHash
labelled_positive = pl.col("mask_true") == 1
predicted_positive = pl.col("mask_pred") == 1
combined.filter(labelled_positive | predicted_positive)

hash,col,overlap,mask_true,score,mask_pred
str,str,f64,i32,i64,i32
"""4ac9c8c52c4e62…","""wroteMusicFor""",0.63126,1,20.0,1
"""31dd169ccb1ae2…","""created""",0.011597,0,20.0,1
"""31dd169ccb1ae2…","""wroteMusicFor""",0.062994,0,10.0,1
"""435dd1be6f5a53…","""wroteMusicFor""",0.527412,1,20.0,1
"""9acceb4c65180a…","""created""",0.089879,0,20.0,1
"""5ba095067e3b32…","""subject""",0.742488,1,20.0,1
"""3575088b33b2f1…","""created""",0.030047,0,10.0,1
"""9d60169c13316d…","""created""",0.089352,0,20.0,1
"""63f0aeab836798…","""subject""",0.964681,1,20.0,1
"""c65197005e108f…","""created""",0.020559,0,20.0,1


In [28]:
# Confusion matrix of the MinHash predictions against the exact-overlap labels.
# `labels=[0, 1]` forces a 2x2 matrix even when one of the classes is absent,
# so the four-way unpacking below cannot fail with a ValueError.
conf_m = metrics.confusion_matrix(
    c_df["mask_true"], c_df["mask_pred"], labels=[0, 1]
)
# With labels ordered [0, 1], the flattened matrix reads TN, FP, FN, TP.
tn, fp, fn, tp = conf_m.ravel()
print(f"{'True Negative:':<20}{tn:>6}")
print(f"{'False Positive:':<20}{fp:>6}")
print(f"{'False Negative:':<20}{fn:>6}")
print(f"{'True Positive:':<20}{tp:>6}")
recall = metrics.recall_score(c_df["mask_true"], c_df["mask_pred"])
print(f"{'Recall:':<20}{recall:.3f}")
precision = metrics.precision_score(c_df["mask_true"], c_df["mask_pred"])
print(f"{'Precision:':<20}{precision:.3f}")


True Negative:       19352
False Positive:         13
False Negative:          1
True Positive:          10
Recall:             0.909
Precision:          0.435


In [26]:
# Regression view of the same comparison: exact overlap as `y_true`, and the
# MinHash score divided by 100 as `y_pred`. Nulls on either side become 0.
y_true_expr = pl.col("overlap").fill_null(0).alias("y_true")
y_pred_expr = (pl.col("score").fill_null(0) / 100).alias("y_pred")
cc = combined.select(y_true_expr, y_pred_expr).to_pandas()

In [27]:
# R^2 between the exact overlap and the rescaled MinHash score
metrics.r2_score(y_true=cc["y_true"], y_pred=cc["y_pred"])

0.45212755190656906