In [1]:
cd ..

/home/soda/rcappuzz/work/benchmark-join-suggestions


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import polars as pl
from pathlib import Path
from src.utils.indexing import load_query_result
import polars.selectors as cs
import numpy as np

In [3]:
def get_values(list_values):
    s = "    ".join(list_values) + "    "
    return [s]

In [4]:
def prepare_table(table_path):
    table = pl.read_parquet(table_path)
    table_name = table_path.stem
    res = []
    sep = " "*4
    for col in table.select(cs.string()).columns:
        values = sep.join([_ for _  in table[col].to_list() if _ is not None])
        values += " "*4
        key = f"{table_name}__{col}"
        res.append((key, values))
    return res

### Read the base table

In [5]:
base_path = Path("data/source_tables/")
tab_name = "us-accidents-yadl.parquet"
tab_path = Path(base_path, tab_name)
df = pl.read_parquet(tab_path)

### Create the `CountVectorizer`

`token_pattern=r"(?u)(<[\S]+>)[ ]{4}"` is needed to avoid issues with spaces within the 
cell values. `binary` can be set to `True` because for Jaccard Containment the only
thing that matters is having non-zero values (rather than the count).

In [6]:
cv = CountVectorizer(token_pattern=r"(?u)(<[\S]+>)[ ]{4}", binary=True)

In [7]:
values = get_values(df["col_to_embed"].to_list())
cv.fit(values)

In [8]:
len(cv.vocabulary_)

5222

Testing the overlap with a random candidate table in the YADL data lake.

I am taking a column in the table which I know can has some overlap with `col_to_embed` in the current table. Then, I transform the column. 

In [9]:
df_cand = pl.read_parquet("/home/soda/rcappuzz/work/benchmark-join-suggestions/data/wordnet_big/yagowordnet_wordnet_radio_station/radio_station_isLocatedIn_hasWebsite.parquet")
cand_values = get_values(df_cand["isLocatedIn"].to_list())

In [10]:
cv = CountVectorizer(token_pattern=r"(?u)(<[\S]+>)[ ]{4}", binary=True)
values = get_values(df["col_to_embed"].to_list())
cv.fit(values)
X = cv.transform(cand_values)

The sum of non-zero values is equivalent to the overlap between the tokens and 
the values in the column.

In [11]:
(X>0).sum()/X.shape[1]

0.375526618153964

To double check, this is the Jaccard Containment, measured explicitly.

In [12]:
left_on="col_to_embed"
right_on="isLocatedIn"
unique_source = df[left_on].unique()
unique_cand = df_cand[right_on].unique()

s1 = set(unique_source.to_list())
s2 = set(unique_cand.to_list())
print(len(s1.intersection(s2)) / len(s1))


0.375526618153964


## Testing transforming all tables

Here I am testing how to transform all tables (and their columns). 

In [13]:
rr = []
for pth in base_path.glob("*.parquet"):
    rr += prepare_table(pth)

In [14]:
X = cv.transform([_[1] for _ in rr])

## Sorting the results for querying

In [15]:
sum_res = X.sum(axis=1).ravel()
s_index = sum_res.argsort()
keys = np.array([_[0] for _ in rr])
np.flip(keys[s_index])

array([['us-accidents-yadl-clf__col_to_embed',
        'us-accidents-depleted__col_to_embed',
        'us-accidents-depleted-yadl__col_to_embed',
        'us-accidents-yadl__col_to_embed',
        'housing-open_data__INTERESTED',
        'housing-open_data-clf__PROPERTY TYPE',
        'housing-open_data-clf__SOLD DATE',
        'housing-open_data-clf__SALE TYPE',
        'company-employees-yadl-clf__col_to_embed',
        'company-employees-yadl-clf__linkedin url',
        'company-employees-yadl-clf__country',
        'company-employees-yadl-clf__locality',
        'housing-open_data-clf__CITY',
        'company-employees-yadl-clf__size range',
        'company-employees-yadl-clf__industry',
        'company-employees-yadl-clf__domain',
        'company-employees-yadl-clf__name',
        'us-presidential-results-yadl__col_to_embed',
        'us-presidential-results-yadl__party',
        'housing-open_data-clf__ADDRESS',
        'housing-open_data-clf__STATE OR PROVINCE',
        'us-p