# Introduction

This IPython notebook illustrates how to select the best learning based matcher. First, we need to import py_entitymatching package and other libraries as follows:

# Preprocessing from rSupCon directory

In [25]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
import copy


# Set the seed value 
seed = 0


def init_dataset_from_rsupcon(metadata_parquet_file, dataset_file, split='TRAIN', shs_side=False):
    
    def clean_string(text):
        import re
        pattern = re.compile('[^a-zA-Z0-9\s]')
        cleaned_text = re.sub(pattern, '', text)
        return cleaned_text

    """
    A = pd.read_parquet(path + table_filename).rename({"title": "video_title"}, axis=1).drop_duplicates(
        subset=["yt_id"]
    ).fillna('')
    A.description = A.description.apply(clean_string)
    B = copy.copy(A)
    """
    df_dataset = pd.read_csv(dataset_file)
    df_metadata = pd.read_parquet(metadata_parquet_file).query(f"split == '{split}'")
    
    if shs_side:
        A = df_metadata[["yt_id", "title", "performer"]].drop_duplicates(
            subset=["yt_id"]).dropna(subset=["yt_id"]).rename({"title": "video_title", 
                                                               "performer": "channel_name"}, axis=1)
        A["description"] = 'DESCRIPTION'
        A["keywords"] = 'KEYWORDS'
    else:
        A = df_metadata[["yt_id", "video_title", "channel_name", "description", "keywords"]].drop_duplicates(
            subset=["yt_id"]).dropna(subset=["yt_id"])
    
    B = df_metadata[["yt_id", "video_title", "channel_name", "description", "keywords"]].drop_duplicates(
        subset=["yt_id"]).dropna(subset=["yt_id"])
    B.description = B.description.apply(clean_string)
    
    A = A.fillna('')
    B = B.fillna('')

    def enrich(pairs):
        data = pd.merge(
            A.add_prefix("ltable_").rename({"ltable_yt_id": "ltable_id"}, axis=1),
            pd.merge(
                pairs[["ltable_id", "rtable_id", "label"]],
                B.add_prefix("rtable_").rename({"rtable_yt_id": "rtable_id"}, axis=1),
                on="rtable_id",
                how="left"
                ), 
            on="ltable_id",
            how="right"
        )
        # id formatting
        data["_id"] = range(0, len(data))
        
        # col reordering
        data = data[['_id', 'ltable_id', 'rtable_id', 'ltable_video_title', 
                        'ltable_channel_name', 'ltable_description', 'ltable_keywords', 
                        'rtable_video_title', 'rtable_channel_name', 'rtable_description', 
                        'rtable_keywords', 'label']]
        return data
        
    S = enrich(df_dataset)

    datasets_dir = em.get_install_path() + os.sep + 'datasets'

    path_A = datasets_dir + os.sep + 'shs100k2_yt_A.csv'
    path_B = datasets_dir + os.sep + 'shs100k2_yt_B.csv'
    path_labeled_data = datasets_dir + os.sep + 'shs100k2_yt_labeled.csv'

    os.remove(path_A) if os.path.exists(path_A) else None
    os.remove(path_B) if os.path.exists(path_B) else None
    os.remove(path_labeled_data) if os.path.exists(path_labeled_data) else None

    em.to_csv_metadata(A, path_A)
    em.to_csv_metadata(B, path_B)
    em.to_csv_metadata(S, path_labeled_data)

    # reading in
    A = em.read_csv_metadata(path_A, key='yt_id')
    B = em.read_csv_metadata(path_B, key='yt_id')

    S = em.read_csv_metadata(path_labeled_data, key='_id',
                            ltable=A, rtable=B, 
                            fk_ltable='ltable_id', fk_rtable='rtable_id')

    # remove the files
    os.remove(path_A) if os.path.exists(path_A) else None
    os.remove(path_B) if os.path.exists(path_B) else None
    os.remove(path_labeled_data) if os.path.exists(path_labeled_data) else None
    
    return A, B, S


metadata_parquet_file="/data/csi_datasets/shs100k2_yt.parquet"
dataset_file="/home/repos/contrastive-product-matching/data/raw/shs100k2_yt/train.csv"


A, B, S = init_dataset_from_rsupcon(metadata_parquet_file, dataset_file)


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [6]:
# SIMON: blocking test
ob = em.OverlapBlocker()
#ob.block_tables(A, B, 'video_title', 'video_title', word_level=True, overlap_size=1, 
#                    l_output_attrs=['video_title', 'channel_name', 'description'], 
#                    r_output_attrs=['video_title', 'channel_name', 'description'],
#                    show_progress=False)


In [26]:

# Split S into I an J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']


# Selecting the Best learning-based matcher 

This, typically involves the following steps:
1. Creating a set of learning-based matchers
2. Creating features
3. Extracting feature vectors
4. Selecting the best learning-based matcher using k-fold cross validation
5. Debugging the matcher (and possibly repeat the above steps)

## Creating a set of learning-based matchers

First, we need to create a set of learning-based matchers. The following matchers are supported in Magellan: (1) decision tree, (2) random forest, (3) naive bayes, (4) svm, (5) logistic regression, and (6) linear regression.

In [27]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')


## Creating features

Next, we need to create a set of features for the development set. Magellan provides a way to automatically generate features based on the attributes in the input tables. For the purposes of this guide, we use the automatically generated features.

In [28]:
# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)


We observe that there were 20 features generated. As a first step, lets say that we decide to use only 'year' related features.

In [29]:
F.feature_name

0           video_title_video_title_jac_qgm_3_qgm_3
1       video_title_video_title_cos_dlm_dc0_dlm_dc0
2                       video_title_video_title_mel
3                  video_title_video_title_lev_dist
4                   video_title_video_title_lev_sim
5         channel_name_channel_name_jac_qgm_3_qgm_3
6     channel_name_channel_name_cos_dlm_dc0_dlm_dc0
7     channel_name_channel_name_jac_dlm_dc0_dlm_dc0
8                     channel_name_channel_name_mel
9                channel_name_channel_name_lev_dist
10                channel_name_channel_name_lev_sim
11                    channel_name_channel_name_nmw
12                     channel_name_channel_name_sw
13          description_description_jac_qgm_3_qgm_3
14      description_description_cos_dlm_dc0_dlm_dc0
15                keywords_keywords_jac_qgm_3_qgm_3
16            keywords_keywords_cos_dlm_dc0_dlm_dc0
Name: feature_name, dtype: object

## Extracting feature vectors

In this step, we extract feature vectors using the development set and the created features.

In [30]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)


In [52]:
A

Unnamed: 0,yt_id,video_title,channel_name,description,keywords
0,E3lw8gaHs04,"George Gershwin ""Summertime"" Helen Jepson '36 ""Porgy and Bess"" Remastered",jonthesYT,"Helen Jepson, Metropolitan Opera star, singing Gershwin's ""Summertime"" from ""Porgy and Bess"" in ...",['George Gershwin' 'Summertime (song)' 'Porgy And Bess']
1,aPpJPTJc1lU,Billie Holiday - Summertime,OnlyJazzHQ,Performer(s): « Billie Holiday » & « Billie Holiday & Her Orchestra »\n« Summertime »\nAudio : V...,['Jazz' 'Mp3' 'Mp4' 'HQ' 'Haute Qualité' 'High Quality' 'HD' 'Vidéo'\n 'Clip' 'Musical' 'Musica'...
2,R44waInkjgI,"Anne Brown ""Summertime"" from Original Porgy and Bess (1940)",bsgs98,"""Introduction"" and ""Summertime""\r\nFrom the Opera, ""Porgy and Bess""\r\nMusic by George Gershwin\...",['Broadway' 'folk' 'Original cast' 'Porgy and Bess' 'opera' 'sound'\n 'track' 'Summertime' 'Anne...
3,NPRSkMIqgQA,Saunders King Rhythm - Summertime (Rhythm Recordings 2-A),boogaludo,"backed with ""Swinging Door Groove""\n\nrecorded in San Francisco, June 1942\n\nCarlos Santana mer...",['vinyl' '78 rpm' 'shellac' 'Schellack' 'Jazz' 'Deborah Sara King'\n 'Carlos Santana']
4,piVtpRkhJTA,The Ravens-Summertime,oldiesbutgoodies4you,"This song has been covered many many times by artists such as Sam Cooke,Marv Johnson,Ella Fitzge...",['jazz' 'r&b' 'pop' 'The' 'Ravens' 'Summertime' 'Sam' 'Cooke' 'Porgy'\n 'and' 'Bess' 'Jimmy' 'Ri...
...,...,...,...,...,...
71654,2S3cauISpNw,The Sachal Ensemble - Give Me Love (Give Me Peace On Earth) (Audio) ft. Seu Jorge,SachalEnsembleVEVO,Music video by The Sachal Ensemble performing Give Me Love (Give Me Peace On Earth). (C) 2016 Un...,['The' 'Sachal' 'Ensemble' 'Give' 'Me' 'Love' '(Give' 'Peace' 'On'\n 'Earth)' 'Universal' 'Music...
71655,Q6ix7lWPDAQ,Gary Stewart - Ain't Living Long Like This,steve fizzle,Gary Stewart - Ain't Living Long Like This,"['video' 'Gary Stewart (Musical Artist)'\n ""Ain't Living Long Like This (Musical Album)""\n ""Gary..."
71656,0JP_ZipG5Z0,Rodney Crowell - I ain't living long like this,baalhabeit,"the original version,\n\nout of Rodney's Crowell CD ""Ain't Living Long Like This""","['rodney' 'crowell' 'emmylou' 'harris' 'waylon' 'jennings' 'country'\n ""ain't"" ""livin'"" 'long' '..."
71657,j7MC3ckUaDA,Dream Syndicate - Ain't living Long Like This,Jay Dog,,"['dream syndicate' ""ain't living long like this""\n 'aint living long like this' 'emmylou harris'..."


In [31]:
# Display first few rows
H.head()


Unnamed: 0,_id,ltable_id,rtable_id,video_title_video_title_jac_qgm_3_qgm_3,video_title_video_title_cos_dlm_dc0_dlm_dc0,video_title_video_title_mel,video_title_video_title_lev_dist,video_title_video_title_lev_sim,channel_name_channel_name_jac_qgm_3_qgm_3,channel_name_channel_name_cos_dlm_dc0_dlm_dc0,...,channel_name_channel_name_mel,channel_name_channel_name_lev_dist,channel_name_channel_name_lev_sim,channel_name_channel_name_nmw,channel_name_channel_name_sw,description_description_jac_qgm_3_qgm_3,description_description_cos_dlm_dc0_dlm_dc0,keywords_keywords_jac_qgm_3_qgm_3,keywords_keywords_cos_dlm_dc0_dlm_dc0,label
333,333,a527ZOSJ0rY,dGLs1chXTWM,0.027027,0.0,0.486724,29,0.236842,0.0,0.0,...,0.0,12.0,0.0,-5.0,0.0,0.029289,0.0,0.09434,0.0,0
6392,6392,M6e-RlMkRx8,nAZqJg-XnU8,0.012987,0.142857,0.559921,34,0.15,0.0,0.0,...,0.374126,21.0,0.045455,-8.0,1.0,0.006557,0.0,0.048387,0.0,0
4786,4786,lfbwBdaSanw,IGjgocfuU_o,0.035294,0.129099,0.539805,53,0.171875,0.0,0.0,...,0.451282,12.0,0.076923,-2.0,1.0,0.024194,0.099015,0.053097,0.0,0
357,357,nt4emPn6du4,drPGfa2-3XE,0.0,0.0,0.511724,46,0.178571,0.0,0.0,...,0.513919,11.0,0.214286,2.0,3.0,0.061091,0.017991,0.080831,0.0,0
9663,9663,EYJKKoZc9PA,xf1N4dUPw6A,0.0,0.0,0.367725,44,0.102041,0.0,0.0,...,0.357143,7.0,0.0,0.0,1.0,,,0.064103,0.0,0


In [32]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
any(pd.notnull(H))

True

We observe that the extracted feature vectors contain missing values. We have to impute the missing values for the learning-based matchers to fit the model correctly. For the purposes of this guide, we impute the missing value in a column with the mean of the values in that column. 

In [33]:
# Impute feature vectors with the mean of the column values.
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')


  imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans


## Selecting the best matcher using cross-validation

Now, we select the best matcher using k-fold cross-validation. For the purposes of this guide, we use five fold cross validation and use 'precision' metric to select the best matcher.

In [34]:
# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']


Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.720015,0.790833,0.753427
1,RF,0.954104,0.782202,0.859522
2,SVM,0.8,0.016272,0.031836
3,LinReg,0.997826,0.612561,0.758504
4,LogReg,,,


In [35]:
result['drill_down_cv_stats']['precision']


Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f13143d2b50>,5,0.688623,0.684848,0.757396,0.720779,0.748428,0.720015
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f13143d2c90>,5,0.973451,0.918033,0.976744,0.95614,0.946154,0.954104
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f13143d2990>,5,1.0,1.0,1.0,1.0,0.0,0.8
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f13143d2250>,5,1.0,0.98913,1.0,1.0,1.0,0.997826
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f13143d21d0>,5,,,,,,


In [36]:
result['drill_down_cv_stats']['recall']


Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f13143d2b50>,5,0.798611,0.768707,0.831169,0.792857,0.762821,0.790833
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f13143d2c90>,5,0.763889,0.761905,0.818182,0.778571,0.788462,0.782202
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f13143d2990>,5,0.013889,0.027211,0.025974,0.014286,0.0,0.016272
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f13143d2250>,5,0.597222,0.619048,0.558442,0.621429,0.666667,0.612561
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f13143d21d0>,5,,,,,,


In [37]:
result['drill_down_cv_stats']['f1']


Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f13143d2b50>,5,0.73955,0.724359,0.79257,0.755102,0.755556,0.753427
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f13143d2c90>,5,0.856031,0.832714,0.890459,0.858268,0.86014,0.859522
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f13143d2990>,5,0.027397,0.05298,0.050633,0.028169,0.0,0.031836
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f13143d2250>,5,0.747826,0.761506,0.716667,0.76652,0.8,0.758504
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f13143d21d0>,5,,,,,,


### Debug X (Random Forest)

In [38]:
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']


In [20]:
# Debug RF matcher using GUI
# FIXME: ALWAYS CRASHES!
#em.vis_debug_rf(rf, P, Q, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], target_attr='label')


In [39]:
# Add a feature to do Jaccard on title + authors and add it to F

# Create a feature declaratively
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['video_title'] + ' ' + ltuple['description']).lower()), 
                            wspace((rtuple['video_title'] + ' ' + rtuple['description']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)

# Add feature to F
em.add_feature(F, 'jac_ws_video_title_description', feature)

True

In [40]:
# Convert I into feature vectors using updated F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

TypeError: can only concatenate str (not "float") to str

In [41]:
# Check whether the updated F improves X (Random Forest)
result = em.select_matcher([rf], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f13143d2c90>,5,0.856031,0.832714,0.890459,0.858268,0.86014,0.859522


In [42]:
# Select the best matcher again using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.720015,0.790833,0.753427
1,RF,0.954104,0.782202,0.859522
2,SVM,0.8,0.016272,0.031836
3,LinReg,0.997826,0.612561,0.758504
4,LogReg,,,


In [43]:
result['drill_down_cv_stats']['f1']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f13143d2b50>,5,0.73955,0.724359,0.79257,0.755102,0.755556,0.753427
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f13143d2c90>,5,0.856031,0.832714,0.890459,0.858268,0.86014,0.859522
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f13143d2990>,5,0.027397,0.05298,0.050633,0.028169,0.0,0.031836
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f13143d2250>,5,0.747826,0.761506,0.716667,0.76652,0.8,0.758504
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f13143d21d0>,5,,,,,,


In [None]:
def load_csi_dataset(path="/data/csi_datasets/shs100k2_test.csv", yt_metadata_path="/data/yt_metadata.parquet"):
    df_test = pd.read_csv(path, sep=";")
    df_metadata = pd.read_parquet(yt_metadata_path).reset_index()
    return df_metadata.loc[df_metadata.yt_id.isin(df_test.yt_id), 
                           ["yt_id", "title", "channel_name", "description"]].rename(
        {"title": "video_title"}, axis=1)

load_csi_dataset()


: 

# Test on test datasets

In [55]:

def init_dataset_from_data(metadata_parquet_file, dataset_file, split='TEST', shs_side=False):
    
    def clean_string(text):
        import re
        pattern = re.compile('[^a-zA-Z0-9\s]')
        cleaned_text = re.sub(pattern, '', text)
        return cleaned_text

    """
    A = pd.read_parquet(path + table_filename).rename({"title": "video_title"}, axis=1).drop_duplicates(
        subset=["yt_id"]
    ).fillna('')
    A.description = A.description.apply(clean_string)
    B = copy.copy(A)
    """
    def get_dataset_df(path):
        data = pd.read_csv(path, sep=";")[["yt_id", "set_id"]]
        data["join_key"] = 1
        data = data.merge(data, on="join_key").drop("join_key", axis=1).rename(
            {"yt_id_x": "ltable_id", "yt_id_y": "rtable_id", "set_id_x": "set_id_a", 
            "set_id_y": "set_id_b"}, axis=1
        )
        data["label"] = (data.set_id_a == data.set_id_b).astype(int)
        return data

    df_dataset = get_dataset_df(dataset_file)
    df_metadata = pd.read_parquet(metadata_parquet_file).query(f"split == '{split}'")
    
    if shs_side:
        A = df_metadata[["yt_id", "title", "performer"]].drop_duplicates(
            subset=["yt_id"]).dropna(subset=["yt_id"]).rename({"title": "video_title", 
                                                               "performer": "channel_name"}, axis=1)
        A["description"] = 'DESCRIPTION'
        A["keywords"] = 'KEYWORDS'
    else:
        A = df_metadata[["yt_id", "video_title", "channel_name", "description", "keywords"]].drop_duplicates(
            subset=["yt_id"]).dropna(subset=["yt_id"])
    
    B = df_metadata[["yt_id", "video_title", "channel_name", "description", "keywords"]].drop_duplicates(
        subset=["yt_id"]).dropna(subset=["yt_id"])
    B.description = B.description.apply(clean_string)

    def enrich(pairs):
        data = pd.merge(
            A.add_prefix("ltable_").rename({"ltable_yt_id": "ltable_id"}, axis=1),
            pd.merge(
                pairs[["ltable_id", "rtable_id", "label"]],
                B.add_prefix("rtable_").rename({"rtable_yt_id": "rtable_id"}, axis=1),
                on="rtable_id",
                how="left"
                ), 
            on="ltable_id",
            how="right"
        )
        # id formatting
        data["_id"] = range(0, len(data))
        
        # col reordering
        data = data[['_id', 'ltable_id', 'rtable_id', 'ltable_video_title', 
                        'ltable_channel_name', 'ltable_description', 'ltable_keywords', 
                        'rtable_video_title', 'rtable_channel_name', 'rtable_description', 
                        'rtable_keywords', 'label']]
        return data
        
    S = enrich(df_dataset)

    datasets_dir = em.get_install_path() + os.sep + 'datasets'

    path_A = datasets_dir + os.sep + 'shs100k2test_yt_A.csv'
    path_B = datasets_dir + os.sep + 'shs100k2test_yt_B.csv'
    path_labeled_data = datasets_dir + os.sep + 'shs100k2test_yt_labeled.csv'

    os.remove(path_A) if os.path.exists(path_A) else None
    os.remove(path_B) if os.path.exists(path_B) else None
    os.remove(path_labeled_data) if os.path.exists(path_labeled_data) else None

    em.to_csv_metadata(A, path_A)
    em.to_csv_metadata(B, path_B)
    em.to_csv_metadata(S, path_labeled_data)

    # reading in
    A = em.read_csv_metadata(path_A, key='yt_id')
    B = em.read_csv_metadata(path_B, key='yt_id')

    S = em.read_csv_metadata(path_labeled_data, key='_id',
                            ltable=A, rtable=B, 
                            fk_ltable='ltable_id', fk_rtable='rtable_id')

    # remove the files
    os.remove(path_A) if os.path.exists(path_A) else None
    os.remove(path_B) if os.path.exists(path_B) else None
    os.remove(path_labeled_data) if os.path.exists(path_labeled_data) else None
    
    return A, B, S


dataset_file = "/data/csi_datasets/shs100k2_test.csv"

A, B, S = init_dataset_from_data(metadata_parquet_file, dataset_file)


In [None]:
# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)


#  Split S into I an J
IJ = em.split_train_test(S, train_proportion=0.0, random_state=0)
I = IJ['train']
J = IJ['test']


# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

For some attr. values in (ltable_id) in the foreign table there are no values in (yt_id) in the base table


AssertionError: Candset does not satisfy foreign key constraint with the left table