In [1]:
import os

import duckdb
import pandas as pd

In [2]:
def prepare_table_pairs(dataset, datasets_path='processed/'):
    """Build a one-column table of the distinct records found in a dataset.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from
    ``<datasets_path>/<dataset>/`` and stacks them. Every row of those files
    holds a pair of records: the left record lives in the columns ending in
    ``_l`` and the right record in the matching columns ending in ``_r``.
    Both sides are pooled into a single table, duplicates are removed, and
    each remaining record is serialized as its attribute values (in sorted
    attribute-name order) converted with ``str`` and joined by tabs.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-directory to read.
    datasets_path : str, optional
        Directory that contains the dataset sub-directories. Defaults to
        ``'processed/'``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``text`` column, one row per distinct record.

    Raises
    ------
    ValueError
        If the CSV files contain no column ending in ``_l``.
    """
    pairs = pd.concat(
        [pd.read_csv(os.path.join(datasets_path, dataset, f'{split}.csv'))
         for split in ('train', 'valid', 'test')],
        ignore_index=True,
    )

    # Attribute names are recovered by stripping the '_l' suffix; sorting
    # makes the serialization order independent of the CSV column order.
    columns = sorted(col[:-2] for col in pairs.columns if col.endswith('_l'))
    if not columns:
        raise ValueError(
            f"dataset {dataset!r} has no columns ending in '_l'; "
            "cannot split the pairs into left and right records"
        )
    left_columns = [f'{col}_l' for col in columns]
    right_columns = [f'{col}_r' for col in columns]

    # De-duplicate each side first so the concatenated frame stays small,
    # then once more to drop records that occur on both sides.
    tableA = pairs[left_columns].drop_duplicates().rename(columns=dict(zip(left_columns, columns)))
    tableB = pairs[right_columns].drop_duplicates().rename(columns=dict(zip(right_columns, columns)))
    table = pd.concat([tableA, tableB], ignore_index=True).drop_duplicates()

    # Missing values are serialized through str() as well (e.g. 'nan').
    table['text'] = table.apply(lambda x: '\t'.join([str(x[col]) for col in columns]), axis=1)
    return table[['text']]

In [3]:
def check_overlap(src_dataset, tgt_dataset):
    """Count the serialized records that two datasets have in common.

    Both datasets are loaded with ``prepare_table_pairs`` and registered in a
    fresh DuckDB connection as ``src`` and ``tgt``. The two tables are then
    joined on exact equality of their ``text`` column.

    Parameters
    ----------
    src_dataset : str
        Name of the first (source) dataset.
    tgt_dataset : str
        Name of the second (target) dataset.

    Returns
    -------
    list of tuple
        The ``fetchall()`` result of the query: a single row
        ``(overlap, src_rows, tgt_rows)`` where ``overlap`` is the number of
        joined ``(src, tgt)`` row pairs with identical text and the other two
        values are the row counts of each table.
    """
    src_table = prepare_table_pairs(src_dataset)
    tgt_table = prepare_table_pairs(tgt_dataset)

    sql_query = """
    WITH overlap AS (
        SELECT COUNT(*)
        FROM src
        JOIN tgt
            ON src.text = tgt.text
    )

    SELECT
        (SELECT * FROM overlap) AS overlap,
        (SELECT COUNT(*) FROM src) AS src_rows,
        (SELECT COUNT(*) FROM tgt) AS tgt_rows;
    """

    con = duckdb.connect()
    try:
        # Expose the DataFrames to DuckDB under the names used in the query.
        con.register('src', src_table)
        con.register('tgt', tgt_table)
        return con.execute(sql_query).fetchall()
    finally:
        # Release the connection even when registration or the query fails.
        con.close()

In [4]:
# Dataset folders to compare against each other.
datasets = [
    'abt', 'amgo', 'beer', 'dbac', 'dbgo', 'foza',
    'itam', 'roim', 'waam', 'wdc', 'zoye',
]

# Report the overlap for every ordered (source, target) pair.
for src_dataset in datasets:
    for tgt_dataset in datasets:
        if src_dataset == tgt_dataset:
            # A dataset is never compared with itself.
            continue
        overlap_results = check_overlap(src_dataset, tgt_dataset)
        # First row, first field of the query result is the overlap count.
        message = f'The overlapping samples between {src_dataset} and {tgt_dataset} is {overlap_results[0][0]}'
        print(message)



The overlapping samples between abt and amgo is 0


The overlapping samples between abt and beer is 0


The overlapping samples between abt and dbac is 0


The overlapping samples between abt and dbgo is 0


The overlapping samples between abt and foza is 0


The overlapping samples between abt and itam is 0


The overlapping samples between abt and roim is 0


The overlapping samples between abt and waam is 0


The overlapping samples between abt and wdc is 0


The overlapping samples between abt and zoye is 0


The overlapping samples between amgo and abt is 0


The overlapping samples between amgo and beer is 0


The overlapping samples between amgo and dbac is 0


The overlapping samples between amgo and dbgo is 0


The overlapping samples between amgo and foza is 0


The overlapping samples between amgo and itam is 0


The overlapping samples between amgo and roim is 0


The overlapping samples between amgo and waam is 0


The overlapping samples between amgo and wdc is 0


The 