

<div align="center">
<img align="center" src="https://stelar-project.eu/wp-content/uploads/2022/08/cropped-stelar-sq.png?raw=true" alt="STELAR" width="250"/>

<strong>Deduplication Use Case</strong>
</div>
<div align="center">
<img align="center" src="https://github.com/AI-team-UoA/.github/blob/main/AI_LOGO-1.png?raw=true" alt="AI Team UoA" width="160"/>
<img align="center" src="https://agroknow.com/wp-content/uploads/2020/04/logo_agroknow_sm.png?raw=true" alt="Agroknow" width="160"/>
<img align="center" src="https://github.com/Nikoletos-K/pyJedAI/blob/main/docs/img/pyjedai.logo.drawio.png?raw=true" alt="pyJedAI" width="250"/>
</div>

---

# Initialize data and pyJedAI
Execute the cell below to install pyJedAI; the data is read in the next section.

In [1]:
# Install pyJedAI with the %pip magic, which targets the environment of the running kernel.
%pip install pyjedai

Note: you may need to restart the kernel to use updated packages.


# Load CIQUAL-SIREN Data

In [6]:
import pandas as pd
from pyjedai.schema.schema_model import Schema

# Read the source labels, the target labels and the ground-truth label mappings
# (in that order), casting every column to str.
d1, d2, gt = (
    pd.read_csv(csv_path, na_filter=True).astype(str)
    for csv_path in (
        'ciqual-siren/source.csv',
        'ciqual-siren/target.csv',
        'ciqual-siren/mappings_of_labels.csv',
    )
)

# Initializing Ciqual Siren Data: each label becomes a column name, and the
# frame holds a single row whose cells are the positional index of each label.
source_col = d1['label'].to_list()
source_data = list(range(len(source_col)))
source_df = pd.DataFrame(data=[source_data], columns=source_col)

target_col = d2['label'].to_list()
target_data = list(range(len(target_col)))
target_df = pd.DataFrame(data=[target_data], columns=target_col)

# Wrap both single-row frames and the ground truth for schema matching.
schema = Schema(
    dataset_1=source_df,
    dataset_2=target_df,
    ground_truth=gt,
    matching_type="SCHEMA",
)


# pyJedAI Semantic Schema Matching  

The best-performing workflow for CIQUAL-SIREN was `{'language_model': 'sdistilroberta', 'similarity': 0.80, 'top_k': 1, 'reversed': False}`.

In [29]:
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding
from pyjedai.clustering import UniqueMappingClustering

# Similarity threshold handed to the clustering step below.
similarity = 0.80

# Embed the labels with the 'sdistilroberta' vectorizer. build_blocks is
# unpacked into two values here; only the second one (the graph) is used.
emb = EmbeddingsNNBlockBuilding(vectorizer='sdistilroberta')
_, graph = emb.build_blocks(
    schema,
    top_k=1,
    with_entity_matching=True,
    load_embeddings_if_exist=True,
)

# Clustering: process the graph with the threshold above, then evaluate
# the resulting clusters.
clustering = UniqueMappingClustering()
clusters = clustering.process(graph, schema, similarity)
ev = clustering.evaluate(clusters)



Embeddings-NN Block Building [sdistilroberta, faiss, cuda]:   0%|          | 0/22693 [00:00<?, ?it/s]

***************************************************************************************************************************
                                         Method:  Unique Mapping Clustering
***************************************************************************************************************************
Method name: Unique Mapping Clustering
Parameters: 
	Similarity Threshold: 0.8
Runtime: 0.0407 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.52% 
	Recall:         0.56%
	F1-score:       0.54%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [30]:
# Export the clusters from the semantic-matching cell to a DataFrame
# and report how many rows (matches) it contains.
df = clustering.export_to_df(clusters)
print(f'Number of Matches found in CIQUAL-SIREN: {len(df)}')

Number of Matches found in CIQUAL-SIREN: 193


In [31]:
# Resolve id1 against dataset_1, drop the join keys, and name the two
# remaining columns.
merge = (
    df.merge(right=schema.dataset_1, left_on='id1', right_on='id', how='inner')
      .drop(columns=['id1', 'id'])
)
merge.columns = ['id2', 'ciqual']

# Resolve id2 against dataset_2 the same way, leaving one column per dataset.
merge = (
    merge.merge(right=schema.dataset_2, left_on='id2', right_on='id', how='inner')
         .drop(columns=['id2', 'id'])
)
merge.columns = ['CIQUAL', 'SIREN']

merge

Unnamed: 0,CIQUAL,SIREN
0,Peanut,PEANUT
1,Mustard,MUSTARD
2,Nutmeg,NUTMEG
3,Prune,PRUNE
4,Brazil nut,BRAZIL NUT
...,...,...
188,"Onion, dried","onion (fried, dehydrated)@en"
189,"Bamboo shoots, raw",bamboo shoot (raw)@en
190,"Cumin, seed",CUMIN
191,"Potato, sautéed/pan-fried","potato (pre-fried, sliced)@en"


## pyJedAI Syntactic Schema Matching with Kiraly Clustering

### Standard Blocking 

In [None]:
from pyjedai.block_building import StandardBlocking

# Build the initial blocks over the schema with Standard Blocking.
# NOTE(review): disable_ray=True presumably turns off Ray-based parallelism — confirm against the pyJedAI docs.
sb = StandardBlocking(disable_ray=True)
blocks = sb.build_blocks(schema)


Standard Blocking:   0%|          | 0/22693 [00:00<?, ?it/s]

[2025-11-04 14:25:03,991 E 17777 24225] core_worker_process.cc:825: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14


### Block Filtering

In [34]:
from pyjedai.block_cleaning import BlockFiltering

# Run Block Filtering (configured with ratio=0.5) over the blocks built
# by Standard Blocking.
bf = BlockFiltering(ratio=0.5)
filtered_blocks = bf.process(blocks, schema)


Block Filtering:   0%|          | 0/3 [00:00<?, ?it/s]

### Comparison Cleaning

In [35]:
from pyjedai.comparison_cleaning import CardinalityNodePruning


# Weighting scheme handed to Cardinality Node Pruning.
ws = 'JS'

# Prune the filtered blocks; the result feeds the entity-matching step.
cc = CardinalityNodePruning(weighting_scheme=ws)
meta_blocks = cc.process(filtered_blocks, schema)


Cardinality Node Pruning:   0%|          | 0/22693 [00:00<?, ?it/s]

### Entity Matching

In [None]:
from pyjedai.matching import EntityMatching

# Score the candidate pairs kept by comparison cleaning, using generalized
# Jaccard over whitespace tokens.
# NOTE(review): presumably pairs scoring below similarity_threshold=0.85 are left out of the graph — confirm.
em = EntityMatching(tokenizer='white_space_tokenizer', metric='generalized_jaccard', similarity_threshold=0.85)
graph = em.predict(meta_blocks, schema)

Entity Matching (generalized_jaccard, white_space_tokenizer):   0%|          | 0/17724 [00:00<?, ?it/s]

### Clustering

In [57]:
from pyjedai.clustering import KiralyMSMApproximateClustering

# Cluster the entity-matching graph with Kiraly MSM Approximate Clustering,
# constructed with its default arguments.
k_cl = KiralyMSMApproximateClustering()

clusters = k_cl.process(graph, schema)
# The return value of evaluate() is bound to `_` because it is not used afterwards.
_ = k_cl.evaluate(clusters)

***************************************************************************************************************************
                                         Method:  Kiraly MSM Approximate Clustering
***************************************************************************************************************************
Method name: Kiraly MSM Approximate Clustering
Parameters: 
	Similarity Threshold: 0.1
Runtime: 0.0603 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.13% 
	Recall:         0.56%
	F1-score:       0.22%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [59]:
# Export the Kiraly clusters through the object that produced them (k_cl).
# Going through `clustering` would rely on the UniqueMappingClustering
# instance left over from the semantic-matching section still being in the
# kernel and having been run on the same schema.
df = k_cl.export_to_df(clusters)
print(f'Number of Matches found in CIQUAL-SIREN: {len(df)}')

# Resolve id1 against dataset_1, drop the join keys, and name the two
# remaining columns.
merge = df.merge(right=schema.dataset_1, left_on='id1', right_on='id', how='inner')
merge = merge.drop(columns=['id1', 'id'])
merge.columns = ['id2', 'ciqual']

# Resolve id2 against dataset_2 the same way, leaving one column per dataset.
merge = merge.merge(right=schema.dataset_2, left_on='id2', right_on='id', how='inner')

merge = merge.drop(columns=['id2', 'id'])
merge.columns = ['CIQUAL', 'SIREN']

merge

Number of Matches found in CIQUAL-SIREN: 741


Unnamed: 0,CIQUAL,SIREN
0,"Olive, green, stuffed (anchovy, sweet peppers,...",10006190 - SQUARE SWEET PEPPERS (BLUNT) (GS1 GPC)
1,"Squid, raw",squid raw@en
2,"Snail, raw",snail (raw)@en
3,Clear fruit brandy or eau-de-vie,33870 - BRANDY (EFSA FOODEX2)
4,"Lobster, raw",lobster (frozen)@en
...,...,...
736,"Dark chocolate bar, filled with praline",33400 - DARK ALE BEER (EFSA FOODEX2)
737,Toblerone milk cholcolate bar w nougat,43290 - NOUGAT FLAVOUR (EFSA FOODEX2)
738,"Pollen, partially dried",FAT PARTIALLY REMOVED
739,"Jam, reduced sugar",jam food product@en


## pyJedAI Join Schema Matching with Unique Mapping Clustering

TopK-Join matching with arguments: `{'metric': 'cosine', 'tokenization': 'qgrams', 'qgrams': 2, 'K': 1}`

In [60]:
from pyjedai.joins import TopKJoin


# Top-K join (K=1) over the schema, using cosine similarity on 2-grams.
join = TopKJoin(K=1, metric='cosine', tokenization='qgrams', qgrams=2)
graph = join.fit(schema)
# UniqueMappingClustering is imported in the semantic-matching cell earlier in
# the notebook; `clustering`, `clusters` and `ev` are rebound here.
clustering = UniqueMappingClustering()
clusters = clustering.process(graph, schema)
ev = clustering.evaluate(clusters)


Top-K Join (cosine):   0%|          | 0/19886 [00:00<?, ?it/s]

***************************************************************************************************************************
                                         Method:  Unique Mapping Clustering
***************************************************************************************************************************
Method name: Unique Mapping Clustering
Parameters: 
	Similarity Threshold: 0.1
Runtime: 0.2841 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:      0.00% 
	Recall:         0.00%
	F1-score:       0.00%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [61]:
# Export the Top-K join clusters and report how many matches were found.
df = clustering.export_to_df(clusters)
print(f'Number of Matches found in CIQUAL-SIREN: {len(df)}')

# Resolve id1 against dataset_1, drop the join keys, and name the two
# remaining columns.
merge = (
    df.merge(right=schema.dataset_1, left_on='id1', right_on='id', how='inner')
      .drop(columns=['id1', 'id'])
)
merge.columns = ['id2', 'ciqual']

# Resolve id2 against dataset_2 the same way, leaving one column per dataset.
merge = (
    merge.merge(right=schema.dataset_2, left_on='id2', right_on='id', how='inner')
         .drop(columns=['id2', 'id'])
)
merge.columns = ['CIQUAL', 'SIREN']

merge

Number of Matches found in CIQUAL-SIREN: 2376


Unnamed: 0,CIQUAL,SIREN
0,Mustard,MUSTARD
1,Saffron,SAFFRON
2,Nutmeg,NUTMEG
3,Prune,PRUNE
4,Peanut,PEANUT
...,...,...
2371,"Dried pasta, gluten-free, cooked, unsalted",glycogens
2372,"Rillettes, pure goose",10006746 - DOODLES/ PUFFS (GS1 GPC)
2373,"Cooked ham, superior quality, rind less and fa...",16580 - LIMEQUATS (EFSA FOODEX2)
2374,"Lamb, meat, cooked (average)",10006414 - MEDLAR (GS1 GPC)
