

<div align="center">
<img align="center" src="https://stelar-project.eu/wp-content/uploads/2022/08/cropped-stelar-sq.png?raw=true" alt="STELAR" width="250"/>

<strong>Deduplication Use Case</strong>
</div>
<div align="center">
<img align="center" src="https://github.com/AI-team-UoA/.github/blob/main/AI_LOGO-1.png?raw=true" alt="AI Team UoA" width="160"/>
<img align="center" src="https://agroknow.com/wp-content/uploads/2020/04/logo_agroknow_sm.png?raw=true" alt="Agroknow" width="160"/>
<img align="center" src="https://github.com/Nikoletos-K/pyJedAI/blob/main/docs/img/pyjedai.logo.drawio.png?raw=true" alt="pyJedAI" width="250"/>
</div>

---

# Initialize data and pyJedAI
Execute the cell below to install pyJedAI; the data is read in the next section.

In [1]:
# Install pyJedAI with the %pip magic, which targets the running kernel's environment.
%pip install pyjedai

Note: you may need to restart the kernel to use updated packages.


# Load Eurocrops Germany Lower Saxony Data 

Loading Eurocrops Germany Lower Saxony Data for composite Schema Matching

In [46]:
import json

import pandas as pd
from pyjedai.schema.schema_model import Schema

# Eurocrops region code; "DE_LS" selects the Germany / Lower Saxony files.
country = "DE_LS"

# Input files, resolved relative to the notebook's working directory.
dbf_name = f"Eurocrops/{country}/{country}_dbf.csv"
file_name = f"Eurocrops/{country}/{country}.csv"
gtruth_filename = f"Eurocrops/{country}/{country}.json"

# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding (which is not UTF-8 everywhere).
with open(gtruth_filename, 'r', encoding='utf-8') as f:
    gtruth_data = json.load(f)

# The two tables whose schemas are matched against each other.
d1 = pd.read_csv(dbf_name)
d2 = pd.read_csv(file_name)

# The ground-truth pairs are stored under the "matches" key of the JSON file.
gt = pd.DataFrame(gtruth_data['matches'])

# Wrap both tables and the ground truth for schema-level matching.
schema = Schema(dataset_1=d1, dataset_2=d2, ground_truth=gt, matching_type="SCHEMA")

# pyJedAI Semantic Schema Matching  

The best workflow found for Eurocrops-AT was `{'language_model': 'sminilm', 'similarity': 0.1, 'top_k': 1, 'reversed': False}`; the same configuration is applied below to Eurocrops-DE_LS.

In [47]:
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding
from pyjedai.clustering import UniqueMappingClustering

# Similarity threshold handed to the clustering step below.
similarity = 0.1

emb = EmbeddingsNNBlockBuilding(vectorizer='sminilm')

# build_blocks is unpacked into two values here; only the second (the graph)
# is used afterwards. The first is bound to an ordinary name rather than `_`,
# because IPython uses `_` for the last cell output and stops updating it once
# user code assigns to it.
nn_blocks, graph = emb.build_blocks(
    schema,
    top_k=1,
    with_entity_matching=True,
    load_embeddings_if_exist=False,
)

# Clustering
clustering = UniqueMappingClustering()
clusters = clustering.process(graph, schema, similarity)
ev = clustering.evaluate(clusters)



Embeddings-NN Block Building [sminilm, faiss, cuda]:   0%|          | 0/17 [00:00<?, ?it/s]

***************************************************************************************************************************
                                         Method:  Unique Mapping Clustering
***************************************************************************************************************************
Method name: Unique Mapping Clustering
Parameters: 
	Similarity Threshold: 0.1
Runtime: 0.0003 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:     75.00% 
	Recall:        42.86%
	F1-score:      54.55%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [48]:
# Export the clusters as a DataFrame and report how many rows (matches) it holds.
df = clustering.export_to_df(clusters)
match_count = len(df)
print(f'Number of Matches found in Eurocrops-{country}: {match_count}')

Number of Matches found in Eurocrops-DE_LS: 4


In [50]:
# Header used for the column that comes from dataset_1.
dbf_label = f'{country}-DBF'

# Step 1: resolve each 'id1' against dataset_1, then discard both join keys.
dbf_side = (
    df.merge(right=schema.dataset_1, left_on='id1', right_on='id', how='inner')
      .drop(columns=['id1', 'id'])
)
dbf_side.columns = ['id2', dbf_label]

# Step 2: resolve each 'id2' against dataset_2, again discarding the join keys.
merge = (
    dbf_side.merge(right=schema.dataset_2, left_on='id2', right_on='id', how='inner')
            .drop(columns=['id2', 'id'])
)
merge.columns = [dbf_label, country]

# Show the matched pairs side by side.
merge

Unnamed: 0,DE_LS-DBF,DE_LS
0,EC_hcat_c,HCAT3_name
1,KULTURCODE,original_code
2,EC_trans_n,translated_name
3,EC_org_n,original_name


## pyJedAI Join Schema Matching with Unique Mapping Clustering

Top-K Join matching with arguments: `{'metric': 'cosine', 'tokenization': 'qgrams', 'qgrams': 2, 'K': 1}`

In [61]:
from pyjedai.joins import TopKJoin

# Top-K join with K=1 over character 2-grams, scored by cosine similarity.
join = TopKJoin(
    metric='cosine',
    tokenization='qgrams',
    qgrams=2,
    K=1,
)
graph = join.fit(schema)

# No threshold is passed to process(), so its default applies.
clustering = UniqueMappingClustering()
clusters = clustering.process(graph, schema)
ev = clustering.evaluate(clusters)


Top-K Join (cosine):   0%|          | 0/7 [00:00<?, ?it/s]

***************************************************************************************************************************
                                         Method:  Unique Mapping Clustering
***************************************************************************************************************************
Method name: Unique Mapping Clustering
Parameters: 
	Similarity Threshold: 0.1
Runtime: 0.0007 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:    100.00% 
	Recall:        71.43%
	F1-score:      83.33%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [63]:
# Export the clusters as a DataFrame and report how many rows (matches) it holds.
df = clustering.export_to_df(clusters)
match_count = len(df)
print(f'Number of Matches found in Eurocrops-{country}: {match_count}')

# Header used for the column that comes from dataset_1.
dbf_label = f'{country}-DBF'

# Resolve each 'id1' against dataset_1, then discard both join keys.
dbf_side = (
    df.merge(right=schema.dataset_1, left_on='id1', right_on='id', how='inner')
      .drop(columns=['id1', 'id'])
)
dbf_side.columns = ['id2', dbf_label]

# Resolve each 'id2' against dataset_2, again discarding the join keys.
merge = (
    dbf_side.merge(right=schema.dataset_2, left_on='id2', right_on='id', how='inner')
            .drop(columns=['id2', 'id'])
)
merge.columns = [dbf_label, country]

# Show the matched pairs side by side.
merge

Number of Matches found in Eurocrops-DE_LS: 5


Unnamed: 0,DE_LS-DBF,DE_LS
0,EC_hcat_n,HCAT3_name
1,EC_hcat_c,HCAT3_code
2,EC_trans_n,translated_name
3,KULTURCODE,original_code
4,EC_org_n,original_name
