

<div align="center">
<img align="center" src="https://stelar-project.eu/wp-content/uploads/2022/08/cropped-stelar-sq.png?raw=true" alt="STELAR" width="250"/>

<strong>Deduplication Use Case</strong>
</div>
<div align="center">
<img align="center" src="https://github.com/AI-team-UoA/.github/blob/main/AI_LOGO-1.png?raw=true" alt="AI Team UoA" width="160"/>
<img align="center" src="https://agroknow.com/wp-content/uploads/2020/04/logo_agroknow_sm.png?raw=true" alt="Agroknow" width="160"/>
<img align="center" src="https://github.com/Nikoletos-K/pyJedAI/blob/main/docs/img/pyjedai.logo.drawio.png?raw=true" alt="pyJedAI" width="250"/>
</div>

---

# Initialize data and pyJedAI
Execute the cell below to install pyJedAI into the notebook's environment; the data is read in the sections that follow.

In [6]:
%pip install pyjedai

Collecting pyjedai
  Using cached pyjedai-0.3.3-py3-none-any.whl.metadata (12 kB)
Collecting ollama (from pyjedai)
  Using cached ollama-0.5.2-py3-none-any.whl.metadata (4.3 kB)
Collecting httpx>=0.27 (from ollama->pyjedai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic>=2.9 (from ollama->pyjedai)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting anyio (from httpx>=0.27->ollama->pyjedai)
  Using cached anyio-4.10.0-py3-none-any.whl.metadata (4.0 kB)
Collecting httpcore==1.* (from httpx>=0.27->ollama->pyjedai)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx>=0.27->ollama->pyjedai)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2.9->ollama->pyjedai)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.2 (from pydantic>=2.9->ollama->pyjedai)
  Using cach

# pyJedAI Schema Matching Scores on Agroportal

In [None]:
import pandas as pd
from pyjedai.schema.schema_model import Schema
import ast
from pyjedai.joins import TopKJoin
from pyjedai.clustering import UniqueMappingClustering


# Load the stored schema-matching scores for Agroportal.
results_df = pd.read_csv("Final Results/Agroportal/agroportal_results_only_schema_with_re_pr.csv")

# Per-workflow hyper-parameter columns that exist only for the syntactic runs.
# ("weigthing_scheme" is spelled exactly as in the CSV header.)
syntactic_list = [
    "weigthing_scheme",
    'matching_tokenizer',
    "matching_vectorizer",
    "matching_metric",
]

# Column names have the shape "<prefix><cluster><suffix>".
prefix_columns = ['', 'syntactic-']
cluster_list = ['UNI: ', 'K: ']
suffix_list = ['F1', 'params']

columns, f1_columns = [], []
for prefix in prefix_columns:
    is_syntactic = prefix == 'syntactic-'
    # Syntactic workflows expose their individual parameters as extra columns.
    suffixes_list = suffix_list + syntactic_list if is_syntactic else suffix_list
    for cluster in cluster_list:
        for suffix in suffixes_list:
            column_name = f'{prefix}{cluster}{suffix}'
            if suffix == 'F1':
                f1_columns.append(column_name)
            # The syntactic workflows have no aggregated 'params' column.
            if is_syntactic and suffix == 'params':
                continue
            columns.append(column_name)

# Identification columns shared by every row.
columns.extend(['filename', 'dataset', 'dataset_type'])

# Keep only the columns of interest and display the result.
results_df = results_df[columns]
results_df

Unnamed: 0,UNI: F1,UNI: params,K: F1,K: params,syntactic-UNI: F1,syntactic-UNI: weigthing_scheme,syntactic-UNI: matching_tokenizer,syntactic-UNI: matching_vectorizer,syntactic-UNI: matching_metric,syntactic-K: F1,syntactic-K: weigthing_scheme,syntactic-K: matching_tokenizer,syntactic-K: matching_vectorizer,syntactic-K: matching_metric,filename,dataset,dataset_type
0,0.0,"{'language_model': 'st5', 'similarity': 0.8500...",0.0,"{'language_model': 'st5', 'similarity': 0.8500...",58.66861503490826,JS,white_space_tokenizer,,edit_distance,58.871030169691785,JS,white_space_tokenizer,,edit_distance,agroportal,agroportal,joinable
1,,,,,,,,,,,,,,,,,


In [37]:
# Restrict the results to the F1 columns of every workflow.
f1_results = results_df[f1_columns]

# Row-wise best F1 value and the name of the column that holds it.
max_values = f1_results.max(axis=1)
max_columns = f1_results.idxmax(axis=1)

# Turn "<workflow>: F1" into "<workflow>".
# str.rstrip(': F1') would strip any trailing run of the characters
# ':', ' ', 'F' and '1' (a label such as "TOP1: F1" would become "TOP"),
# so the suffix is removed explicitly instead. Non-string entries
# (e.g. NaN for an all-NaN row) are passed through unchanged.
F1_SUFFIX = ': F1'
max_columns = [
    mc[:-len(F1_SUFFIX)] if isinstance(mc, str) and mc.endswith(F1_SUFFIX) else mc
    for mc in max_columns.tolist()
]

print(f"Best Agroportal Workflow is {max_columns[0]}")


Best Agroportal Workflow is syntactic-K


## Syntactic K for Agroportal

Initialize the ontologies' data

In [20]:
# Read the two attribute tables and the validation mappings; every value is
# cast to str after loading.
d1 = pd.read_csv('data/agroportal/taxref-ld.csv', na_filter=True).astype(str)
d2 = pd.read_csv('data/agroportal/ncbitaxon.csv', na_filter=True).astype(str)
gt = pd.read_csv('data/agroportal/mappings_val.csv', na_filter=True).astype(str)

# Source side: one single-row frame whose column headers are the values of
# d1['attributes'] and whose cells are the positions 0..n-1.
source_col = d1['attributes'].to_list()
source_data = list(range(len(source_col)))
source_df = pd.DataFrame(data=[source_data], columns=source_col)

# Target side: same construction from d2['attributes'].
target_col = d2['attributes'].to_list()
target_data = list(range(len(target_col)))
target_df = pd.DataFrame(data=[target_data], columns=target_col)

# Wrap both frames and the ground truth in a pyJedAI Schema object.
schema = Schema(
    dataset_1=source_df,
    dataset_2=target_df,
    ground_truth=gt,
    matching_type="SCHEMA",
)


  d1 = pd.read_csv('data/agroportal/taxref-ld.csv', na_filter=True).astype(str)
  d2 = pd.read_csv('data/agroportal/ncbitaxon.csv', na_filter=True).astype(str)


In [21]:
# Display the `ground_truth` attribute of the Schema object for inspection.
schema.ground_truth

Unnamed: 0,source_index,target_index
0,140587,0
1,140587,17729
2,275401,0
3,275401,17729
8,163100,2
...,...,...
145299,9589,707246
145300,8296,707266
145301,273197,707295
145302,67004,707307


## Standard Blocking 

In [23]:
from pyjedai.block_building import StandardBlocking

# Build the initial blocks from the schema with StandardBlocking, constructed
# with no arguments (i.e. whatever defaults the library defines).
sb = StandardBlocking()
blocks = sb.build_blocks(schema)


Standard Blocking: 100%|██████████| 1009277/1009277 [00:40<00:00, 25072.02it/s]


## Block Filtering

In [24]:
from pyjedai.block_cleaning import BlockFiltering

# Apply BlockFiltering to the blocks produced by Standard Blocking.
# NOTE(review): ratio=0.5 is a hard-coded value; presumably it is the fraction
# of blocks retained per entity — confirm against the pyJedAI documentation.
bf = BlockFiltering(ratio=0.5)
filtered_blocks = bf.process(blocks, schema)


Block Filtering: 100%|██████████| 3/3 [00:16<00:00,  5.49s/it]


## Comparison Cleaning

In [29]:
from pyjedai.comparison_cleaning import CardinalityNodePruning


# Weighting scheme recorded for the syntactic-K workflow in the first row of
# results_df. The column name is misspelled ("weigthing") in the results CSV,
# so the key must be spelled the same way here.
ws = results_df.iloc[0]['syntactic-K: weigthing_scheme']

# Prune the filtered blocks with CardinalityNodePruning using that scheme.
cc = CardinalityNodePruning(weighting_scheme=ws)
meta_blocks = cc.process(filtered_blocks, schema)


Cardinality Node Pruning: 100%|██████████| 1009277/1009277 [12:28<00:00, 1348.62it/s]


## Entity Matching

In [30]:
# Column-name prefix of the workflow whose matching parameters are inspected.
prefix = 'syntactic-K: '
matching_suffix = ["matching_tokenizer", "matching_vectorizer", "matching_metric"]

# Full column names of the matching parameters for that workflow.
matching_list = [prefix + suffix for suffix in matching_suffix]

# Show the parameter values stored in the first row of the results table.
first_row = results_df.iloc[0]
first_row[matching_list]

syntactic-K: matching_tokenizer     white_space_tokenizer
syntactic-K: matching_vectorizer                         
syntactic-K: matching_metric                edit_distance
Name: 0, dtype: object

In [31]:
from pyjedai.matching import EntityMatching

# Score the candidate pairs kept in meta_blocks and obtain the similarity graph.
# NOTE(review): the metric is hard-coded; the first row of results_df lists the
# same value under 'syntactic-K: matching_metric' — consider reading it from
# there so this cell cannot drift from the results table. The tokenizer is not
# passed here, so presumably the library default is used — confirm.
em = EntityMatching(metric='edit_distance')
graph = em.predict(meta_blocks, schema)

Entity Matching (edit_distance, white_space_tokenizer): 100%|██████████| 723843/723843 [06:34<00:00, 1832.53it/s]


## Clustering

In [32]:
from pyjedai.clustering import KiralyMSMApproximateClustering

# Cluster the similarity graph with KiralyMSMApproximateClustering, constructed
# with no arguments.
k_cl = KiralyMSMApproximateClustering()

clusters = k_cl.process(graph, schema)
# evaluate() prints the precision / recall / F1 report; its return value is
# assigned to `_` so the cell does not also echo it as output.
_ = k_cl.evaluate(clusters)

***************************************************************************************************************************
                                         Method:  Kiraly MSM Approximate Clustering
***************************************************************************************************************************
Method name: Kiraly MSM Approximate Clustering
Parameters: 
	Similarity Threshold: 0.1
Runtime: 44.9351 seconds
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Performance:
	Precision:     57.51% 
	Recall:        56.18%
	F1-score:      56.84%
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
