

<div align="center">
<img align="center" src="https://stelar-project.eu/wp-content/uploads/2022/08/cropped-stelar-sq.png?raw=true" alt="STELAR" width="250"/>

<strong>Deduplication Use Case</strong>
</div>
<div align="center">
<img align="center" src="https://github.com/AI-team-UoA/.github/blob/main/AI_LOGO-1.png?raw=true" alt="AI Team UoA" width="160"/>
<img align="center" src="https://agroknow.com/wp-content/uploads/2020/04/logo_agroknow_sm.png?raw=true" alt="Agroknow" width="160"/>
<img align="center" src="https://github.com/Nikoletos-K/pyJedAI/blob/main/docs/img/pyjedai.logo.drawio.png?raw=true" alt="pyJedAI" width="250"/>
</div>

---

# Initialize data and pyJedAI
Execute the cell below to initialize the application and read the data.

In [1]:
import plotly.express as px
import os
import pandas as pd

# Patch tqdm BEFORE pyjedai is imported (the pyjedai imports are at the end of this cell)
import tqdm
import tqdm.notebook

# Rebind the plain tqdm entry points to their notebook counterparts
# (presumably so progress bars created inside pyjedai render as notebook widgets -- confirm)
tqdm.tqdm = tqdm.notebook.tqdm
tqdm.trange = tqdm.notebook.trange

# Install pyJedAI into the kernel's environment; the package itself is imported further down

%pip install pyjedai

# Optional Google Colab setup (left disabled): mount Drive and read the CSV from there
# from google.colab import drive
# drive.mount(r'/content/drive/')

# incidents = os.path.abspath(r"content/drive/My Drive/Projects/STELAR/data/Agroknow/incidents.csv")

# Load the incidents file from the working directory.
# na_filter=False turns off pandas' missing-value detection, so empty fields stay as empty strings.
d1 = pd.read_csv("incidents.csv", sep=',', na_filter=False)


# Silence FutureWarning messages for the rest of the session
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# LIBS
import sys
import networkx
from networkx import draw, Graph
import pyjedai
from pyjedai.datamodel import Data
from pyjedai.utils import (
    text_cleaning_method,
    print_clusters,
    print_blocks,
    print_candidate_pairs
)
from pyjedai.evaluation import Evaluation




Note: you may need to restart the kernel to use updated packages.


# Preparing data for pyJedAI

In [2]:
# Use the 'Unnamed: 0' column as the entity identifier
# (presumably the index written out when the CSV was saved -- confirm).
d1.rename(columns={'Unnamed: 0': 'id'}, inplace=True)

# Row count of the raw frame; later cells compare against it.
num_of_entities = len(d1.index)
print("Number of initial entities: ", num_of_entities)

Number of initial entities:  11139


# Data preview - Agroknow Incidents

In [3]:
# Rich display of the loaded incidents frame (pandas elides the middle rows).
d1

Unnamed: 0,id,date,originalTitle,description,product,hazard,productCategory,hazardCategory,supplier,url
0,0,2022-07-30T00:00:00,Rachael’s Food Corporation Recalls Ready-To-Ea...,024-2022\r\n\r\n \r\n High - Class I\r\n\r\n...,wraps,listeria monocytogenes,prepared dishes and snacks,biological,Rachael’s Food Corporation,https://www.fsis.usda.gov/recalls-alerts/racha...
1,1,2022-07-30T00:00:00,"Conagra Brands, Inc., Recalls Frozen Beef Prod...",025-2022\r\n\r\n \r\n High - Class I\r\n\r\n...,frozen beef products,eggs and products thereof,meat and meat products (other than poultry),allergens,Conagra Brands,https://www.fsis.usda.gov/recalls-alerts/conag...
2,2,2022-07-29T00:00:00,Lyons Magnus Voluntarily Recalls 53 Nutritiona...,"FRESNO, Calif. – July 28, 2022 – Lyons Magnus ...",non-alcoholic beverages,cronobacter sakazakii,non-alcoholic beverages,biological,Lyons Magnus LLC,https://www.fda.gov/safety/recalls-market-with...
3,3,2022-07-28T00:00:00,Certain Groove Chocolate brand and Daniel Choc...,Food recall warning Certain Groove Chocolate b...,dark chocolate bars,milk and products thereof,"cocoa and cocoa preparations, coffee and tea",allergens,Daniel Chocolates Groove Chocolate Inc.,https://recalls-rappels.canada.ca/en/alert-rec...
4,4,2022-07-28T00:00:00,Danny’s Sub and Pizza Recalls Meat Pizza Produ...,023-2022\r\n\r\n \r\n High - Class I\r\n\r\n...,pizza,unauthorised use of federal inspection mark,prepared dishes and snacks,fraud,Danny’s Sub and Pizza,https://www.fsis.usda.gov/recalls-alerts/danny...
...,...,...,...,...,...,...,...,...,...,...
11134,11134,1994-05-05T00:00:00,Recall Notification: FSIS-017-94,Case Number: 017-94 \n Date Opene...,ham slices,listeria spp,meat and meat products (other than poultry),biological,KOEGEL MEATS,https://www.fsis.usda.gov/wps/portal/fsis/topi...
11135,11135,1994-04-03T00:00:00,Recall Notification: FSIS-009-94,Case Number: 009-94 \n Date Opene...,frankfurter sausages,plastic fragment,meat and meat products (other than poultry),foreign bodies,OSCAR MAYER FOODS,https://www.fsis.usda.gov/wps/portal/fsis/topi...
11136,11136,1994-03-28T00:00:00,Recall Notification: FSIS-014-94,Case Number: 014-94 \r\n Date Ope...,ham slices,listeria monocytogenes,meat and meat products (other than poultry),biological,WILLOW FOODS INC,https://www.fsis.usda.gov/wps/portal/fsis/topi...
11137,11137,1994-03-10T00:00:00,Recall Notification: FSIS-033-94,Case Number: 033-94 \n Date Opene...,sausage,listeria spp,meat and meat products (other than poultry),biological,WIMMER'S MEAT PRODUCTS,https://www.fsis.usda.gov/wps/portal/fsis/topi...


## Keep only the `id` and `description` columns

In [4]:
# Restrict the incidents frame to the identifier and the free-text description,
# then display the result (descriptions are still raw at this point).
AK_processed = d1[['id', 'description']]
AK_processed

Unnamed: 0,id,description
0,0,024-2022\r\n\r\n \r\n High - Class I\r\n\r\n...
1,1,025-2022\r\n\r\n \r\n High - Class I\r\n\r\n...
2,2,"FRESNO, Calif. – July 28, 2022 – Lyons Magnus ..."
3,3,Food recall warning Certain Groove Chocolate b...
4,4,023-2022\r\n\r\n \r\n High - Class I\r\n\r\n...
...,...,...
11134,11134,Case Number: 017-94 \n Date Opene...
11135,11135,Case Number: 009-94 \n Date Opene...
11136,11136,Case Number: 014-94 \r\n Date Ope...
11137,11137,Case Number: 033-94 \n Date Opene...


In [5]:
import re
def process_dataframe(df, column_name):
    """Return a copy of ``df`` with the text in ``column_name`` normalised.

    Every string in the column is lower-cased, each literal backslash is
    removed together with the single character that follows it, and each run
    of carriage-return / line-feed characters is replaced by one space.
    Entries that are not strings (e.g. NaN/None) are passed through unchanged
    instead of raising ``AttributeError``.

    Args:
        df: pandas DataFrame containing the column to clean. It is left
            untouched; the cleaning is applied to a copy.
        column_name: Label of the column whose values are cleaned.

    Returns:
        A new DataFrame with the same rows and columns as ``df`` in which
        ``column_name`` holds the cleaned values.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    # Make sure the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    def clean_text(text):
        """Normalise a single cell; non-string values are returned as-is."""
        if not isinstance(text, str):
            return text
        text = text.lower()  # Convert to lowercase
        # Remove a literal backslash plus the one character after it.
        # '.' does not match a newline, so a backslash directly before a line
        # break is left for the next substitution to deal with the line break only.
        text = re.sub(r'\\.', '', text)
        text = re.sub(r'[\r\n]+', ' ', text)  # Replace runs of \r / \n with one space
        return text

    # Work on a copy so the caller's frame (often a column slice of a larger
    # frame) is never written to; this also avoids SettingWithCopyWarning.
    cleaned = df.copy()
    cleaned[column_name] = cleaned[column_name].apply(clean_text)

    return cleaned

# Take an explicit copy of the two columns of interest: d1[[...]] is a slice of
# d1, and writing into a slice via .loc is what triggers pandas'
# SettingWithCopyWarning. The copy makes AK_processed an independent frame.
AK_processed = d1[['id', 'description']].copy()

# Normalise the description text (lower-case, strip backslash escapes, flatten line breaks).
AK_processed = process_dataframe(AK_processed, 'description')
AK_processed.head(5)

Unnamed: 0,id,description
0,0,024-2022 high - class i product conta...
1,1,025-2022 high - class i misbranding ...
2,2,"fresno, calif. – july 28, 2022 – lyons magnus ..."
3,3,food recall warning certain groove chocolate b...
4,4,023-2022 high - class i produced with...


# Number of exact duplicates

In [6]:
# Keep only the first row for each distinct cleaned description, then report
# how many rows that removed relative to the initial entity count.
AK_processed = AK_processed.drop_duplicates(subset='description', keep='first')
print("Number of exact duplicates: ", num_of_entities - len(AK_processed.index))

Number of exact duplicates:  3546


# pyJedAI data report after dropping exact duplicates
Load `AK_processed` into a pyJedAI `Data` object and print its specs.

In [7]:
# Wrap the de-duplicated frame in a pyJedAI Data object. Only dataset_1 is
# supplied, and 'id' is named as its identifier column.
data = Data(
    dataset_1=AK_processed,
    id_column_name_1='id',
)

# Print pyJedAI's summary report for the loaded data.
data.print_specs()

***************************************************************************************************************************
                                                   Data Report
***************************************************************************************************************************
Type of Entity Resolution:  Dirty
Dataset 1 (D1):
	Number of entities:  7593
	Number of NaN values:  0
	Memory usage [MB]:  24.44
	Attributes:
		 description

Total number of entities:  7593
───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


# Data preview without Exact duplicates

In [8]:
# Display the cleaned frame after exact duplicates were dropped (original ids are kept).
AK_processed

Unnamed: 0,id,description
0,0,024-2022 high - class i product conta...
1,1,025-2022 high - class i misbranding ...
2,2,"fresno, calif. – july 28, 2022 – lyons magnus ..."
3,3,food recall warning certain groove chocolate b...
4,4,023-2022 high - class i produced with...
...,...,...
11134,11134,case number: 017-94 date opened...
11135,11135,case number: 009-94 date opened...
11136,11136,case number: 014-94 date opened...
11137,11137,case number: 033-94 date opened...


In [9]:
# Sanity check: count the rows per description and keep only descriptions that
# occur more than once. After de-duplication the printed frame should be empty.
duplicates = (
    AK_processed
    .groupby('description')
    .size()
    .reset_index(name='counts')
    .loc[lambda counts_df: counts_df['counts'] > 1]
)
print(duplicates)

Empty DataFrame
Columns: [description, counts]
Index: []


# pyJedAI Applications on Data __without__ exact duplicates

# __Vector-based blocking with PyTorch pre-trained embeddings__
---

In [None]:
from pyjedai.clustering import ConnectedComponentsClustering, CenterClustering
from pyjedai.vector_based_blocking import EmbeddingsNNBlockBuilding

# -------------------------------------- #
# Tunable settings for this experiment.

# Passed as `vectorizer` to EmbeddingsNNBlockBuilding.
# NOTE(review): which pretrained model "sminilm" maps to is defined by pyJedAI -- confirm in its docs.
LANGUAGE_MODEL = "sminilm"
# Passed as `top_k` to build_blocks.
TOPK = 10
# Clustering class instantiated below (CenterClustering is imported as an alternative).
CLUSTERING = ConnectedComponentsClustering
# Passed as `similarity_threshold` to the clustering step.
THRESHOLD = 0.95

# -------------------------------------- #



# Embedding-based nearest-neighbour block builder using the 'faiss' similarity search.
emb = EmbeddingsNNBlockBuilding(vectorizer=LANGUAGE_MODEL,
                                similarity_search='faiss')

# Build the blocks with cosine similarity. The call is unpacked into two values:
# the blocks and `g`, which is handed to the clustering step below.
# NOTE(review): presumably with_entity_matching=True is what makes build_blocks
# also return `g`, and the two embedding flags mean "always recompute, then save" -- confirm.
blocks, g = emb.build_blocks(data,
                             top_k=TOPK,
                             similarity_distance='cosine',
                             load_embeddings_if_exist=False,
                             save_embeddings=True,
                             with_entity_matching=True)


# Cluster `g` at the configured threshold and export the clusters to a DataFrame
# (the merges below rely on it having 'id1' and 'id2' columns).
ccc = CLUSTERING()
clusters = ccc.process(g, data, similarity_threshold=THRESHOLD)
nn_pairs_df = ccc.export_to_df(clusters)

print("\n\nPredicted " , nn_pairs_df.shape[0], " matches.")

# Attach id/description to each side of every predicted pair. Both frames are
# cast to str so the join keys compare as strings. The first merge resolves id1;
# the second resolves id2 and suffixes the overlapping id/description columns
# with _1 / _2.
merged_df = pd.merge(nn_pairs_df.astype(str), AK_processed.astype(str), how='left', left_on='id1', right_on='id')
final_df = pd.merge(merged_df, AK_processed.astype(str), how='left', left_on='id2', right_on='id', suffixes=("_1", "_2"))
# Give the four id columns explicit names: pair ids from clustering vs. ids from AK_processed.
final_df.rename(columns={'id1': 'Predicted id1', 'id2': 'Predicted id2', 'id_1': 'Original id1', 'id_2': 'Original id2'}, inplace=True)



Embeddings-NN Block Building [sminilm, faiss, cuda]:   0%|          | 0/7593 [00:00<?, ?it/s]



Predicted  583  matches.


In [12]:
# Re-print the number of predicted pairs (same message the blocking cell prints after clustering).
print("\n\nPredicted " , nn_pairs_df.shape[0], " matches.")



Predicted  583  matches.


## Predicted Duplicates

In [13]:
# @title Predicted Duplicates
# Drop the helper id columns. Assigning the result (instead of inplace=True) with
# errors='ignore' keeps this cell re-runnable: the original raised KeyError on a
# second run because the columns were already gone.
final_df = final_df.drop(columns=['Predicted id1', 'Predicted id2'], errors='ignore')

# Remove any newline characters left in string cells, then export the pairs.
final_df = final_df.replace('\n', '', regex=True)
final_df.to_csv("candidate_pairs_no_dups.csv", index=False)


In [14]:
# Display the predicted duplicate pairs with the id and description of each side.
final_df

Unnamed: 0,Original id1,description_1,Original id2,description_2
0,1060,food recall warning certain abbott brand powde...,158,food recall warning certain abbott brand powde...
1,10006,"for immediate release - june 12, 2013 - oskri ...",10009,"for immediate release - june 12, 2013 - oskri ..."
2,6,imported biscuit may contain allergen (peanuts...,7,imported biscuit may contain allergen (peanuts...
3,2524,updated food recall warning (allergen) - co-op...,2572,food recall warning (allergen) - co-op gold pu...
4,2524,updated food recall warning (allergen) - co-op...,2409,food recall warning (allergen) - co-op gold pu...
...,...,...,...,...
578,11109,case number: 021-97 recall notification repor...,11103,case number: 015-97 recall notification repor...
579,11109,case number: 021-97 recall notification repor...,11104,case number: 018-97 recall notification repor...
580,11094,case number: 037-98 recall notification repor...,11103,case number: 015-97 recall notification repor...
581,11094,case number: 037-98 recall notification repor...,11104,case number: 018-97 recall notification repor...
