# Entity Matching using Neo4j - Error analysis

_Salomon Tetelepta, April 27th 2024_
* Notebook to dig into the errors that remain after entity matching using a graph representation of the Abt-Buy Dataset

### Install dependencies

In [1]:
!pip install neo4j python-dotenv langchain-community --quiet

%load_ext watermark
%watermark -p neo4j

neo4j: 5.17.0



### Imports

In [15]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
from langchain_community.graphs import Neo4jGraph
from pathlib import Path
from sklearn.manifold import TSNE
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from wordfreq import lossy_tokenize, tokenize, word_frequency

import json
import matplotlib.pyplot as plt
import neo4j
import numpy as np

import os
import pandas as pd
import pickle
import re

### Settings

In [4]:
# paths: the project root is the parent of the current working directory
project_path = Path.cwd().parent
data_path, output_path = project_path / "data", project_path / "output"

# name of the Neo4j database this notebook works on
database = "abt-buy"

# environment settings are read from the .env file in the project root
load_dotenv(project_path / ".env")

# reproducibility: seed NumPy's global random number generator
np.random.seed(42)

### 1. Load Data

In [5]:
# show which record-description files are available for the Abt-Buy dataset
os.listdir(data_path / 'abt-buy' / 'record_descriptions')

['2_buy.csv', '1_abt.csv']

In [6]:
# locations of the Abt-Buy input files
abt_buy_path = data_path / 'abt-buy'
records_path = abt_buy_path / 'record_descriptions'

# product records of both sources (the Abt file is decoded with 'unicode_escape')
df_abt = pd.read_csv(records_path / '1_abt.csv', encoding='unicode_escape')
df_buy = pd.read_csv(records_path / '2_buy.csv')

# labelled pairs (source_id, target_id, matching): training and validation split
df_train = pd.read_csv(abt_buy_path / 'gs_train.csv')
df_val = pd.read_csv(abt_buy_path / 'gs_val.csv')

# attach the Abt record (via source_id) and then the Buy record (via target_id)
# to each training pair; the right joins keep every training pair
df_abt_merged = pd.merge(df_abt, df_train, left_on='subject_id', right_on='source_id', how='right')
df_train_merged = pd.merge(df_buy, df_abt_merged, left_on='subject_id', right_on='target_id', how='right')

df_train_merged.head(3)

Unnamed: 0,subject_id_x,name_x,description_x,manufacturer,price_x,subject_id_y,name_y,description_y,price_y,source_id,target_id,matching
0,207390654,Sony Handycam HDR-SR10 High Definition Digital...,16:9 - 2.7' Hybrid LCD,Sony,549.0,33161,Sony High Definition HDV Handycam Camcorder - ...,Sony High Definition HDV Handycam Camcorder - ...,,33161,207390654,False
1,208085180,Pioneer DEH-2000MP Car Audio Player,"CD-RW - CD-Text, MP3, WMA, WAV - LCD - 4 - 200...",Pioneer,84.0,36258,D-Link Broadband Cable Modem - DCM202,D-Link Broadband Cable Modem - DCM202/ DOCSIS ...,79.0,36258,208085180,False
2,90125786,Sanus Wall/Ceiling Speaker Mount - WMS3S SILVER,Plastic - 8 lb,Sanus,,17417,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,39.99,17417,90125786,False


### Connect to Neo4j

In [12]:
# connect to Neo4j
# connection details come from environment variables (NEO4J_URL, NEO4J_USER, NEO4J_PASS);
# os.getenv returns None for any variable that is not set
graph = Neo4jGraph(
    url=os.getenv('NEO4J_URL'),
    username=os.getenv('NEO4J_USER'),
    password=os.getenv('NEO4J_PASS')
)

# create database if does not exist
# NOTE(review): `_database` is a private attribute of Neo4jGraph; it is switched to
# "system" here so the CREATE DATABASE statement runs against the system database
graph._database = "system"
query = f"CREATE DATABASE `{database}` IF NOT EXISTS"
graph.query(query)

# change to target database
# all queries issued after this point run against `database`
graph._database = database
print("database:", graph._database)

# check nr nodes in the graph
graph.query("MATCH (n) RETURN count(n)")

database: abt-buy


[{'count(n)': 16168}]

### Error analysis

#### Analysis Errors on the training set

In [41]:
%%time

results = []

name_degree_threshold = 6
query = f"""
MATCH p1=(i1:Item)-[:HAS_NAME]->(n1:Name)-[:HAS_WORD]->(w:Word)<-[:HAS_WORD]-(n2:Name)<-[:HAS_NAME]-(i2:Item)
WHERE n1 <> n2
AND w.name_degree < {threshold}
RETURN i1.subject_id, i2.subject_id"""
df_p = pd.DataFrame(graph.query(query))

if len(df_p) > 0:

    df_train_p = df_train.merge(df_p, left_on=['source_id', 'target_id'], right_on=['i1.subject_id', 'i2.subject_id'], how='left')
    df_train_p['p'] = df_train_p['i1.subject_id'] > 0

    prec, recall, fscore, support = precision_recall_fscore_support(df_train_p['matching'], df_train_p['p'], average='binary')

    results.append({'threshold': name_degree_threshold, 'model': f'shared_word_threshold_{name_degree_threshold}', 'prec': prec, 'recall': recall, 'fscore': fscore})

df_results = pd.DataFrame(results).sort_values('fscore', ascending=False)
df_results

CPU times: user 143 ms, sys: 19.1 ms, total: 162 ms
Wall time: 229 ms


Unnamed: 0,threshold,model,prec,recall,fscore
0,6,shared_word_threshold_6,0.835427,0.69055,0.756111


* Interestingly, the F-score on the training set (0.756) is lower than on the validation set (0.789)


#### Mark errors in the graph

In [42]:
# boolean masks over the training pairs: ground truth (cond_y) and prediction (cond_p)
cond_y = df_train_p['matching'].eq(True)
cond_p = df_train_p['i1.subject_id'].gt(0)

# one mask per confusion-matrix cell
errors = dict(
    tp=cond_y & cond_p,
    fp=cond_p & ~cond_y,
    tn=~(cond_y | cond_p),
    fn=cond_y & ~cond_p,
)

# write the cell label of every pair into the 'error_train' column
for error, cond_error in errors.items():
    df_train_p.loc[cond_error, 'error_train'] = error

# number of pairs per cell, shown as a single row
df_train_p['error_train'].value_counts().to_frame().transpose()

Unnamed: 0,tn,tp,fn,fp
error_train,4146,665,298,131


In [53]:
%%time

for i, (idx, row) in enumerate(df_train_p.iterrows()):
    if i % 250 == 0:
        print(f"{i}/{len(df_train_p)}")

    query = f"""
    MATCH (i1:Item)-[r]->(i2:Item) 
    WHERE i1.subject_id = {row['source_id']} 
    AND i2.subject_id={row['target_id']}
    MERGE (i1)-[:TRAIN_{row['error_train'].upper()}]->(i2)
    RETURN count(distinct r)
    """
    graph.query(query)

0/5240
250/5240
500/5240
750/5240
1000/5240
1250/5240
1500/5240
1750/5240
2000/5240
2250/5240
2500/5240
2750/5240
3000/5240
3250/5240
3500/5240
3750/5240
4000/5240
4250/5240
4500/5240
4750/5240
5000/5240


#### Examples of errors

<h4>Example of a FP</h4>
<ul>
    <li><i>Items share words "Plain" and "Fax/Copier" with name_degree 4</i></li>
    <li><i>Model IDs are very different: "KX-FP145" vs "KXFG2451"</i></li>
</ul>
<img src="../images/3-example-train-fp.jpg">
<hr>
<h4>Example of FN</h4>
<ul><li><i>Model ID is different due to a separator "RDRVX560" vs "RDR-VX560"</i></li></ul>
<img src="../images/4.1-example-train-fn.jpg">
<h4>Example of FN</h4>
<ul>
    <li><i>Word matching is still case-sensitive! "Black" is not matched to "BLACK"</i></li>
    <li><i>Model IDs are written differently: "IH9BR" vs "IH9B6R"</i></li>
    </ul>
<img src="../images/4.2-example-train-fn.jpg">

### Add preprocessed Words

* Keep the original (:Word) nodes, but add lower-cased (:WordLower) nodes to see if performance increases

In [84]:
%%time
# Wall time: 8.59 s

# For every Word node: get or create the WordLower node holding its lower-cased value
# and link the Word to it with TO_LOWER. MERGE creates nothing when the node or
# relationship already exists.
query = "MATCH (w:Word) MERGE (ww:WordLower {value: toLower(w.value)}) MERGE (w)-[:TO_LOWER]->(ww)"
graph.query(query)

CPU times: user 2.75 ms, sys: 1.74 ms, total: 4.49 ms
Wall time: 8.59 s


[]

In [85]:
# Connect every node that has a word directly to the lower-cased version of that word,
# so shared words can be found case-insensitively through HAS_WORD_LOWER.
# Returns the number of distinct HAS_WORD relationships that were followed.
query = """

MATCH (n)-[r:HAS_WORD]->(w:Word)-[:TO_LOWER]->(ww:WordLower)
MERGE (n)-[:HAS_WORD_LOWER]->(ww)
RETURN count(distinct r)"""

graph.query(query)

[{'count(distinct r)': 60007}]

In [89]:
%%time

# name_degree: number of Name nodes connected to the lower-cased word.
# Only WordLower nodes that are connected to at least one Name are matched, so the
# property is not set on words that occur in no name.
query = "MATCH p=(ww:WordLower)<-[:HAS_WORD_LOWER]-(n:Name) WITH ww, count(n) AS name_degree SET ww.name_degree = name_degree;"
graph.query(query)

# description_degree: same count, taken over Description nodes
query = "MATCH p=(ww:WordLower)<-[:HAS_WORD_LOWER]-(n:Description) WITH ww, count(n) AS description_degree SET ww.description_degree = description_degree;"
graph.query(query)

CPU times: user 4.69 ms, sys: 3.08 ms, total: 7.77 ms
Wall time: 122 ms


[]

In [90]:
# Caption for WordLower nodes: "<word> (<name_degree>)".
# name_degree is only set on words that occur in at least one name. Concatenating a
# string with null yields null in Cypher, and SET ... = null leaves the property unset,
# so words without a name_degree would get no caption at all. coalesce() falls back
# to 0 for them, and toString() makes the integer-to-string conversion explicit.
query = "MATCH (ww:WordLower) SET ww.display_value = ww.value + ' (' + toString(coalesce(ww.name_degree, 0)) + ')' "
graph.query(query)

[]

<h4>Example of a FN with shared preprocessed words </h4>
<ul>
    <li><i>Left: Model ID can now be matched, this will correct the error</i></li>
    <li><i>Right: Very generic word, preprocessing will not correct the error</i></li>
</ul>
<img src="../images/4.4-example-train-fn-lowercase.jpg">


<h4>Example of a TN with shared preprocessed words </h4>
<ul>
    <li><i>The model claims this is TN because no rare words are shared (rarity: name_degree < 6)</i></li>
    <li><i>However, after pre-processing these items share the word "6-disc" which is only shared between these items (name_degree = 2)</i></li>
    <li><i>As these items should not be matched, this will result in a FP if you rerun the model with preprocessed words</i></li> 
</ul>
<img src="../images/5.example-train-tn-lowercase.jpg">


