# Entity Matching using Neo4j - Voting

_Salomon Tetelepta, May 14th 2024_

* There might be different and conflicting indications that a pair is a match or not. 
* This notebook explores how we can model the different "rules" as votes that together result in a final classification. 

### Install dependencies

In [1]:
# autoreload mode 2: re-import all modules before running each cell, so edits to
# local helper modules take effect without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
!pip install neo4j python-dotenv langchain-community --quiet

%load_ext watermark
%watermark -p neo4j

neo4j: 5.17.0



### Imports

In [20]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
from langchain_community.graphs import Neo4jGraph
from pathlib import Path
from sklearn.manifold import TSNE
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from wordfreq import lossy_tokenize, tokenize, word_frequency

import json
import matplotlib.pyplot as plt
import neo4j
import numpy as np

import os
import pandas as pd
import pickle
import re

from utils import get_model_results

### Settings

In [11]:
# locations: the project root is taken to be the parent of the current working directory
project_path = Path.cwd().parent
data_path = project_path / "data"
output_path = project_path / "output"

# name of the Neo4j database this notebook works on
database = "abt-buy"

# read environment variables from the project's .env file
load_dotenv(project_path / ".env")

# fixed seed for reproducibility
np.random.seed(42)

### Helper functions

### 1. Load Data

In [12]:
# show which record files are available for the abt-buy dataset
records_dir = data_path / 'abt-buy' / 'record_descriptions'
os.listdir(records_dir)

['2_buy.csv', '1_abt.csv']

In [13]:
# folder layout of the abt-buy dataset
abt_buy_dir = data_path / 'abt-buy'
records_dir = abt_buy_dir / 'record_descriptions'

# product records of both sources (the abt file is decoded with 'unicode_escape')
df_abt = pd.read_csv(records_dir / '1_abt.csv', encoding='unicode_escape')
df_buy = pd.read_csv(records_dir / '2_buy.csv')

# labelled pairs (source_id, target_id, matching): train, validation and test split
df_train = pd.read_csv(abt_buy_dir / 'gs_train.csv')
df_val = pd.read_csv(abt_buy_dir / 'gs_val.csv')
df_test = pd.read_csv(abt_buy_dir / 'gs_test.csv')

# enrich every training pair with its abt record (joined on source_id) ...
df_abt_merged = df_abt.merge(df_train, left_on='subject_id', right_on='source_id', how='right')
# ... and with its buy record (joined on target_id); how='right' keeps all training pairs
df_train_merged = df_buy.merge(df_abt_merged, left_on='subject_id', right_on='target_id', how='right')

df_train_merged.head(3)

Unnamed: 0,subject_id_x,name_x,description_x,manufacturer,price_x,subject_id_y,name_y,description_y,price_y,source_id,target_id,matching
0,207390654,Sony Handycam HDR-SR10 High Definition Digital...,16:9 - 2.7' Hybrid LCD,Sony,549.0,33161,Sony High Definition HDV Handycam Camcorder - ...,Sony High Definition HDV Handycam Camcorder - ...,,33161,207390654,False
1,208085180,Pioneer DEH-2000MP Car Audio Player,"CD-RW - CD-Text, MP3, WMA, WAV - LCD - 4 - 200...",Pioneer,84.0,36258,D-Link Broadband Cable Modem - DCM202,D-Link Broadband Cable Modem - DCM202/ DOCSIS ...,79.0,36258,208085180,False
2,90125786,Sanus Wall/Ceiling Speaker Mount - WMS3S SILVER,Plastic - 8 lb,Sanus,,17417,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,39.99,17417,90125786,False


### Connect to Neo4j

In [43]:
# connect to Neo4j; the connection settings are read from the environment
# variables NEO4J_URL, NEO4J_USER and NEO4J_PASS
graph = Neo4jGraph(
    url=os.getenv('NEO4J_URL'),
    username=os.getenv('NEO4J_USER'),
    password=os.getenv('NEO4J_PASS')
)

# create the target database if it does not exist yet
# NOTE(review): `_database` is an underscore-prefixed attribute of Neo4jGraph;
# presumably it selects the database that graph.query() runs against, which is why
# it is pointed at "system" before issuing CREATE DATABASE - confirm against the
# installed langchain-community version.
graph._database = "system"
query = f"CREATE DATABASE `{database}` IF NOT EXISTS"
graph.query(query)

# change to target database
graph._database = database
print("database:", graph._database)

# sanity check: number of nodes currently in the graph
graph.query("MATCH (n) RETURN count(n)")

database: abt-buy


[{'count(n)': 614069}]

### Load leaderboard

In [44]:
# results of earlier experiments; new model results are added to this leaderboard
results_csv = output_path / 'results.csv'
df_results_all = pd.read_csv(results_csv)
df_results_all.head()

Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.91111,0.80921,0.85714,testset,123,12,589,29
1,shared_word_lower_common_difference,6,0.90511,0.81046,0.85517,testset,124,13,588,29
2,shared_word_lower_common_no_numerical_difference,6,0.88356,0.81646,0.84868,testset,129,17,584,29
3,shared_word_lower_threshold_6gram,"(6, 47)",0.80198,0.84375,0.822335,testset,162,40,30,574
4,shared_word_lower_threshold_5gram,"(6, 47)",0.727273,0.880383,0.796537,testset,184,69,25,559


In [49]:
# upper bound (exclusive) on `name_degree` for a shared word to count as a vote (rule 1)
threshold_low = 6

# Remove the vote relationships of earlier runs so the rules below start from a clean graph.
vote_relationship_types = [
    "VOTE_MATCH_RULE1",
    "VOTE_MATCH_RULE2",
    "VOTE_MATCH_RULE3",
    "VOTE_NO_MATCH_RULE4",
    "VOTE_NO_MATCH_RULE5",
]
queries = [f"MATCH p=()-[r:{rel_type}]->() DELETE r" for rel_type in vote_relationship_types]
for query in queries:
    graph.query(query)

# Every rule is a Cypher query that MERGEs a vote relationship between an abt item and
# a buy item and returns the ids of the voted pairs.
models = [
    {
        # rule 1 (match vote): both names share a lower-cased word with name_degree < threshold_low
        'model_name': 'vote-shared_word_threshold',
        'threshold': threshold_low,
        'query': f"""
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WHERE n1 <> n2
            AND w.name_degree < {threshold_low}
            MERGE (i1)-[:VOTE_MATCH_RULE1 {{ rule: 1, match: 1, shared_word: w.value, name_degree: w.name_degree}}]->(i2)
            RETURN i1.subject_id, i2.subject_id
        """
    },
    {
        # rule 2 (match vote): the names contain words connected through a Difference
        # whose DifferenceValue has p_is_match > 0.5
        'model_name': 'vote-common_difference',
        'threshold': threshold_low,
        'query': f"""
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match > 0.5
            MERGE (i1)-[:VOTE_MATCH_RULE2 {{ rule: 2, match: 1, sorted_words: d.sorted_words, difference: d.value, p_is_match: dv.p_is_match}}]->(i2)            
            RETURN i1.subject_id, i2.subject_id
            """
    },
    {
        # rule 3 (match vote): as rule 2, but only for differences without has_numbers = 1
        'model_name': 'vote-no_numerical_difference',
        'threshold': threshold_low,
        'query': f"""
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match > 0.5
            AND NOT dv.has_numbers = 1
            MERGE (i1)-[:VOTE_MATCH_RULE3 {{ rule: 3, match: 1, sorted_words: d.sorted_words, difference: d.value, p_is_match: dv.p_is_match, has_numbers: dv.has_numbers}}]->(i2)   
            RETURN i1.subject_id, i2.subject_id
            """
    },
    {
        # rule 4 (NO-match vote): a difference with has_numbers = 1 and p_is_match < 0.2
        # NOTE(review): this rule votes *against* a match but is scored below exactly like
        # the match rules; presumably get_model_results treats the returned pairs as
        # predicted matches, so its precision/recall are not comparable - confirm.
        'model_name': 'vote-numerical_difference',
        'threshold': threshold_low,
        'query': f"""
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match < 0.2
            AND dv.has_numbers = 1
            MERGE (i1)-[:VOTE_NO_MATCH_RULE4 {{ rule: 4, match: 0, sorted_words: d.sorted_words, difference: d.value, p_is_match: dv.p_is_match, has_numbers: dv.has_numbers}}]->(i2)   
            RETURN i1.subject_id, i2.subject_id
            """
    }
]

# Evaluate every rule on the training and the test pairs and collect the result rows.
new_results = []
for model in models:
    print(model['model_name'])
    print(model['query'])

    df_results_train, _ = get_model_results(
                      graph,
                      df_train,
                      query=model['query'],
                      model_name=model['model_name'],
                      threshold=model['threshold'],
                      evaluated_on="trainingset")

    df_results_test, _ = get_model_results(
                      graph,
                      df_test,
                      query=model['query'],
                      model_name=model['model_name'],
                      threshold=model['threshold'],
                      evaluated_on="testset")

    display(df_results_train)
    display(df_results_test)
    new_results.extend([df_results_test, df_results_train])

# Add all new rows to the leaderboard in one go. Duplicates are dropped *before* the
# index is reset, so the resulting index is contiguous even when rows are removed.
df_results_all = (
    pd.concat([*new_results, df_results_all])
    .sort_values(['evaluated_on', 'fscore'], ascending=[True, False])
    .drop_duplicates()
    .reset_index(drop=True)
)

vote-shared_word_threshold

            MATCH p1=(i1:Item {source: 'abt'})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {source: 'buy'})
            WHERE n1 <> n2
            AND w.name_degree < 6
            MERGE (i1)-[:VOTE_MATCH_RULE1 { rule: 1, match: 1, shared_word: w.value, name_degree: w.name_degree}]->(i2)
            RETURN i1.subject_id, i2.subject_id
        


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-shared_word_threshold,6,0.83656,0.70727,0.7665,trainingset,691,135,4142,286


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-shared_word_threshold,6,0.89623,0.67376,0.76923,testset,95,11,590,46


vote-common_difference

            MATCH p1=(i1:Item {source: 'abt'})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {source: 'buy'})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match > 0.5
            MERGE (i1)-[:VOTE_MATCH_RULE2 { rule: 2, match: 1, sorted_words: d.sorted_words, difference: d.value, p_is_match: dv.p_is_match}]->(i2)            
            RETURN i1.subject_id, i2.subject_id
            


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-common_difference,6,0.94242,0.69342,0.79897,trainingset,622,38,4210,275


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-common_difference,6,0.85366,0.30973,0.45455,testset,35,6,595,78


vote-no_numerical_difference

            MATCH p1=(i1:Item {source: 'abt'})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {source: 'buy'})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match > 0.5
            AND NOT dv.has_numbers = 1
            MERGE (i1)-[:VOTE_MATCH_RULE3 { rule: 3, match: 1, sorted_words: d.sorted_words, difference: d.value, p_is_match: dv.p_is_match, has_numbers: dv.has_numbers}]->(i2)   
            RETURN i1.subject_id, i2.subject_id
            


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-no_numerical_difference,6,0.93729,0.64527,0.76434,trainingset,553,37,4211,304


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-no_numerical_difference,6,0.89474,0.30088,0.45033,testset,34,4,597,79


vote-numerical_difference

            MATCH p1=(i1:Item {source: 'abt'})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {source: 'buy'})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match < 0.2
            AND dv.has_numbers = 1
            MERGE (i1)-[:VOTE_NO_MATCH_RULE4 { rule: 4, match: 0, sorted_words: d.sorted_words, difference: d.value, p_is_match: dv.p_is_match, has_numbers: dv.has_numbers}]->(i2)   
            RETURN i1.subject_id, i2.subject_id
            


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-numerical_difference,6,0.00306,0.00524,0.00386,trainingset,4,1305,3395,760


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,vote-numerical_difference,6,0.08333,0.03604,0.05031,testset,4,44,566,107


_Example of a pair with multiple, conflicting votes_
* Vote match rule 1: match because shared word `hi-fi` has a name_degree of 3
* Vote match rule 2: match because the word pair `sennheiser` / `senheisser` often results in a match (63.2%)
* Vote match rule 3: match because difference is common (prob match: 0.632)
* Vote match rule 4: no match because difference [2,3] is never a True match
<img src="../images/11.1-example-voting.jpg?2" width="600">