# Entity Matching using Neo4j - Crucial Differences

_Salomon Tetelepta, May 8th 2024_
* Explore how to model crucial differences
* Match: What you share is __specific__, what differs is __common__
    * _e.g. same model identifier, differs by a common separator_
* No-Match: What you share is __common__, what differs is __specific__
    * _e.g. same manufacturer, different model identifier_
* No-Match: What you share is __specific__, what differs is a __crucial difference__
    * _e.g. numerical difference, v12 vs v13_

<img src="../images/8-crucial-differences-principle.jpg" width="600">

### Install dependencies

In [2]:
!pip install neo4j python-dotenv langchain-community --quiet

%load_ext watermark
%watermark -p neo4j

neo4j: 5.17.0



### Imports

In [3]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
from langchain_community.graphs import Neo4jGraph
from pathlib import Path
from sklearn.manifold import TSNE
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from wordfreq import lossy_tokenize, tokenize, word_frequency

import json
import matplotlib.pyplot as plt
import neo4j
import numpy as np

import os
import pandas as pd
import pickle
import re

### Settings

In [4]:
# Project layout: the notebook is expected to run from a sub-folder of the project root,
# so the root is the parent of the current working directory.
project_path = Path.cwd().parent
data_path = project_path.joinpath("data")
output_path = project_path.joinpath("output")

# Name of the Neo4j database used throughout the notebook
database = "abt-buy"

# Read environment settings from the project's .env file
load_dotenv(dotenv_path=project_path / ".env")

# Reproducibility: fix NumPy's global random seed
np.random.seed(42)

### Helper functions

In [119]:
def longest_common_substring(str1, str2):
    """Return the longest contiguous run of characters present in both strings.

    Uses the classic dynamic-programming table in which cell (i, j) holds the
    length of the longest common suffix of ``str1[:i]`` and ``str2[:j]``.

    Args:
        str1: First string; the returned substring is sliced out of this one.
        str2: Second string.

    Returns:
        The longest common substring, or ``""`` when the strings share no
        character. When several candidates have the same length, the one that
        is reached first while scanning ``str1`` left to right is returned.
    """
    # source gpt 3.5

    # One extra row/column of zeros so that the (i - 1, j - 1) lookup is always valid.
    n_rows, n_cols = len(str1) + 1, len(str2) + 1
    suffix_len = [[0] * n_cols for _ in range(n_rows)]

    best_len = 0  # length of the best run seen so far
    best_end = 0  # exclusive end index of that run in str1

    for i, char1 in enumerate(str1, start=1):
        for j, char2 in enumerate(str2, start=1):
            if char1 != char2:
                # Mismatch: the common suffix is empty, the cell keeps its initial 0.
                continue

            # Extend the common suffix that ended on the previous character pair.
            run = suffix_len[i - 1][j - 1] + 1
            suffix_len[i][j] = run

            # Strict comparison keeps the earliest run among equally long ones.
            if run > best_len:
                best_len, best_end = run, i

    return str1[best_end - best_len:best_end]

def get_differences_and_overlap_recursive(str1, str2, sep):
    """Recursively split two strings into shared parts and differing parts.

    The longest common substring is located first; the text to its left and the
    text to its right (in both strings) are then processed the same way.

    Args:
        str1: First string.
        str2: Second string.
        sep: Marker written after every shared part in ``overlap_str``. It must
            not occur in the inputs, otherwise the caller cannot split the
            shared parts apart again.

    Returns:
        A dict with two keys:
            ``"differences"``: list of the non-empty fragments that are not
                shared, left-hand side fragments first.
            ``"overlap_str"``: the shared parts, each followed by ``sep``. The
                part found at the current level comes first, followed by the
                parts from the left remainder and then from the right remainder
                (so the parts are not in positional order). Empty when nothing
                is shared.
    """
    # source: gpt 3.5

    shared = longest_common_substring(str1, str2)

    if not shared:
        # Base case: nothing in common, so both inputs are differences in their entirety.
        return {
            "differences": [d for d in (str1, str2) if d != ""],
            "overlap_str": "",
        }

    # Split each string around the first occurrence of the shared part.
    str1_before, _, str1_after = str1.partition(shared)
    str2_before, _, str2_after = str2.partition(shared)

    # Recurse into what is left on either side of the shared part.
    left = get_differences_and_overlap_recursive(str1_before, str2_before, sep=sep)
    right = get_differences_and_overlap_recursive(str1_after, str2_after, sep=sep)

    differences = left["differences"] + right["differences"]

    # Use the caller-supplied separator (previously a hard-coded "<SEP!>", which made
    # any other `sep` value produce an overlap string the caller could not split).
    overlap_str = shared + sep + left["overlap_str"] + right["overlap_str"]

    return {
        "differences": [d for d in differences if d != ""],
        "overlap_str": overlap_str,
    }


def get_differences_and_overlap(str1, str2, sep="<SEP!>"):
    """Compare two strings and list what they share and where they differ.

    Thin wrapper around ``get_differences_and_overlap_recursive`` that turns the
    separator-joined overlap string into a list.

    Args:
        str1: First string.
        str2: Second string.
        sep: Marker used internally to join the shared parts.

    Returns:
        A dict with ``"differences"`` (list of non-shared fragments) and
        ``"overlap"`` (list of non-empty shared parts).
    """
    # slight adaptation to gpt 3.5 code

    comparison = get_differences_and_overlap_recursive(str1, str2, sep=sep)

    # Swap the joined string for a list of its non-empty pieces.
    joined_overlap = comparison.pop('overlap_str')
    comparison['overlap'] = [piece for piece in joined_overlap.split(sep) if piece != ""]
    return comparison

# # Example usage
# str1 = "ps-lx350h"
# str2 = "pslx350h"
# result = get_differences_and_overlap(str1, str2)

# print("Difference:", result["differences"])
# print("Overlap:", result["overlap"])

In [166]:
def has_numbers(s):
    """Return True when ``s`` contains at least one digit, False otherwise."""
    first_digit = re.search(r'\d', s)
    return first_digit is not None

### 1. Load Data

In [120]:
# Files available in the record descriptions folder
os.listdir(data_path.joinpath('abt-buy', 'record_descriptions'))

['2_buy.csv', '1_abt.csv']

In [121]:
# Product records of both sources (the abt file is read with the 'unicode_escape' codec)
df_abt = pd.read_csv(
    data_path.joinpath('abt-buy', 'record_descriptions', '1_abt.csv'),
    encoding='unicode_escape',
)
df_buy = pd.read_csv(data_path.joinpath('abt-buy', 'record_descriptions', '2_buy.csv'))

# Labelled record pairs: train, validation and test split
df_train = pd.read_csv(data_path.joinpath('abt-buy', 'gs_train.csv'))
df_val = pd.read_csv(data_path.joinpath('abt-buy', 'gs_val.csv'))
df_test = pd.read_csv(data_path.joinpath('abt-buy', 'gs_test.csv'))

# Attach the abt record (via source_id) and then the buy record (via target_id) to every
# training pair; right joins keep one row per labelled pair.
df_abt_merged = pd.merge(
    df_abt,
    df_train,
    left_on='subject_id',
    right_on='source_id',
    how='right',
)
df_train_merged = pd.merge(
    df_buy,
    df_abt_merged,
    left_on='subject_id',
    right_on='target_id',
    how='right',
)

df_train_merged.head(3)

Unnamed: 0,subject_id_x,name_x,description_x,manufacturer,price_x,subject_id_y,name_y,description_y,price_y,source_id,target_id,matching
0,207390654,Sony Handycam HDR-SR10 High Definition Digital...,16:9 - 2.7' Hybrid LCD,Sony,549.0,33161,Sony High Definition HDV Handycam Camcorder - ...,Sony High Definition HDV Handycam Camcorder - ...,,33161,207390654,False
1,208085180,Pioneer DEH-2000MP Car Audio Player,"CD-RW - CD-Text, MP3, WMA, WAV - LCD - 4 - 200...",Pioneer,84.0,36258,D-Link Broadband Cable Modem - DCM202,D-Link Broadband Cable Modem - DCM202/ DOCSIS ...,79.0,36258,208085180,False
2,90125786,Sanus Wall/Ceiling Speaker Mount - WMS3S SILVER,Plastic - 8 lb,Sanus,,17417,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,39.99,17417,90125786,False


### Connect to Neo4j

In [122]:
# Connect to Neo4j. The connection URL and credentials are read from the environment
# variables NEO4J_URL, NEO4J_USER and NEO4J_PASS; os.getenv yields None for a variable
# that is not set.
graph = Neo4jGraph(
    url=os.getenv('NEO4J_URL'),
    username=os.getenv('NEO4J_USER'),
    password=os.getenv('NEO4J_PASS')
)

# Create the target database if it does not exist yet.
# NOTE(review): `_database` is a private attribute of Neo4jGraph; presumably it selects
# the database that subsequent `graph.query` calls run against -- confirm against the
# installed langchain_community version.
graph._database = "system"
query = f"CREATE DATABASE `{database}` IF NOT EXISTS"
graph.query(query)

# Switch from the system database to the target database and show which one is selected
graph._database = database
print("database:", graph._database)

# Sanity check: number of nodes in the graph (displayed as the cell output)
graph.query("MATCH (n) RETURN count(n)")

database: abt-buy


[{'count(n)': 612119}]

### Determine overlap and difference for words sharing rare ngrams

Examples:
* `kx-tga820b` vs `kxtga820b` -> overlap: [`tga820b`, `kx`] (longest shared part first), differences: [`-`]
* `rxv663bl` vs `rxv663bk` -> overlap: [`rxv663b`], differences: [`l`, `k`]

#### Get pairs that share large ngrams - get overlap and differences

In [102]:
%%time
# Wall time: 1.91 s

query = "MATCH p=(i1:Item)-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_6GRAM]->(:NoVocab)<-[:HAS_6GRAM]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(:Name)<-[:HAS_NAME]-(i2:Item)-[:NO_MATCH]-(i1) RETURN DISTINCT w1.value AS w1, w2.value AS w2, i1"
results = graph.query(query)

df_results = pd.DataFrame(results)
df_diff_overlap = pd.DataFrame(results).apply(lambda x: pd.Series(get_differences_and_overlap(x[0], x[1])), axis=1)
df_results_no_match = pd.concat([df_results, df_diff_overlap], axis=1)
df_results_no_match['is_match'] = 0

CPU times: user 90.1 ms, sys: 4.99 ms, total: 95.1 ms
Wall time: 185 ms


In [103]:
%%time
# Wall time: 1.91 s

query = "MATCH p=(i1:Item)-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_6GRAM]->(:NoVocab)<-[:HAS_6GRAM]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(:Name)<-[:HAS_NAME]-(i2:Item)-[:IS_MATCH]-(i1) RETURN DISTINCT w1.value AS w1, w2.value AS w2, i1"
results = graph.query(query)

df_results = pd.DataFrame(results)
df_diff_overlap = pd.DataFrame(results).apply(lambda x: pd.Series(get_differences_and_overlap(x[0], x[1])), axis=1)
df_results_is_match = pd.concat([df_results, df_diff_overlap], axis=1)
df_results_is_match['is_match'] = 1

CPU times: user 108 ms, sys: 5.39 ms, total: 113 ms
Wall time: 138 ms


In [124]:
# Stack the matching pairs on top of the non-matching pairs; each part keeps its own
# row labels, so labels repeat across the two parts.
df_results = pd.concat((df_results_is_match, df_results_no_match))
df_results

Unnamed: 0,w1,w2,i1,differences,overlap,is_match
0,kx-tga820b,kxtga820b,"{'subject_id': 206179521, 'source': 'buy'}",[-],"[tga820b, kx]",1
1,kxtga820b,kx-tga820b,"{'subject_id': 32986, 'source': 'abt'}",[-],"[tga820b, kx]",1
2,rxv663bl,rxv663bk,"{'subject_id': 207667994, 'source': 'buy'}","[l, k]",[rxv663b],1
3,rxv663bk,rxv663bl,"{'subject_id': 33452, 'source': 'abt'}","[k, l]",[rxv663b],1
4,th-46pz80u,th46pz80u,"{'subject_id': 207388759, 'source': 'buy'}",[-],"[46pz80u, th]",1
...,...,...,...,...,...,...
333,28-135mm,w/28-135mm,"{'subject_id': 30962, 'source': 'abt'}",[w/],[28-135mm],0
334,motorokr,motorola,"{'subject_id': 33804, 'source': 'abt'}","[kr, la]",[motoro],0
335,icfc1ipmk2wht,icfc1ipmk2blk,"{'subject_id': 34638, 'source': 'abt'}","[wht, blk]",[icfc1ipmk2],0
336,icfc1ipmk2blk,icfc1ipmk2whi,"{'subject_id': 34637, 'source': 'abt'}","[blk, whi]",[icfc1ipmk2],0


#### Probability of match given a difference string

In [200]:
# Glue the difference fragments of every pair into one string, used as the grouping key
df_results['differences_str'] = ["".join(parts) for parts in df_results['differences']]

# Count how often each difference string occurs among matches and among non-matches
cond_match = df_results['is_match'].eq(1)
df_diff_is_match_vc = df_results.loc[cond_match, 'differences_str'].value_counts().to_frame()
df_diff_no_match_vc = df_results.loc[~cond_match, 'differences_str'].value_counts().to_frame()

In [203]:
# Outer-join the two count tables on the difference string. The count columns are named
# explicitly rather than renamed from merge suffixes: the name of the column produced by
# `value_counts().to_frame()` depends on the pandas version, so a rename keyed on
# 'differences_str_x' / 'differences_str_y' silently does nothing on newer versions.
df_differences = (
    df_diff_is_match_vc.set_axis(['is_match'], axis=1)
    .merge(
        df_diff_no_match_vc.set_axis(['no_match'], axis=1),
        left_index=True,
        right_index=True,
        how='outer',
    )
    .fillna(0)  # a difference string missing on one side was seen 0 times there
)

# Share of matches / non-matches per difference string
df_differences['sum'] = df_differences[['is_match', 'no_match']].sum(axis=1)
df_differences['p_is_match'] = df_differences['is_match'] / df_differences['sum']
df_differences['p_no_match'] = df_differences['no_match'] / df_differences['sum']

# Most frequent match differences first; the difference string moves from the index
# into a 'value' column.
df_differences = df_differences.sort_values('is_match', ascending=False).reset_index(names='value')
df_differences

Unnamed: 0,value,is_match,no_match,sum,p_is_match,p_no_match
0,-,180.0,2.0,182.0,0.989011,0.010989
1,/,66.0,0.0,66.0,1.000000,0.000000
2,h,24.0,2.0,26.0,0.923077,0.076923
3,s,20.0,11.0,31.0,0.645161,0.354839
4,na,10.0,0.0,10.0,1.000000,0.000000
...,...,...,...,...,...,...
272,823979,0.0,1.0,1.0,0.000000,1.000000
273,8261,0.0,1.0,1.0,0.000000,1.000000
274,8263bs,0.0,1.0,1.0,0.000000,1.000000
275,8267,0.0,1.0,1.0,0.000000,1.000000


#### Mean probability of matches given a numeric value in the differentiating string

In [206]:
# Flag the difference strings that contain at least one digit
df_differences['has_numbers'] = df_differences['value'].apply(has_numbers)

# Unweighted mean of p_is_match over the distinct difference strings, with and without
# digits. (A filtered and sorted view that was computed here before was never assigned
# or displayed, so it has been dropped.)
df_differences.groupby('has_numbers')['p_is_match'].mean().to_frame()

Unnamed: 0_level_0,p_is_match
has_numbers,Unnamed: 1_level_1
False,0.440746
True,0.118644
