# Entity Matching using Neo4j - Crucial Differences

_Salomon Tetelepta, May 8th 2024_
* Last change: May 10th 2024
* Explore how to model crucial differences
* Match: What you share is __specific__, what differs is __common__
    * _ie. same model identifier, differs by a common separator_
* No-Match: What you share is __common__, what differs is __specific__
    * _ie. same manufacturer, different model identifier_
* No-Match: What you share is __specific__, what differs is a __crucial difference__
    * _ie. numerical difference, v12 vs v13_

<img src="../images/8-crucial-differences-principle.jpg" width="600">

### Install dependencies

In [2]:
!pip install neo4j python-dotenv langchain-community --quiet

%load_ext watermark
%watermark -p neo4j

neo4j: 5.17.0



### Imports

In [40]:
from dotenv import load_dotenv, find_dotenv, dotenv_values
from langchain_community.graphs import Neo4jGraph
from pathlib import Path
from sklearn.manifold import TSNE
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from wordfreq import lossy_tokenize, tokenize, word_frequency

import json
import matplotlib.pyplot as plt
import neo4j
import numpy as np

import os
import pandas as pd
import pickle
import re

from utils import get_model_results

### Settings

In [4]:
# --- paths: everything is resolved relative to the parent of the working directory ---
project_path = Path.cwd().parent
data_path, output_path = project_path / "data", project_path / "output"

# name of the Neo4j database this notebook switches to after connecting
database = "abt-buy"

# read environment variables from the project's .env file
load_dotenv(dotenv_path=project_path / ".env")

# reproducibility: fix numpy's global random seed
np.random.seed(42)

### Helper functions

In [5]:
def longest_common_substring(str1, str2):
    """Return the longest contiguous run of characters shared by both inputs.

    Uses the classic dynamic-programming recurrence on common-suffix lengths,
    keeping only the previous and the current row of the table.

    Args:
        str1: first string; the result is sliced out of this one.
        str2: second string.

    Returns:
        The longest common substring, or "" when nothing is shared. When several
        substrings tie for the maximum length, the one that ends first in
        ``str1`` wins (only a strictly longer run replaces the current best).
    """
    width = len(str2) + 1

    best_len = 0   # length of the best run found so far
    best_end = 0   # exclusive end index of that run inside str1

    # previous_row[j]: length of the common suffix of str1[:i-1] and str2[:j]
    previous_row = [0] * width
    for i, char1 in enumerate(str1, start=1):
        current_row = [0] * width
        for j, char2 in enumerate(str2, start=1):
            if char1 != char2:
                continue  # a mismatch resets the suffix length to 0
            run = previous_row[j - 1] + 1
            current_row[j] = run
            if run > best_len:
                best_len, best_end = run, i
        previous_row = current_row

    return str1[best_end - best_len:best_end]

def get_differences_and_overlap_recursive(str1, str2, sep):
    """Recursively split two strings into shared and non-shared fragments.

    The longest common substring is taken as an overlap; the parts of both
    strings before and after it are then processed the same way.

    Args:
        str1: first string.
        str2: second string.
        sep: marker appended after every overlap fragment in ``overlap_str``.
            It should not occur in the inputs, because
            ``get_differences_and_overlap`` splits ``overlap_str`` on it.

    Returns:
        dict with
        * ``differences``: non-empty fragments that are not shared, ordered
          left part first, right part second (str1 fragment before str2
          fragment at each level),
        * ``overlap_str``: the shared fragments, each followed by ``sep``;
          the longest fragment comes first, then those found to its left,
          then those found to its right. Empty string when nothing is shared.
    """
    # Find the longest common substring
    overlap = longest_common_substring(str1, str2)

    if not overlap:
        # Nothing shared: both inputs are differences in their entirety
        return {
            "differences": [d for d in [str1, str2] if d != ""],
            "overlap_str": overlap,
        }

    # Position of the (first occurrence of the) overlap in both strings
    overlap_index1 = str1.find(overlap)
    overlap_index2 = str2.find(overlap)
    overlap_end1 = overlap_index1 + len(overlap)
    overlap_end2 = overlap_index2 + len(overlap)

    # Recurse on what is left of and right of the overlap
    differences_before = get_differences_and_overlap_recursive(
        str1[:overlap_index1], str2[:overlap_index2], sep=sep)
    differences_after = get_differences_and_overlap_recursive(
        str1[overlap_end1:], str2[overlap_end2:], sep=sep)

    differences = differences_before["differences"] + differences_after["differences"]
    # Use the caller-supplied separator (it used to be hard-coded to "<SEP!>",
    # which made any other `sep` value unusable for splitting afterwards)
    overlap_str = overlap + sep + differences_before["overlap_str"] + differences_after["overlap_str"]

    return {
        "differences": [d for d in differences if d != ""],
        "overlap_str": overlap_str,
    }


def get_differences_and_overlap(str1, str2, sep="<SEP!>"):
    """Compare two strings and report what they share and where they differ.

    Thin wrapper around ``get_differences_and_overlap_recursive`` that turns the
    separator-joined overlap string into a list.

    Args:
        str1: first string.
        str2: second string.
        sep: separator handed to the recursive helper and used here to split
            its ``overlap_str``.

    Returns:
        dict with keys ``differences`` (list of non-shared fragments) and
        ``overlap`` (list of shared fragments, empty pieces removed).
    """
    result = get_differences_and_overlap_recursive(str1, str2, sep=sep)

    # swap the joined string for a list of its non-empty pieces
    joined_overlap = result.pop('overlap_str')
    result['overlap'] = [piece for piece in joined_overlap.split(sep) if piece]
    return result

# # Example usage
# str1 = "ps-lx350h"
# str2 = "pslx350h"
# result = get_differences_and_overlap(str1, str2)

# print("Difference:", result["differences"])
# print("Overlap:", result["overlap"])

In [6]:
def has_numbers(s):
    """Tell whether the string ``s`` contains at least one digit (regex ``\\d``)."""
    first_digit = re.search(r'\d', s)
    return first_digit is not None

### 1. Load Data

In [8]:
# show which files are present in the abt-buy record_descriptions folder
os.listdir(data_path / 'abt-buy' / 'record_descriptions')

['2_buy.csv', '1_abt.csv']

In [9]:
# abt and buy records
df_abt = pd.read_csv(data_path / 'abt-buy' / 'record_descriptions' / '1_abt.csv', encoding='unicode_escape')
df_buy = pd.read_csv(data_path / 'abt-buy' / 'record_descriptions' / '2_buy.csv')

# matches - train and validation set
df_train = pd.read_csv(data_path / 'abt-buy' / 'gs_train.csv')
df_val = pd.read_csv(data_path / 'abt-buy' / 'gs_val.csv')
df_test = pd.read_csv(data_path / 'abt-buy' / 'gs_test.csv')

# merge records with matches
df_abt_merged = df_abt.merge(df_train, left_on='subject_id', right_on='source_id', how='right')
df_train_merged = df_buy.merge(df_abt_merged, left_on='subject_id', right_on='target_id', how='right')

df_train_merged.head(3)

Unnamed: 0,subject_id_x,name_x,description_x,manufacturer,price_x,subject_id_y,name_y,description_y,price_y,source_id,target_id,matching
0,207390654,Sony Handycam HDR-SR10 High Definition Digital...,16:9 - 2.7' Hybrid LCD,Sony,549.0,33161,Sony High Definition HDV Handycam Camcorder - ...,Sony High Definition HDV Handycam Camcorder - ...,,33161,207390654,False
1,208085180,Pioneer DEH-2000MP Car Audio Player,"CD-RW - CD-Text, MP3, WMA, WAV - LCD - 4 - 200...",Pioneer,84.0,36258,D-Link Broadband Cable Modem - DCM202,D-Link Broadband Cable Modem - DCM202/ DOCSIS ...,79.0,36258,208085180,False
2,90125786,Sanus Wall/Ceiling Speaker Mount - WMS3S SILVER,Plastic - 8 lb,Sanus,,17417,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,Sanus 13' - 30' VisionMount Flat Panel TV Silv...,39.99,17417,90125786,False


### Connect to Neo4j

In [50]:
# connect to Neo4j
graph = Neo4jGraph(
    url=os.getenv('NEO4J_URL'),
    username=os.getenv('NEO4J_USER'),
    password=os.getenv('NEO4J_PASS')
)

# create database if does not exist
graph._database = "system"
query = f"CREATE DATABASE `{database}` IF NOT EXISTS"
graph.query(query)

# change to target database
graph._database = database
print("database:", graph._database)

# check nr nodes in the graph
graph.query("MATCH (n) RETURN count(n)")

database: abt-buy


[{'count(n)': 614069}]

### Determine overlap and difference for words sharing rare ngrams

Examples:
* `kx-tga820b` vs `kxtga820b` -> overlap: [`kx`, `tga820b`], differences: [`-`]	
* `rxv663bl` vs `rxv663bk` -> overlap: [`rxv663b`], differences: [`l`, `k`]

#### [TRAIN] Get pairs that share large ngrams - get overlap and differences

In [51]:
%%time
# Wall time: 1.91 s

query = "MATCH p=(i1:Item {source: 'abt'})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_3GRAM|HAS_4GRAM|HAS_5GRAM|HAS_6GRAM]->(:NoVocab)<-[:HAS_3GRAM|HAS_4GRAM|HAS_5GRAM|HAS_6GRAM]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(:Name)<-[:HAS_NAME]-(i2:Item)-[:NO_MATCH]-(i1) RETURN DISTINCT w1.value AS w1, w2.value AS w2, i1"
results = graph.query(query)

df_results = pd.DataFrame(results)
df_diff_overlap = pd.DataFrame(results).apply(lambda x: pd.Series(get_differences_and_overlap(x[0], x[1])), axis=1)
df_results_no_match = pd.concat([df_results, df_diff_overlap], axis=1)
df_results_no_match['is_match'] = 0

CPU times: user 283 ms, sys: 8.5 ms, total: 292 ms
Wall time: 474 ms


In [52]:
%%time
# Wall time: 1.91 s

query = "MATCH p=(i1:Item {source: 'abt'})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_3GRAM|HAS_4GRAM|HAS_5GRAM|HAS_6GRAM]->(:NoVocab)<-[:HAS_3GRAM|HAS_4GRAM|HAS_5GRAM|HAS_6GRAM]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(:Name)<-[:HAS_NAME]-(i2:Item)-[:IS_MATCH]-(i1) RETURN DISTINCT w1.value AS w1, w2.value AS w2, i1"
results = graph.query(query)

df_results = pd.DataFrame(results)
df_diff_overlap = pd.DataFrame(results).apply(lambda x: pd.Series(get_differences_and_overlap(x[0], x[1])), axis=1)
df_results_is_match = pd.concat([df_results, df_diff_overlap], axis=1)
df_results_is_match['is_match'] = 1

CPU times: user 122 ms, sys: 4.83 ms, total: 127 ms
Wall time: 189 ms


In [54]:
# stack the matching pairs on top of the non-matching pairs; the row labels of
# both frames are kept as they are (no re-numbering)
df_results_train = pd.concat([df_results_is_match, df_results_no_match])
df_results_train

Unnamed: 0,w1,w2,i1,differences,overlap,is_match
0,cdpce375,cdp-ce375,"{'subject_id': 5644, 'source': 'abt'}",[-],"[ce375, cdp]",1
1,161wh,161,"{'subject_id': 6284, 'source': 'abt'}",[wh],[161],1
2,tu1500rd,tu-1500rd,"{'subject_id': 6493, 'source': 'abt'}",[-],"[1500rd, tu]",1
3,mdrj10,mdrj10/blue,"{'subject_id': 7195, 'source': 'abt'}",[/blue],[mdrj10],1
4,kxts108w,kx-ts108w,"{'subject_id': 7936, 'source': 'abt'}",[-],"[ts108w, kx]",1
...,...,...,...,...,...,...
1448,910000619,dect2000,"{'subject_id': 37807, 'source': 'abt'}","[91, dect2, 0619]",[000],0
1449,hdtv,hdtv),"{'subject_id': 37934, 'source': 'abt'}",[)],[hdtv],0
1450,wga600n,gsd2400nww,"{'subject_id': 38449, 'source': 'abt'}","[w, a6, sd24, ww]","[00n, g]",0
1451,dma2100,1000,"{'subject_id': 38507, 'source': 'abt'}","[dma2, 0]",[100],0


#### [TRAIN] Probability of match given a difference string

In [55]:
# one string per pair: its difference fragments joined with ';'
df_results_train['differences_str'] = df_results_train['differences'].apply(";".join)
cond_match = df_results_train.is_match == 1

# How often does each difference string occur among matches / non-matches?
# The count column is named explicitly: the default name produced by
# value_counts() differs between pandas versions ('differences_str' vs 'count').
df_diff_is_match_vc = df_results_train.loc[cond_match, 'differences_str'].value_counts().to_frame('is_match')
df_diff_no_match_vc = df_results_train.loc[~cond_match, 'differences_str'].value_counts().to_frame('no_match')

In [56]:
# Outer-join both count tables on the difference string; a string seen in only
# one of the two groups gets a count of 0 in the other.
df_differences = df_diff_is_match_vc.merge(df_diff_no_match_vc, left_index=True, right_index=True, how='outer')
# Label the two count columns by position (left = matches, right = non-matches)
# instead of renaming 'differences_str_x' / 'differences_str_y': those names only
# exist with pandas versions where value_counts() keeps the original column name.
df_differences.columns = ['is_match', 'no_match']
df_differences = df_differences.fillna(0)

# total occurrences and the share of matches / non-matches per difference string
df_differences['sum'] = df_differences.sum(axis=1)
df_differences['p_is_match'] = df_differences['is_match'] / df_differences['sum']
df_differences['p_no_match'] = df_differences['no_match'] / df_differences['sum']
# most frequent match-differences first; the difference string becomes column 'value'
df_differences = df_differences.sort_values('is_match', ascending=False).reset_index(names='value')
df_differences

Unnamed: 0,value,is_match,no_match,sum,p_is_match,p_no_match
0,-,213.0,10.0,223.0,0.955157,0.044843
1,/,33.0,0.0,33.0,1.000000,0.000000
2,-;-,29.0,0.0,29.0,1.000000,0.000000
3,h,14.0,2.0,16.0,0.875000,0.125000
4,s,12.0,7.0,19.0,0.631579,0.368421
...,...,...,...,...,...,...
1290,5;2;r;/p,0.0,1.0,1.0,0.000000,1.000000
1291,5;3,0.0,1.0,1.0,0.000000,1.000000
1292,-;bk;wh,0.0,1.0,1.0,0.000000,1.000000
1293,5;3;v,0.0,1.0,1.0,0.000000,1.000000


#### [TRAIN] Mean probability of matches given a numeric value in the differentiating string

In [57]:
# flag difference strings that contain at least one digit
df_differences['has_numbers'] = df_differences['value'].apply(has_numbers)
# (a former intermediate expression that filtered on has_numbers was removed here:
# its result was neither stored nor displayed)

# mean match probability of difference strings with and without digits
df_differences.groupby('has_numbers')['p_is_match'].mean().to_frame()

Unnamed: 0_level_0,p_is_match
has_numbers,Unnamed: 1_level_1
False,0.414723
True,0.077057


### Add relationships between WordLower nodes to capture overlap and differences in the graph

* Using the training set we determined the probability of a match / no-match for strings that are different (modelled in the graph as `DifferenceValue` nodes)
* To get results on the benchmark, we need to apply this to all the items in the dataset
    * Model differences between lowercase words as Nodes: Difference
        * `(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)`
    * Model the value that makes the difference as DifferenceValue, and create a relationship
        * `(d:Difference)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)`
        * Attributes for Difference Values:
            * value: concatenated list of values that makes the difference
            * degree: number of pair of words that have this value as difference
            * p_is_match: the probability that a pair with this difference is a match (training set)
            * p_no_match: the probability that a pair with this difference is not a match (training set)
            * has_numbers: flag indicating whether the difference contains digits (indicative of non-matches)

#### [TEST] Get pairs that share large ngrams - get overlap and differences

In [58]:
%%time
# Wall time: 1.91 s

query = """
MATCH p=(i1:Item {source: 'abt'})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_6GRAM]->(:NoVocab)<-[:HAS_6GRAM]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(:Name)<-[:HAS_NAME]-(i2:Item) 
WHERE NOT (i1)-[:IS_MATCH|NO_MATCH]->(i2)
RETURN DISTINCT w1.value AS w1, w2.value AS w2, i1"""
results_test = graph.query(query)

df_results_test = pd.DataFrame(results_test)
df_diff_overlap_test = pd.DataFrame(results_test).apply(lambda x: pd.Series(get_differences_and_overlap(x[0], x[1])), axis=1)
df_results_test = pd.concat([df_results_test, df_diff_overlap_test], axis=1)
df_results_test['is_match'] = -1
df_results_test['differences_str'] = df_results_test['differences'].apply(";".join)

CPU times: user 271 ms, sys: 8.42 ms, total: 280 ms
Wall time: 319 ms


#### Merge differences between WordLower Items with probabilities for the differences

In [59]:
# labelled (is_match 1 / 0) and unlabelled (is_match -1) word pairs in one frame
df_results_train_test = pd.concat([df_results_train, df_results_test])

In [60]:
# inspect the combined word pairs
df_results_train_test

Unnamed: 0,w1,w2,i1,differences,overlap,is_match,differences_str
0,cdpce375,cdp-ce375,"{'subject_id': 5644, 'source': 'abt'}",[-],"[ce375, cdp]",1,-
1,161wh,161,"{'subject_id': 6284, 'source': 'abt'}",[wh],[161],1,wh
2,tu1500rd,tu-1500rd,"{'subject_id': 6493, 'source': 'abt'}",[-],"[1500rd, tu]",1,-
3,mdrj10,mdrj10/blue,"{'subject_id': 7195, 'source': 'abt'}",[/blue],[mdrj10],1,/blue
4,kxts108w,kx-ts108w,"{'subject_id': 7936, 'source': 'abt'}",[-],"[ts108w, kx]",1,-
...,...,...,...,...,...,...,...
1649,920000264,920000383,"{'subject_id': 39088, 'source': 'abt'}","[264, 383]",[920000],-1,264;383
1650,920000264,920000924,"{'subject_id': 39088, 'source': 'abt'}","[9, 6]","[920000, 2, 4]",-1,9;6
1651,981000040,981-000040,"{'subject_id': 39175, 'source': 'abt'}",[-],"[000040, 981]",-1,-
1652,wireless-g,wireless-nusb,"{'subject_id': 39179, 'source': 'abt'}","[g, nusb]",[wireless-],-1,g;nusb


In [61]:
# keep one row per word pair ...
unique_pairs = df_results_train_test.drop_duplicates(['w1', 'w2'])
# ... and attach the training-set statistics of its difference string; the per-value
# 'is_match' count is left out so it cannot clash with the pair-level 'is_match' label
difference_stats = df_differences.drop(columns='is_match')
df_results_merged = unique_pairs.merge(difference_stats, how='left', left_on='differences_str', right_on='value')
df_results_merged.drop_duplicates(['w1', 'w2'])

Unnamed: 0,w1,w2,i1,differences,overlap,is_match,differences_str,value,no_match,sum,p_is_match,p_no_match,has_numbers
0,cdpce375,cdp-ce375,"{'subject_id': 5644, 'source': 'abt'}",[-],"[ce375, cdp]",1,-,-,10.0,223.0,0.955157,0.044843,False
1,161wh,161,"{'subject_id': 6284, 'source': 'abt'}",[wh],[161],1,wh,wh,0.0,1.0,1.000000,0.000000,False
2,tu1500rd,tu-1500rd,"{'subject_id': 6493, 'source': 'abt'}",[-],"[1500rd, tu]",1,-,-,10.0,223.0,0.955157,0.044843,False
3,mdrj10,mdrj10/blue,"{'subject_id': 7195, 'source': 'abt'}",[/blue],[mdrj10],1,/blue,/blue,0.0,1.0,1.000000,0.000000,False
4,kxts108w,kx-ts108w,"{'subject_id': 7936, 'source': 'abt'}",[-],"[ts108w, kx]",1,-,-,10.0,223.0,0.955157,0.044843,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,0100057730,0100057600,"{'subject_id': 39039, 'source': 'abt'}","[73, 6, 0]","[0100057, 0]",-1,73;6;0,,,,,,
3078,0100057730,0100057610,"{'subject_id': 39039, 'source': 'abt'}","[73, 61]","[0100057, 0]",-1,73;61,,,,,,
3079,920000264,920000383,"{'subject_id': 39088, 'source': 'abt'}","[264, 383]",[920000],-1,264;383,,,,,,
3080,920000264,920000924,"{'subject_id': 39088, 'source': 'abt'}","[9, 6]","[920000, 2, 4]",-1,9;6,9;6,1.0,1.0,0.000000,1.000000,True


In [62]:
%%time
#Wall time: 15.2s

# delete if exists
query = "MATCH ()-[r:HAS_DIFFERENCE_VALUE]-() DELETE r"
graph.query(query)
query = "MATCH ()-[r:HAS_DIFFERENCE]-() DELETE r"
graph.query(query)
query = "MATCH (n:DifferenceValue) DELETE n"
graph.query(query)
query = "MATCH (n:Difference) DELETE n"
graph.query(query)

# add difference value nodes
for i, (idx, row) in enumerate(df_differences.iterrows()):
    if i % 50 == 0:
        print(f"{i}/{len(df_differences)}")
    
    row['value'] = row['value'].replace("'", "\'")
    query = f"""
    MERGE (:DifferenceValue {{ value: "{row['value']}", degree: {row['sum']}, p_is_match: {row['p_is_match']:.3f}, p_no_match: {row['p_no_match']:.3f}, has_numbers: {row['has_numbers']*1}}} )
    """
    graph.query(query)
print(query)

# add differences as nodes
for i, (idx, row) in enumerate(df_results_merged.iterrows()):
    if i % 100 == 0:
        print(f"{i}/{len(df_results_merged)}")

    
    query = f"""
    MATCH (dv:DifferenceValue {{value: "{row['differences_str']}"}})
    MATCH (w1:WordLower {{value: "{row['w1']}"}})
    MATCH (w2:WordLower {{value: "{row['w2']}"}})
    MERGE (d:Difference {{ value: {row['differences']}, sorted_words: {sorted([row['w1'], row['w2']])} }})-[:HAS_DIFFERENCE_VALUE]->(dv)
    MERGE (w1)-[:HAS_DIFFERENCE]->(d)<-[:HAS_DIFFERENCE]-(w2)
    """
    
    graph.query(query)
print(query)

0/1295
50/1295
100/1295
150/1295
200/1295
250/1295
300/1295
350/1295
400/1295
450/1295
500/1295
550/1295
600/1295
650/1295
700/1295
750/1295
800/1295
850/1295
900/1295
950/1295
1000/1295
1050/1295
1100/1295
1150/1295
1200/1295
1250/1295

    MERGE (:DifferenceValue { value: "6;8", degree: 2.0, p_is_match: 0.000, p_no_match: 1.000, has_numbers: 1} )
    
0/3082
100/3082
200/3082
300/3082
400/3082
500/3082
600/3082
700/3082
800/3082
900/3082
1000/3082
1100/3082
1200/3082
1300/3082
1400/3082
1500/3082
1600/3082
1700/3082
1800/3082
1900/3082
2000/3082
2100/3082
2200/3082
2300/3082
2400/3082
2500/3082
2600/3082
2700/3082
2800/3082
2900/3082
3000/3082

    MATCH (dv:DifferenceValue {value: "-"})
    MATCH (w1:WordLower {value: "981000040"})
    MATCH (w2:WordLower {value: "981-000040"})
    MERGE (d:Difference { value: ['-'], sorted_words: ['981-000040', '981000040'] })-[:HAS_DIFFERENCE_VALUE]->(dv)
    MERGE (w1)-[:HAS_DIFFERENCE]->(d)<-[:HAS_DIFFERENCE]-(w2)
    
CPU times: user 5.32 s, sy

#### Set display value

In [64]:
# give every DifferenceValue node a display_value property:
# its value followed by its p_is_match in parentheses; returns the number of nodes touched
query = """MATCH (dv:DifferenceValue) SET dv.display_value = dv.value + " (" + dv.p_is_match + ")" RETURN count(dv)"""
graph.query(query)

[{'count(dv)': 1295}]

### Examples

_Example where only differences between WordLower nodes are common separators, all of them are a match_
<img src="../images/9.1-common-differences.jpg?" width="600">

_Example where differences contain a numerical value, in this case none of them are a match_
<img src="../images/9.2-numerical-differences.jpg?" width="600">

_For some reason if `bk` is the only difference these products are often a match_
<img src="../images/9.3-unexpected-match.jpg?" width="600">

#### Include differences in model

Best model so far:

__Match rules:__
* 1. Items that share words (_lowercase_) with a name_degree < 6 `-> MATCH`
* 2. Items that share words with a name_degree >= 6 and name_degree < 47 
    * share a 6gram that is not a vocabulary word `-> MATCH`
* f-score so far: `0.822335`

In [65]:
# load the results table stored in results.csv under output_path
# (the same file this notebook writes back further down)
df_results_all = pd.read_csv(output_path / 'results.csv')
df_results_all.head()

Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.91111,0.80921,0.85714,testset,123,12,589,29
1,shared_word_lower_common_difference,6,0.90511,0.81046,0.85517,testset,124,13,588,29
2,shared_word_lower_common_no_numerical_difference,6,0.88356,0.81646,0.84868,testset,129,17,584,29
3,shared_word_lower_threshold_6gram,"(6, 47)",0.80198,0.84375,0.822335,testset,162,40,30,574
4,shared_word_lower_threshold_5gram,"(6, 47)",0.727273,0.880383,0.796537,testset,184,69,25,559


In [66]:
# upper bound (exclusive) on name_degree for a shared lowercase word to count as a match (rule 1)
threshold_low = 6

# Model definitions: each entry carries a name, the threshold it was built with and a
# Cypher query returning the (abt subject_id, buy subject_id) pairs the model predicts
# as matches. Both queries are the UNION ALL of
#   rule 1: items sharing a lowercase word with name_degree < threshold_low
#   rule 2: items whose words are linked through a Difference node whose
#           DifferenceValue has p_is_match > 0.5
# The second model additionally drops difference values flagged with has_numbers = 1.
# Doubled braces {{ }} are literal braces inside the f-strings.
models = [
    {
        'model_name': 'shared_word_lower_common_difference',
        'threshold': threshold_low,
        'query': f"""
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WHERE n1 <> n2
            AND w.name_degree < {threshold_low}
            RETURN i1.subject_id, i2.subject_id
            UNION ALL
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match > 0.5
            RETURN i1.subject_id, i2.subject_id
            """
    },
    {
        'model_name': 'shared_word_lower_common_no_numerical_difference',
        'threshold': threshold_low,
        'query': f"""
            
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]->(n1:Name)-[:HAS_WORD_LOWER]->(w:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WHERE n1 <> n2
            AND w.name_degree < {threshold_low}
            RETURN i1.subject_id, i2.subject_id
            UNION ALL
            MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
            WITH p1, d, i1, i2
            MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
            WHERE dv.p_is_match > 0.5
            AND NOT dv.has_numbers = 1
            RETURN i1.subject_id, i2.subject_id
            """
    }
]
    
# Evaluate every model on the training and the test pairs and add both result rows
# to the overall results table.
# NOTE(review): df_results_train / df_results_test are re-bound here to model result
# rows; the word-pair frames of the same name built earlier are no longer reachable
# under these names after this cell has run.
for model in models:
    print(model['model_name'])

    # arguments shared by both evaluations
    model_kwargs = {
        'query': model['query'],
        'model_name': model['model_name'],
        'threshold': model['threshold'],
    }
    df_results_train, _ = get_model_results(graph, df_train, evaluated_on="trainingset", **model_kwargs)
    df_results_test, _ = get_model_results(graph, df_test, evaluated_on="testset", **model_kwargs)

    display(df_results_train)
    display(df_results_test)

    # One concat per model; duplicates are dropped BEFORE the index is reset so the
    # resulting table has a gap-free 0..n-1 index.
    df_results_all = (
        pd.concat([df_results_test, df_results_train, df_results_all])
        .drop_duplicates()
        .sort_values(['evaluated_on', 'fscore'], ascending=[True, False])
        .reset_index(drop=True)
    )

shared_word_lower_common_difference


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_difference,6,0.88358,0.96402,0.92205,trainingset,1313,173,4116,49


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_difference,6,0.88435,0.81761,0.84967,testset,130,17,584,29


shared_word_lower_common_no_numerical_difference


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.87853,0.95326,0.91437,trainingset,1244,172,4116,61


Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.89583,0.81646,0.8543,testset,129,15,586,29


In [67]:
# Make sure the latest test-set row is part of the table, then persist it.
# Duplicates are dropped BEFORE the index is reset so the table keeps a gap-free index.
df_results_all = (
    pd.concat([df_results_test, df_results_all])
    .drop_duplicates()
    .sort_values(['evaluated_on', 'fscore'], ascending=[True, False])
    .reset_index(drop=True)
)
df_results_all.to_csv(output_path / "results.csv", index=False)

In [68]:
# full results table, sorted by evaluated_on and then by descending fscore
df_results_all

Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.91111,0.80921,0.85714,testset,123,12,589,29
1,shared_word_lower_common_difference,6,0.90511,0.81046,0.85517,testset,124,13,588,29
2,shared_word_lower_common_no_numerical_difference,6,0.89583,0.81646,0.8543,testset,129,15,586,29
4,shared_word_lower_common_difference,6,0.88435,0.81761,0.84967,testset,130,17,584,29
5,shared_word_lower_common_no_numerical_difference,6,0.88356,0.81646,0.84868,testset,129,17,584,29
6,shared_word_lower_threshold_6gram,"(6, 47)",0.80198,0.84375,0.822335,testset,162,40,30,574
7,shared_word_lower_threshold_5gram,"(6, 47)",0.727273,0.880383,0.796537,testset,184,69,25,559
8,shared_word_lower_common_difference,6,0.77381,0.81761,0.79511,testset,130,38,568,29
9,shared_word_lower_threshold,6,0.896226,0.673759,0.769231,testset,0,0,0,0
10,shared_word_lower_threshold_4gram,"(6, 47)",0.669782,0.903361,0.769231,testset,215,106,23,543


In [49]:
# NOTE(review): this cell carries an older execution count than the cells above it,
# so its stored output probably predates them - re-run or remove
df_results_all.head()

Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.91111,0.80921,0.85714,testset,123,12,589,29
1,shared_word_lower_common_difference,6,0.90511,0.81046,0.85517,testset,124,13,588,29
2,shared_word_lower_common_no_numerical_difference,6,0.88356,0.81646,0.84868,testset,129,17,584,29
4,shared_word_lower_threshold_6gram,"(6, 47)",0.80198,0.84375,0.822335,testset,162,40,30,574
5,shared_word_lower_threshold_5gram,"(6, 47)",0.727273,0.880383,0.796537,testset,184,69,25,559


### Results

A model that matches items which share something specific (a 6gram) and differ only by a common difference (derived from the training set)

__Model 1: Common differences__
* 1. Items that share words (_lowercase_) with a name_degree < 6 `-> MATCH`
* 2. Items that share non-vocabulary 6grams which differ only with a value that is "common"
    * common: probability of a match is > 50% for those values on the training set
* precision: `0.905 (+10.3%p)`
* recall: `0.810 (-3.33%p)`
* f-score: `0.855 (+3.28%p)`

__Model 2: Common differences + exclude numerical differences:__
* 1. Items that share words (_lowercase_) with a name_degree < 6 `-> MATCH`
* 2. Items that share non-vocabulary 6grams which differ only with a value that is "common" and contains no digits
    * common: probability of a match is > 50% for those values on the training set
* precision: `0.911 (+10.9%p)`
* recall: `0.809 (-3.45%p)`
* f-score: `0.857 (+3.48%p)`

_Taking into account the differences is a big win for precision. This model loses a bit of recall, but the increase in precision gives a better fscore_

### Examples of False Positives

_1. Items mention the same product, but one of them is an add-on for that product_

* ABT: Panasonic 5.8GHz Add-On <b>Handset For</b> The KXTG6700B Phone System - Black Finish - KXTGA670B
* BUY: Panasonic KX-TG6700B Cordless Telephone
<img src="../images/10.1-FP-addon-same-product.jpg?" width="600">
<hr>

_2. Items share a 6gram, but it is in fact the manufacturer with a slightly different spelling_
* ABT: sennheis<b>s</b>er
* BUY: sennheiser

What is missed by the model is the crucial difference that the version of the model identifier is different
* ABT: Sennheisser Hi-Fi Wireless Headphone - <b>RS120</b>
* BUY: Sennheiser RS 130 Wireless Headphone - <b>RS130</b>

<br>
<img src="../images/10.2-FP-addon-same-product.jpg?" width="600">
<hr>

_3. Model learned that differentiating value `bl` is common (80% training-set match), but there are exceptions_
* ABT: Yamaha <b>NS-AW390BL</b> All-Weather Pair Speaker System - NSAW390BK
* BUY: Yamaha Outdoor <b>NS-AW390</b> All-Weather Speaker System - NS-AW390WH

<img src="../images/10.3-FP-probablity-match.jpg?" width="600">
<img src="../images/10.4-FP-probablity-match-exceptions.jpg?" width="800">
<hr>


### Remaining errors - Test set

#### Focus on rule 2

In [216]:
# Rule 2 in isolation: abt/buy item pairs whose words are linked through a Difference
# node whose DifferenceValue has p_is_match > 0.5 (no shared-rare-word rule, no
# has_numbers filter). The doubled braces are literal braces inside the f-string.
query = f"""
    MATCH p1=(i1:Item {{source: 'abt'}})-[:HAS_NAME]-(n1:Name)-[:HAS_WORD_LOWER]->(w1:WordLower)-[:HAS_DIFFERENCE]->(d:Difference)<-[:HAS_DIFFERENCE]-(w2:WordLower)<-[:HAS_WORD_LOWER]-(n2:Name)<-[:HAS_NAME]-(i2:Item {{source: 'buy'}})
    WITH p1, d, i1, i2
    MATCH p2=(d)-[:HAS_DIFFERENCE_VALUE]-(dv:DifferenceValue)
    WHERE dv.p_is_match > 0.5
    RETURN i1.subject_id, i2.subject_id
"""

In [220]:
# Evaluate the rule-2-only `query` on the training pairs and keep the per-class
# error frames for inspection.
# NOTE(review): only the name and threshold are taken from models[1]; the Cypher that
# is evaluated is `query`, which does not contain that model's has_numbers filter.
best_model = models[1]
df_results_train, dict_errors_train = get_model_results(
    graph,  # passed first, as in the other get_model_results calls of this notebook
    df_train,
    query=query,
    model_name=best_model['model_name'],
    threshold=best_model['threshold'],
    evaluated_on="trainset",
)

In [221]:
# metrics of the rule-2-only query on the training pairs
df_results_train

Unnamed: 0,model,threshold,prec,recall,fscore,evaluated_on,tp,fp,fn,tn
0,shared_word_lower_common_no_numerical_difference,6,0.95669,0.30995,0.46821,trainset,243,11,4235,541


#### Training errors

In [222]:
# number of examples printed per class
n = 20

# Print names, prices and descriptions of the first n pairs of each class.
# Each class is listed once ('fp' used to appear twice, which printed the false
# positives a second time).
for error_class in ['tp', 'fp', 'fn']:
    # attach the abt record (suffix _x) and the buy record (suffix _y) to every pair
    df_errors_merged = dict_errors_train[f'df_{error_class}'].merge(df_abt, left_on='source_id', right_on='subject_id', how='left')
    df_errors_merged = df_errors_merged.merge(df_buy, left_on='target_id', right_on='subject_id', how='left')

    print(f"=============== {error_class} ===============")
    for i, (idx, row) in enumerate(df_errors_merged.head(n).iterrows()):
        print()
        print(f"{i}/{n}")
        print("Names:")
        print(f"- abt: {row['name_x']}")
        print(f"- buy: {row['name_y']}")
        print()
        print("Prices:")
        print(f"- abt: {row['price_x']}")
        print(f"- buy: {row['price_y']}")
        print()
        print("Descriptions:")
        print(f"- abt: {row['description_x']}")
        print(f"- buy: {row['description_y']}")

    print()


0/20
Names:
- abt: Panasonic KX-TGA820B Black DECT 6.0 Cordless Handset - KXTGA820B
- buy: Panasonic KX-TGA820B Cordless Handset

Prices:
- abt: nan
- buy: 39.76

Descriptions:
- abt: Panasonic KX-TGA820B Black DECT 6.0 Cordless Handset - KXTGA820B/ Compatible With DECT KX-TG8231/8232 Series Phone Systems/ DECT 6.0 Technology/ Answering System With Compatible Base Unit/ Call Waiting Caller ID/ Join In/Privacy/ Full Color Backlit LCD Display/ Black Finish
- buy: Black

1/20
Names:
- abt: Yamaha 7.2 Channel Black Digital Home Theater Receiver - RXV663BK
- buy: Yamaha RX-V663 Home Theater Receiver - RXV663BL

Prices:
- abt: 499.0
- buy: 499.0

Descriptions:
- abt: Yamaha 7.2 Channel Black Digital Home Theater Receiver - RXV663BK/ 4 SCENE Buttons/ XM Ready With XM HD Surround/ SIRIUS Satellite Radio Ready/ YPAO/ iPod Compatibility/ Bluetooth Compatibility/ Multi-Zone Control Compatibility/ On-Screen Display/ Black Finish
- buy: 665W - Dolby TrueHD, Dolby Digital EX, Dolby Pro Logic IIx, D