# Use SimSum Classification to Link People Data

In [1]:
import datetime
import itertools
import os
import pathlib
import re
import uuid

from typing import Tuple, Optional

import altair as alt
import numpy as np
import pandas as pd
import recordlinkage as rl
import jellyfish
import sklearn

## Define relevant filepaths

In [2]:
# All input files are resolved against the directory the notebook is run from.
WORKING_DIR = pathlib.Path(os.path.abspath(""))

# Two people datasets to link, plus the labelled pairs used as ground truth.
TRAINING_DATASET_A = WORKING_DIR.joinpath("training_a.csv")
TRAINING_DATASET_B = WORKING_DIR.joinpath("training_b.csv")
TRAINING_LABELS = WORKING_DIR.joinpath("training_labels.csv")

## Load training data (cleaned datasets)

In [3]:
# Read dataset A and key each record by its person identifier.
df_A = pd.read_csv(TRAINING_DATASET_A).set_index("person_id_A")
df_A.head()

Unnamed: 0_level_0,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
person_id_A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
c4a4d5c9-4bcd-47a4-b621-595638b76717,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
ce79609d-33b4-423a-bdb4-6a6f0a1572a0,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518
77d0b5c5-0dc5-458f-9eff-af3384e1f86a,ruby,butt,103,,wollartukkee,east fremantle,4814,wa,19430120.0,30.0,02 88839517,3225206
ef952695-f044-4dc5-8d91-98692d0fb617,marcus,rees,5,charlick place,lindoran,ballarat,4216,nsw,,27.0,08 17239266,7355062
81534f3f-0a8d-4627-b815-4e30f3c44ffd,jassim,belperio,36,john russell circuit,,eastwood,3131,nsw,19460129.0,20.0,02 61510457,9190750


In [4]:
# Read dataset B and key each record by its person identifier.
df_B = pd.read_csv(TRAINING_DATASET_B).set_index("person_id_B")
df_B.head()

Unnamed: 0_level_0,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
person_id_B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
c954056b-a8a6-4e25-a7a5-2984bcbb874e,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
306a9027-bdb1-4cf9-ac34-07fbd744d34a,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
19b72493-6ebf-47d6-bbec-00eb1e343bde,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
95c7927d-a9d4-4add-8e31-641507891771,willjam,dud,83,purbrick street,glenveagh,muttabrra,6100,,19871212.0,23.0,07 54557966,7073899
8b60bfca-ba49-451d-a35f-8d4e6822c8b0,lucy,baillie,34,hurley street,,glen iqnnes,5038,sa,19310448.0,,08 19431835,6880723


## Load training data labels

In [5]:
# Ground-truth pairs, indexed by (dataset A id, dataset B id) so they line up
# with the candidate-pair MultiIndex produced further down.
df_labels = pd.read_csv(TRAINING_LABELS).set_index(["person_id_A", "person_id_B"])
df_labels.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label
person_id_A,person_id_B,Unnamed: 2_level_1
65d84bcc-3422-4929-9351-585f47614979,1a084097-3004-484f-ad0e-b84b06a5bcb9,1
09eff19d-c6b1-455c-8162-8158c8beca0b,27eed78b-aaa3-401d-a701-763495873e1b,1
308bc53a-e7e7-4150-81de-b5094b63c196,99ffc3cf-53dd-410d-b2b3-5f18d4d31341,1
53ff2d35-a75f-4586-a5a6-1585c1c4dedd,722d1749-9bfb-43f8-9ab0-cd1f87f6ae16,1
674b7869-11c8-4106-8dd7-b892a668b993,fe16458e-fc6d-45b4-8071-cab0b7213d3a,1


## Data Augmentation

Here, we'll augment our people data with fields that we can use for blocking and comparing.

**Phonetic Blocking**

The Soundex and NYSIIS phonetic encodings below are computed with the [jellyfish](https://pypi.org/project/jellyfish/) library.

In [6]:
def dob_to_date(dob: object) -> Optional[pd.Timestamp]:
    """Convert a date of birth in YYYYMMDD form to a ``pd.Timestamp``.

    Args:
        dob: The date of birth. Normally a string such as ``"19380417"``.
            Numeric values are accepted as well, because a CSV column that
            contains missing values is loaded as float (``19380417.0``).

    Returns:
        The parsed timestamp, or ``None`` when the value is missing, is of an
        unsupported type, does not start with eight digits, or does not name
        a valid calendar date (e.g. ``"19310448"``).
    """
    date_pattern = r"(\d{4})(\d{2})(\d{2})"

    if isinstance(dob, str):
        text = dob
    elif isinstance(dob, (int, float, np.integer, np.floating)) and not isinstance(dob, bool):
        if pd.isna(dob):
            return None
        # Drop the ".0" that float storage adds to whole numbers.
        text = str(int(dob)) if float(dob).is_integer() else str(dob)
    else:
        return None

    # re.match anchors at the start only, so trailing characters are tolerated.
    if (m := re.match(date_pattern, text.strip())) is None:
        return None

    year, month, day = (int(part) for part in m.groups())
    try:
        return pd.Timestamp(year, month, day)
    except ValueError:
        # Impossible dates (month 13, day 48, year 0) and dates outside the
        # range pandas can represent are treated as missing.
        return None

In [7]:
%%time

for people in (df_A, df_B):

    # jellyfish cannot encode NaN, so missing names become empty strings first.
    for name_column in ("surname", "first_name"):
        people[name_column] = people[name_column].fillna("")

    # Soundex phonetic encodings of both names.
    people["soundex_surname"] = people["surname"].apply(jellyfish.soundex)
    people["soundex_firstname"] = people["first_name"].apply(jellyfish.soundex)

    # NYSIIS phonetic encodings of both names.
    people["nysiis_surname"] = people["surname"].apply(jellyfish.nysiis)
    people["nysiis_firstname"] = people["first_name"].apply(jellyfish.nysiis)

    # Last three characters of the social security id, left-padded with zeros.
    # Falsy ids map to None. This must run before the id column is cast to str.
    people["ssid_last3"] = people["soc_sec_id"].apply(
        lambda ssid: str(ssid)[-3:].zfill(3) if ssid else None
    )
    people["soc_sec_id"] = people["soc_sec_id"].astype(str)

    # Parsed date of birth (None where the raw value cannot be converted).
    people["dob"] = people["date_of_birth"].apply(dob_to_date)


CPU times: user 87.9 ms, sys: 4.44 ms, total: 92.4 ms
Wall time: 90.4 ms


Let's take a look at a sample of our new columns.

In [8]:
df_A.head()

Unnamed: 0_level_0,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,soundex_surname,soundex_firstname,nysiis_surname,nysiis_firstname,ssid_last3,dob
person_id_A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
c4a4d5c9-4bcd-47a4-b621-595638b76717,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995,G623,D230,GARAGTY,DACAT,995,1938-04-17
ce79609d-33b4-423a-bdb4-6a6f0a1572a0,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518,C425,J520,CALGAHAN,JAN,518,1968-01-12
77d0b5c5-0dc5-458f-9eff-af3384e1f86a,ruby,butt,103,,wollartukkee,east fremantle,4814,wa,19430120.0,30.0,02 88839517,3225206,B300,R100,BAT,RABY,206,1943-01-20
ef952695-f044-4dc5-8d91-98692d0fb617,marcus,rees,5,charlick place,lindoran,ballarat,4216,nsw,,27.0,08 17239266,7355062,R200,M622,R,MARC,62,NaT
81534f3f-0a8d-4627-b815-4e30f3c44ffd,jassim,belperio,36,john russell circuit,,eastwood,3131,nsw,19460129.0,20.0,02 61510457,9190750,B416,J250,BALPAR,JASAN,750,1946-01-29


## Blocking

In [9]:
# Baseline: count the pairs a full (cartesian join) blocker would have to
# process, so the reduction achieved by real blocking can be measured against it.

indexer = rl.Index()
indexer.add(rl.index.Full())

candidate_links = indexer.index(df_A, df_B)
full_blocker_pairs = len(candidate_links)

print(f"{full_blocker_pairs:,} total pairs.")

25,000,000 total pairs.


In [10]:
# A pair becomes a candidate when the two records agree on any one of these keys.
blocking_keys = (
    "soundex_surname",
    "soundex_firstname",
    "nysiis_surname",
    "nysiis_firstname",
    "ssid_last3",
    "date_of_birth",
)

indexer = rl.Index()
for blocking_key in blocking_keys:
    indexer.add(rl.index.Block(blocking_key))

candidate_links = indexer.index(df_A, df_B)
blocked_pairs = len(candidate_links)

# Percentage of the full cartesian search space that blocking removed.
kept_fraction = blocked_pairs / full_blocker_pairs
search_space_reduction = round((1 - kept_fraction) * 100, 2)

print(f"{blocked_pairs:,} pairs after blocking: {search_space_reduction}% search space reduction.")

653,588 pairs after blocking: 97.39% search space reduction.


In [11]:
# Show what candidate links look like.
# candidate_links

## Comparing

In [12]:
%%time

comparer = rl.Compare()

# Phonetic encodings: exact agreement, one feature per encoded column.
for encoded_column in (
    "soundex_surname",
    "soundex_firstname",
    "nysiis_surname",
    "nysiis_firstname",
):
    comparer.add(rl.compare.Exact(encoded_column, encoded_column, label=encoded_column))

# First & last name: Jaro-Winkler similarity on the raw strings.
for name_column, feature_label in (("surname", "last_name"), ("first_name", "first_name")):
    comparer.add(
        rl.compare.String(name_column, name_column, method="jarowinkler", label=feature_label)
    )

# Address: Damerau-Levenshtein similarity on each address component.
for address_column in ("address_1", "address_2", "suburb", "postcode", "state"):
    comparer.add(
        rl.compare.String(
            address_column, address_column, method="damerau_levenshtein", label=address_column
        )
    )

# Birthday: compared on the parsed timestamp column, reported as "date_of_birth".
comparer.add(rl.compare.Date("dob", "dob", label="date_of_birth"))

# Phone number.
comparer.add(
    rl.compare.String("phone_number", "phone_number", method="damerau_levenshtein", label="phone_number")
)
# comparer.add(rl.compare.String("soc_sec_id", "soc_sec_id", method="damerau_levenshtein", label="ssn"))

# One row of similarity features per candidate pair.
features = comparer.compute(candidate_links, df_A, df_B)

CPU times: user 37.5 s, sys: 833 ms, total: 38.4 s
Wall time: 38 s


In [13]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,soundex_surname,soundex_firstname,nysiis_surname,nysiis_firstname,last_name,first_name,address_1,address_2,suburb,postcode,state,date_of_birth,phone_number
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0010de7c-d39b-49d1-a839-bd36a2d6efff,11324dc4-babe-46ca-a9ed-481aa3340b82,0,1,0,1,0.416667,1.000000,0.142857,0.066667,0.083333,0.4,0.25,0.0,0.250000
0010de7c-d39b-49d1-a839-bd36a2d6efff,1c04187f-ee0e-4fc5-ae8e-0efa9f73e610,1,0,1,0,1.000000,0.000000,0.200000,0.066667,0.400000,0.4,0.25,0.0,0.416667
0010de7c-d39b-49d1-a839-bd36a2d6efff,1c4b4393-f425-4b49-86d0-cfefe7e1a50a,1,0,0,0,0.775000,0.000000,0.222222,0.066667,0.285714,0.2,1.00,0.0,0.333333
0010de7c-d39b-49d1-a839-bd36a2d6efff,1d171480-573c-43f2-abc3-f7c8c380e8e5,1,0,0,0,0.800000,0.511111,0.210526,0.066667,0.187500,0.4,0.25,0.0,0.416667
0010de7c-d39b-49d1-a839-bd36a2d6efff,1e25b221-1b85-4f84-8545-302183cb779d,1,0,1,0,1.000000,0.481481,0.166667,0.066667,0.090909,0.4,0.25,0.0,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,e0de25d8-ad78-4995-89ef-810674ce839d,1,0,1,0,1.000000,0.483333,0.533333,1.000000,0.125000,0.2,0.25,0.0,0.250000
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,e9ac45c0-acb1-4c72-b763-1b043635b56d,0,1,0,0,0.444444,0.836667,0.250000,1.000000,0.166667,0.2,1.00,0.0,0.416667
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,ed22ceda-c036-4489-b5ee-a0a902371790,1,0,1,0,1.000000,0.483333,0.571429,1.000000,0.250000,0.2,1.00,0.0,0.500000
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,f083eb03-bfc2-4635-9ba8-c23098c847ac,1,0,1,0,0.765000,0.483333,0.230769,1.000000,0.111111,0.2,0.25,0.0,0.500000


In [14]:
display(features.iloc[0].name)
display(features.iloc[0])

('0010de7c-d39b-49d1-a839-bd36a2d6efff',
 '11324dc4-babe-46ca-a9ed-481aa3340b82')

soundex_surname      0.000000
soundex_firstname    1.000000
nysiis_surname       0.000000
nysiis_firstname     1.000000
last_name            0.416667
first_name           1.000000
address_1            0.142857
address_2            0.066667
suburb               0.083333
postcode             0.400000
state                0.250000
date_of_birth        0.000000
phone_number         0.250000
Name: (0010de7c-d39b-49d1-a839-bd36a2d6efff, 11324dc4-babe-46ca-a9ed-481aa3340b82), dtype: float64

## Add labels to feature vectors

In [15]:
# Attach the ground-truth label to every candidate pair. The left join keeps
# all candidate pairs; pairs absent from df_labels come back with a NaN label.
df_labeled_features = pd.merge(
    features,
    df_labels,
    on=["person_id_A", "person_id_B"],
    how="left",
)

# Unlabelled candidate pairs are non-links (0). Assign the filled column back:
# calling fillna(inplace=True) on a column selection modifies a temporary
# under pandas Copy-on-Write and would leave the NaNs in the frame.
df_labeled_features["label"] = df_labeled_features["label"].fillna(0)
df_labeled_features

Unnamed: 0_level_0,Unnamed: 1_level_0,soundex_surname,soundex_firstname,nysiis_surname,nysiis_firstname,last_name,first_name,address_1,address_2,suburb,postcode,state,date_of_birth,phone_number,label
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0010de7c-d39b-49d1-a839-bd36a2d6efff,11324dc4-babe-46ca-a9ed-481aa3340b82,0,1,0,1,0.416667,1.000000,0.142857,0.066667,0.083333,0.4,0.25,0.0,0.250000,0.0
0010de7c-d39b-49d1-a839-bd36a2d6efff,1c04187f-ee0e-4fc5-ae8e-0efa9f73e610,1,0,1,0,1.000000,0.000000,0.200000,0.066667,0.400000,0.4,0.25,0.0,0.416667,0.0
0010de7c-d39b-49d1-a839-bd36a2d6efff,1c4b4393-f425-4b49-86d0-cfefe7e1a50a,1,0,0,0,0.775000,0.000000,0.222222,0.066667,0.285714,0.2,1.00,0.0,0.333333,0.0
0010de7c-d39b-49d1-a839-bd36a2d6efff,1d171480-573c-43f2-abc3-f7c8c380e8e5,1,0,0,0,0.800000,0.511111,0.210526,0.066667,0.187500,0.4,0.25,0.0,0.416667,0.0
0010de7c-d39b-49d1-a839-bd36a2d6efff,1e25b221-1b85-4f84-8545-302183cb779d,1,0,1,0,1.000000,0.481481,0.166667,0.066667,0.090909,0.4,0.25,0.0,0.333333,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,e0de25d8-ad78-4995-89ef-810674ce839d,1,0,1,0,1.000000,0.483333,0.533333,1.000000,0.125000,0.2,0.25,0.0,0.250000,0.0
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,e9ac45c0-acb1-4c72-b763-1b043635b56d,0,1,0,0,0.444444,0.836667,0.250000,1.000000,0.166667,0.2,1.00,0.0,0.416667,0.0
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,ed22ceda-c036-4489-b5ee-a0a902371790,1,0,1,0,1.000000,0.483333,0.571429,1.000000,0.250000,0.2,1.00,0.0,0.500000,0.0
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,f083eb03-bfc2-4635-9ba8-c23098c847ac,1,0,1,0,0.765000,0.483333,0.230769,1.000000,0.111111,0.2,0.25,0.0,0.500000,0.0


## SimSum

In [16]:
# SimSum score: the sum of the per-field similarity features of a pair.
# "label" is the ground truth, not a feature. An existing "simsum" column is
# excluded as well so that re-running this cell does not add the previous
# score into the new one.
df_labeled_features["simsum"] = (
    df_labeled_features
    .drop(columns=["label", "simsum"], errors="ignore")
    .sum(axis=1)
)
df_labeled_features

Unnamed: 0_level_0,Unnamed: 1_level_0,soundex_surname,soundex_firstname,nysiis_surname,nysiis_firstname,last_name,first_name,address_1,address_2,suburb,postcode,state,date_of_birth,phone_number,label,simsum
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0010de7c-d39b-49d1-a839-bd36a2d6efff,11324dc4-babe-46ca-a9ed-481aa3340b82,0,1,0,1,0.416667,1.000000,0.142857,0.066667,0.083333,0.4,0.25,0.0,0.250000,0.0,4.609524
0010de7c-d39b-49d1-a839-bd36a2d6efff,1c04187f-ee0e-4fc5-ae8e-0efa9f73e610,1,0,1,0,1.000000,0.000000,0.200000,0.066667,0.400000,0.4,0.25,0.0,0.416667,0.0,4.733333
0010de7c-d39b-49d1-a839-bd36a2d6efff,1c4b4393-f425-4b49-86d0-cfefe7e1a50a,1,0,0,0,0.775000,0.000000,0.222222,0.066667,0.285714,0.2,1.00,0.0,0.333333,0.0,3.882937
0010de7c-d39b-49d1-a839-bd36a2d6efff,1d171480-573c-43f2-abc3-f7c8c380e8e5,1,0,0,0,0.800000,0.511111,0.210526,0.066667,0.187500,0.4,0.25,0.0,0.416667,0.0,3.842471
0010de7c-d39b-49d1-a839-bd36a2d6efff,1e25b221-1b85-4f84-8545-302183cb779d,1,0,1,0,1.000000,0.481481,0.166667,0.066667,0.090909,0.4,0.25,0.0,0.333333,0.0,4.789057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,e0de25d8-ad78-4995-89ef-810674ce839d,1,0,1,0,1.000000,0.483333,0.533333,1.000000,0.125000,0.2,0.25,0.0,0.250000,0.0,5.841667
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,e9ac45c0-acb1-4c72-b763-1b043635b56d,0,1,0,0,0.444444,0.836667,0.250000,1.000000,0.166667,0.2,1.00,0.0,0.416667,0.0,5.314444
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,ed22ceda-c036-4489-b5ee-a0a902371790,1,0,1,0,1.000000,0.483333,0.571429,1.000000,0.250000,0.2,1.00,0.0,0.500000,0.0,7.004762
ffffe74c-77cf-4ad7-9f46-a9f5f25e19e1,f083eb03-bfc2-4635-9ba8-c23098c847ac,1,0,1,0,0.765000,0.483333,0.230769,1.000000,0.111111,0.2,0.25,0.0,0.500000,0.0,5.540214


In [17]:
df_labeled_features.shape

(653588, 15)

In [18]:
# Count candidate pairs per (rounded SimSum score, ground-truth label).
df_sim_sum_dist = df_labeled_features[["simsum", "label"]].copy()

# Human-readable label: 1 is a true link, anything else is not.
df_sim_sum_dist["label"] = (
    df_sim_sum_dist["label"].eq(1).map({True: "True Link", False: "Not a Link"})
)

# Round scores to two decimals so they fall into a manageable number of groups.
df_sim_sum_dist["simsum"] = df_sim_sum_dist["simsum"].map(lambda score: round(score, 2))

df_sim_sum_dist = (
    df_sim_sum_dist
    .groupby(["simsum", "label"])
    .size()
    .reset_index(name="count")
)
df_sim_sum_dist

Unnamed: 0,simsum,label,count
0,0.71,Not a Link,1
1,0.74,Not a Link,2
2,0.76,Not a Link,1
3,0.80,Not a Link,1
4,0.81,Not a Link,2
...,...,...,...
1393,12.95,True Link,8
1394,12.96,True Link,2
1395,12.97,True Link,6
1396,12.98,True Link,1


In [19]:
min(df_sim_sum_dist["simsum"])

0.71

In [20]:
sorted(list(df_sim_sum_dist["label"].unique()))

['Not a Link', 'True Link']

In [21]:
# Clicking a legend entry highlights that label and fades the other one.
legend_selection = alt.selection_multi(fields=["label"], bind="legend")

color_scale = alt.Scale(domain=["True Link", "Not a Link"], scheme="tableau10")

# Horizontal axis: SimSum score in 0.01-wide bins from 0 to the highest score.
simsum_axis = alt.X(
    "simsum:Q",
    bin=alt.Bin(extent=[0, max(df_sim_sum_dist["simsum"])], step=0.01),
    axis=alt.Axis(tickCount=5, title="SimSum Score (Binned)"),
)

# Vertical axis: pair counts. stack=None overlays the two labels instead of stacking them.
count_axis = alt.Y("count", stack=None, axis=alt.Axis(title="Count of Links"))

label_color = alt.Color(
    "label",
    scale=color_scale,
    legend=alt.Legend(title="Ground Truth Label"),
)

hover_details = [
    alt.Tooltip("simsum", title="SimSum Score"),
    alt.Tooltip("label", title="Ground Truth"),
    alt.Tooltip("count", title="Count of Links"),
]

simsum_histogram = (
    alt.Chart(df_sim_sum_dist, title="SimSum Score Distribution")
    .mark_bar(opacity=0.7, binSpacing=0)
    .encode(
        simsum_axis,
        count_axis,
        label_color,
        opacity=alt.condition(legend_selection, alt.value(0.7), alt.value(0.2)),
        tooltip=hover_details,
    )
    .properties(height=200, width=800)
    .add_selection(legend_selection)
    .interactive()
)
simsum_histogram

In [38]:
def evaluate_linking(
    df: pd.DataFrame,
    df_left: pd.DataFrame,
    df_right: pd.DataFrame,
    df_true_links: pd.DataFrame,
    score_column_name: str = "score",
    ground_truth_column_name: str = "ground_truth",
    k: int = 10
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Evaluate scored candidate links against ground truth.

    Prints how many of the true links are present in ``df`` (i.e. survived
    blocking), then computes confusion counts, recall and precision over 50
    evenly spaced score thresholds from 0 to the highest score (at least 1).

    Args:
        df: Scored candidate pairs. Must be indexed on the pair of entity ids
            (index level names equal to ``df_left.index.name`` and
            ``df_right.index.name``) and contain the score and ground-truth
            columns named below.
        df_left: Entity data for the left side of each pair, indexed on its id.
        df_right: Entity data for the right side of each pair, indexed on its id.
        df_true_links: One row per true link, indexed the same way as ``df``.
        score_column_name: Column of ``df`` holding the model score.
        ground_truth_column_name: Column of ``df`` holding the label; 1 marks
            a true link and 0 a non-link. Rows with any other value are not
            counted in any confusion cell.
        k: Number of highest- and lowest-scoring pairs to return.

    Returns:
        A tuple ``(df_eval, df_top_k_links, df_bottom_k_links)``:
        ``df_eval`` has one row per threshold with columns ``threshold``,
        ``tp``, ``fp``, ``tn``, ``fn``, ``recall`` and ``precision`` (recall or
        precision is NaN where its denominator is zero); the other two frames
        hold the ``k`` highest / lowest scoring pairs joined to the entity
        data of both sides.
    """

    def join_original_entity_data_to_links(
        df_k_links: pd.DataFrame,
        df_left: pd.DataFrame,
        df_right: pd.DataFrame,
    ) -> pd.DataFrame:
        """Helper function to join entity data to a dataframe of link results."""

        # Join data from left entities.
        df_k_links = pd.merge(
            df_k_links,
            df_left,
            left_on=df_left.index.name,
            right_index=True,
        )

        # Join data from right entities.
        return pd.merge(
            df_k_links,
            df_right,
            left_on=df_right.index.name,
            right_index=True,
        )

    # How many true links were found by blocking? Compare the ids of the true
    # links with the ids of the scored pairs.
    total_true_links = df_true_links.shape[0]
    true_links_after_blocking = pd.merge(
        df_true_links,
        df,
        left_index=True,
        right_index=True,
        how="inner"
    ).shape[0]

    # Guard against an empty set of true links instead of dividing by zero.
    true_link_pct_after_blocking = (
        round((true_links_after_blocking / total_true_links)*100, 0)
        if total_true_links
        else 0.0
    )

    # True Links present in df.
    print(f"{true_link_pct_after_blocking}% true links present after blocking. ({true_links_after_blocking}/{total_true_links})")

    # These masks do not depend on the threshold, so build them once.
    scores = df[score_column_name]
    is_link = df[ground_truth_column_name] == 1
    is_non_link = df[ground_truth_column_name] == 0

    # Thresholds run from zero to the highest score, or to 1 when every score
    # is below 1 or there are no scores at all.
    highest_score = scores.max()
    max_score = max(1, highest_score) if pd.notna(highest_score) else 1

    eval_data = []

    # Calculate true positives (tp), false positives (fp), true negatives (tn),
    # false negatives (fn) at threshold intervals from zero to max score.
    for threshold in np.linspace(0, max_score, 50):
        predicted_link = scores >= threshold
        predicted_non_link = scores < threshold

        tp = int((predicted_link & is_link).sum())
        fp = int((predicted_link & is_non_link).sum())
        tn = int((predicted_non_link & is_non_link).sum())
        fn = int((predicted_non_link & is_link).sum())

        eval_data.append(
            {
                "threshold" : threshold,
                "tp" : tp,
                "fp" : fp,
                "tn" : tn,
                "fn" : fn,
                # Undefined (NaN) when there are no positives / no predicted links.
                "recall" : tp / (tp + fn) if (tp + fn) else np.nan,
                "precision" : tp / (tp + fp) if (tp + fp) else np.nan,
            }
        )

    scored_pairs = df[[score_column_name, ground_truth_column_name]]

    df_top_k_links = join_original_entity_data_to_links(
        scored_pairs.sort_values(score_column_name, ascending=False).head(n=k).reset_index(),
        df_left,
        df_right
    )

    df_bottom_k_links = join_original_entity_data_to_links(
        scored_pairs.sort_values(score_column_name).head(n=k).reset_index(),
        df_left,
        df_right
    )

    return pd.DataFrame(eval_data), df_top_k_links, df_bottom_k_links

# Evaluate the SimSum score as a link classifier: df_eval receives the
# per-threshold metrics, df_top_links / df_bottom_links the highest- and
# lowest-scoring pairs joined to the entity data (k is left at its default of 10).
df_eval, df_top_links, df_bottom_links = evaluate_linking(
    df=df_labeled_features,
    df_left=df_A,
    df_right=df_B,
    df_true_links=df_labels,
    score_column_name = "simsum",
    ground_truth_column_name = "label",  
)

100.0% true links present after blocking. (5000/5000)


In [39]:
# Entity attributes to show side by side for each linked pair.
entity_columns = [
    "first_name", "surname", "street_number", "address_1",
    "address_2", "suburb", "postcode", "state", "date_of_birth", "age",
    "phone_number", "soc_sec_id",
    "soundex_surname", "soundex_firstname",
    "nysiis_surname", "nysiis_firstname",
]

# The joined link frames carry each attribute twice, suffixed "_x" (left
# entity) and "_y" (right entity); interleave them so the pair sits together.
display_cols = [
    f"{column}{side_suffix}"
    for column in entity_columns
    for side_suffix in ("_x", "_y")
]

In [40]:
with pd.option_context('display.max_columns', None):
    display(df_top_links[["person_id_A", "person_id_B", "simsum", "label"] + display_cols])

Unnamed: 0,person_id_A,person_id_B,simsum,label,first_name_x,first_name_y,surname_x,surname_y,street_number_x,street_number_y,address_1_x,address_1_y,address_2_x,address_2_y,suburb_x,suburb_y,postcode_x,postcode_y,state_x,state_y,date_of_birth_x,date_of_birth_y,age_x,age_y,phone_number_x,phone_number_y,soc_sec_id_x,soc_sec_id_y,soundex_surname_x,soundex_surname_y,soundex_firstname_x,soundex_firstname_y,nysiis_surname_x,nysiis_surname_y,nysiis_firstname_x,nysiis_firstname_y
0,68f45d75-f465-4fa3-928f-b93dff130bea,9a791f7e-66f6-4549-a436-8c75fd220404,13.0,1.0,finley,finley,goode,goode,8,4,blair street,blair street,phillip island,phillip island,kirra,kirra,4740,4740,,,19880919,19880919,35.0,,03 60273146,03 60273146,6292183,6292183,G300,G300,F540,F540,GAD,GAD,FANLY,FANLY
1,e2ef959d-c120-457d-afcf-828dc9c8c6e6,b1ab8b19-4cb3-45ae-93a0-61295e40b863,13.0,1.0,shandril,shandril,hedaux,hedaux,178,183,dalabon crescent,dalabon crescent,,,waterloo,waterloo,4870,4870,vic,vic,19140226,19140226,,,03 65551765,03 65551765,8585450,8855540,H320,H320,S536,S536,HADAX,HADAX,SANDRAL,SANDRAL
2,04f75c53-4241-4a13-9aca-22a8826146cb,ba3e89f2-db68-4091-8234-5d63bd7840d3,13.0,1.0,madeleine,madeleine,baillie,baillie,41,42,girrahween street,girrahween street,,,auburn,auburn,4507,4507,qld,qld,19440828,19440828,36.0,35.0,,,6493576,6493576,B400,B400,M345,M345,BALY,BALY,MADALAN,MADALAN
3,1605efd6-9d8a-4ccb-b0ab-d34606ae8e9c,a0bcb950-9de7-4fbd-8e13-6c1e0ee23713,13.0,1.0,olivia,olivia,hassall,hassall,2,1,paech place,paech place,,,deepwater,deepwater,2283,2283,nsw,nsw,19790514,19790514,24.0,,02 58606717,02 58606717,1599081,1599081,H240,H240,O410,O410,HASAL,HASAL,OLAV,OLAV
4,8b0c5487-80e6-48dc-9ee8-9a8d1dc5860d,e0e9ccf2-8f14-4815-8322-d857602e1cb6,13.0,1.0,riley,riley,agriogiannis,agriogiannis,25,29,mault place,mault place,spring creek,spring creek,lakes entrance,lakes entrance,2528,2528,nsw,nsw,19230712,19230712,31.0,31.0,03 83913275,03 83913275,6357155,6357250,A262,A262,R400,R400,AGRAGAN,AGRAGAN,RALY,RALY
5,ff4c5e55-8ddb-4f13-a9ee-82cd62470f06,105e6b21-a268-4cf0-93a9-659d97b5652f,13.0,1.0,joshua,joshua,matthews,matthews,7,32,watts street,watts street,,,southbank,southbank,3340,3340,nsw,nsw,19521122,19521122,28.0,,07 89548454,07 89548454,3331727,3331727,M320,M320,J200,J200,MATAE,MATAE,JAS,JAS
6,75e3f2ee-f7e1-441c-9443-b8232ab24ade,7cbf3856-f4c0-4e5b-8837-482133cf919e,13.0,1.0,bayden,bayden,lock,lock,22,224,bargang crescent,bargang crescent,,,hamilton north,hamilton north,3806,3806,nsw,nsw,19970727,19970727,27.0,27.0,02 41692638,02 41692638,7664859,7674857,L200,L200,B350,B350,LAC,LAC,BAYDAN,BAYDAN
7,efcf050f-b2fb-4395-b019-81a07da4d5f5,a95db2e8-be63-4a31-9ee6-0ade8432393f,13.0,1.0,david,david,skeen,skeen,79,70,undoolya street,undoolya street,,,spearwood,spearwood,6084,6084,sa,sa,19810825,19810825,30.0,39.0,03 92722295,03 92722295,8733476,9478220,S500,S500,D130,D130,SCAN,SCAN,DAVAD,DAVAD
8,843fcc60-53c7-469e-868f-0b81cbb4d7f3,1189f232-ffa9-47e9-85d7-ad520aaca05d,13.0,1.0,jayden,jayden,cicchini,cicchini,35,34,croton street,croton street,clarkwood,clarkwood,keon park,keon park,2304,2304,nsw,nsw,19030705,19030705,22.0,22.0,02 17834351,02 17834351,3530795,3530695,C250,C250,J350,J350,CACAN,CACAN,JAYDAN,JAYDAN
9,8c3bf11c-d94f-4062-9475-deecfd7c0280,bd99232a-8816-4386-b141-59bcc2df11fa,13.0,1.0,laura,laura,clarke,clarke,10,46,walker crescent,walker crescent,toolebewong farm,toolebewong farm,whalan,whalan,4077,4077,nsw,nsw,19690425,19690425,27.0,27.0,02 61144666,02 61144666,6575605,8423762,C462,C462,L600,L600,CLARC,CLARC,LAR,LAR


In [41]:
with pd.option_context('display.max_columns', None):
    display(df_bottom_links[["person_id_A", "person_id_B", "simsum", "label"] + display_cols])

Unnamed: 0,person_id_A,person_id_B,simsum,label,first_name_x,first_name_y,surname_x,surname_y,street_number_x,street_number_y,address_1_x,address_1_y,address_2_x,address_2_y,suburb_x,suburb_y,postcode_x,postcode_y,state_x,state_y,date_of_birth_x,date_of_birth_y,age_x,age_y,phone_number_x,phone_number_y,soc_sec_id_x,soc_sec_id_y,soundex_surname_x,soundex_surname_y,soundex_firstname_x,soundex_firstname_y,nysiis_surname_x,nysiis_surname_y,nysiis_firstname_x,nysiis_firstname_y
0,3239e9e3-68fa-441f-be59-ab55559fbbe0,19b86002-e44d-4cc5-a3dd-ea5b96fc102c,0.714534,0.0,nicholas,jayb,rees,humphfcys,34,32,,higgerson street,,windsor dental centre,mitcham,balwyn north,2190,4802,nsw,wa,,,31.0,21.0,03 48152407,,8000601,3725158,R200,H512,N242,J100,R,HANFCY,NACAL,JAYB
1,e3fe1cb9-f071-44f9-b39b-e651883fc0f3,f4b242b5-4815-4f4b-ab6c-a2dd0a7abded,0.738095,0.0,declen,jaykob,kiss,mccarthy,33,37,flecker place,angasostreet,,leisure living vlge,bonny hills,greenwood,5025,3198,qld,nsj,,,25.0,11.0,,03 58955121,9377051,3700368,K200,M263,D245,J210,C,MCARTY,DACLAN,JAYCAB
2,b182f414-2bce-44bf-a005-8fb9d94fa830,fbeef14d-f6f4-4cac-bca5-9483116ae796,0.740559,0.0,gabriel,joxhua,filipov,prodw,56,173,bavin street,,dudley specialist medical centre,,elwood,mona vale,7008,2672,nsw,,,,37.0,29.0,07 44471940,,6353487,1978244,F411,P630,G164,J200,FALAPAV,PRADW,GABRAL,JAX
4,d78b75b6-6256-43af-abfb-46d1760b1307,fbeef14d-f6f4-4cac-bca5-9483116ae796,0.795671,0.0,katelyn,joxhua,blinman,prodw,21,173,oliver street,,ryhd-talog,,berwick,mona vale,6149,2672,vic,,,,33.0,29.0,04 39282098,,1720350,1978244,B455,P630,K345,J200,BLANAN,PRADW,CATALYN,JAX
3,5957336d-313d-450b-94ee-23cd00cddda6,3ce04bbd-8e8d-4793-83ce-b919bd278d21,0.7625,0.0,aidan,chloe,medved,wottro,8,93,woodfull loop,perrin cidcuit,,st francis vlge,batchelor,,3311,2749,vic,,,,,32.0,08 74324158,,3657799,6611480,M313,W360,A350,C400,MADVAD,WATR,ADAN,CL
5,b95dff15-8917-43dd-b082-26d508cdf0d5,3ce04bbd-8e8d-4793-83ce-b919bd278d21,0.80609,0.0,luka,chloe,binns,wottro,16,93,maxwell street,perrin cidcuit,,st francis vlge,mackay north,,4116,2749,vic,,,,13.0,32.0,04 46965459,,9966182,6611480,B520,W360,L200,C400,BAN,WATR,LAC,CL
6,90a9f00d-4e75-4c73-bcf4-d6ddc2044d06,0bdcadb4-acda-44ac-95ef-0592799c3a92,0.814815,0.0,ruby,tarp,rafanelli,sedorkw,35,418,elliott street,roughley place,,villa 444 the village glen,ballarat,eden,4511,3930,qld,vic,,,21.0,,,03 96637595,6348843,4730440,R154,S362,R100,T610,RAFANAL,SADARCW,RABY,TARP
7,3880f1ca-61b6-4ada-9500-dd72553bc022,45698b01-c5db-401b-a701-32afb496a9ed,0.833333,0.0,kieren,samujo,berry,simmonds,50,2,pinterry place,ivo whittov circuit,gundaline,,coombabah,wakeley,3140,2281,wa,nqsw,,,28.0,24.0,03 93500859,,2768684,3868638,B600,S553,K650,S520,BARY,SANAND,CARAN,SANAJ
8,23853abd-c407-4131-a1ae-06f6d199f0d8,1e2e3d92-086e-4d3c-a755-9a205ddca287,0.834499,0.0,mia,,winfield,roch,8,9,,bruch road,middle earth,,lindenow south,rochrster,4210,3183,qld,vic,19600514.0,19870908.0,,36.0,,04 18315295,6925345,3456345,W514,R200,M000,,WANFALD,RAC,M,
9,0dca3a34-c7b8-4add-a8d6-a8d1e2bf7159,60dd9817-0833-4785-a95d-1aa82086122a,0.842857,0.0,hollie,zac,woodbury,canini,86,1716,,whalan lace,,oxford,south perth,terreyhills,3143,4270,wa,nsw,,,37.0,32.0,,04 40897322,8392168,9700884,W316,C550,H400,Z200,WADBARY,CANAN,HALY,ZAC


In [42]:
df_eval.head()

Unnamed: 0,threshold,tp,fp,tn,fn,recall,precision
0,0.0,5000,648588,0,0,1.0,0.00765
1,0.265306,5000,648588,0,0,1.0,0.00765
2,0.530612,5000,648588,0,0,1.0,0.00765
3,0.795918,5000,648583,5,0,1.0,0.00765
4,1.061224,5000,648360,228,0,1.0,0.007653


In [27]:
model_legend_select = alt.selection_multi(fields=["variable"], bind="legend")

# Long format: one row per (threshold, metric) so each metric becomes its own line.
df_precision_recall_long = df_eval[["threshold", "recall", "precision"]].melt(
    id_vars=["threshold"]
)

threshold_axis = alt.X("threshold:Q", axis=alt.Axis(title="Model Threshold"))
metric_axis = alt.Y(
    "value:Q",
    scale=alt.Scale(domain=(0, 1)),
    axis=alt.Axis(title="Precision/Recall Value"),
)
metric_color = alt.Color("variable:N", legend=alt.Legend(title="Variable"))

precision_recall_chart = (
    alt.Chart(
        df_precision_recall_long,
        title="Precision and Recall v.s. Model Threshold",
    )
    .mark_line()
    .encode(
        threshold_axis,
        metric_axis,
        metric_color,
        tooltip=alt.Tooltip(["variable", "threshold", "value"]),
    )
    .add_selection(model_legend_select)
    .properties(height=400, width=800)
)
precision_recall_chart