# Train a NeuralNet Classifier to Link FEBRL People Data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rachhouse/intro-to-data-linking/blob/linking-work/tutorial_notebooks/02_Link_FEBRL_Data_with_NN_Classifier.ipynb)

In this tutorial, we'll train a neural net classifier to score candidate pairs for linking, using supervised learning. We will use the same training dataset as the SimSum classification tutorial, as well as the same augmentation, blocking, and comparing functions. The functions have been included in a separate `.py` file for re-use and convenience, so we can focus on code unique to this tutorial.

In [1]:
import itertools

import altair as alt
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
# Import our previously used linking functions from linking_tutorial_functions.py.
# NOTE: on Colab that file must first be fetched from the GitHub repo and saved
# next to this notebook; this cell itself does not download anything.
import linking_tutorial_functions as tutorial

## Load Training Data and Ground Truth Labels

In [3]:
# Load the inputs for this tutorial through the shared helper: the two datasets to
# be linked (df_A, df_B) and the ground-truth table used for labeling and evaluation.
df_A, df_B, df_ground_truth = tutorial.load_febrl_training_data()

## Data Augmentation

In [4]:
%%time

# Run the shared augmentation helper over both datasets.
# NOTE(review): assigning to the loop variable `df` rebinds only that local name; it
# never rebinds df_A or df_B. The augmentation therefore reaches later cells only if
# tutorial.augment_data modifies the frame it is given in place -- confirm against
# the helper before relying on this pattern.
for df in [df_A, df_B]:
    df = tutorial.augment_data(df)

CPU times: user 82.8 ms, sys: 3.38 ms, total: 86.1 ms
Wall time: 84.9 ms


## Blocking

In [5]:
# Size of the unblocked search space: every record in A paired with every record in B.
full_blocker_pairs = len(df_A) * len(df_B)

# Candidate pairs that survive blocking.
candidate_links = tutorial.block(df_A, df_B)
blocked_pairs = len(candidate_links)

# Percentage of the full cross product that blocking removed, to two decimals.
search_space_reduction = round(100 * (1 - blocked_pairs / full_blocker_pairs), 2)

print(f"{blocked_pairs:,} pairs after blocking: {search_space_reduction}% search space reduction.")

653,588 pairs after blocking: 97.39% search space reduction.


## Comparing

In [6]:
%%time

# Build the comparison feature vectors for the blocked candidate pairs using the
# shared helper. This is the slow step of the notebook (see the recorded timing).
features = tutorial.compare(candidate_links, df_A, df_B)

CPU times: user 44.4 s, sys: 906 ms, total: 45.3 s
Wall time: 45 s


In [7]:
# Preview the first few feature vectors (indexed by person_id_A / person_id_B).
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,soundex_surname,soundex_firstname,nysiis_surname,nysiis_firstname,last_name,first_name,address_1,address_2,suburb,postcode,state,date_of_birth,phone_number,ssn
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,061b9c3f-afbe-41e5-923b-3de29a4e5b82,0,1,0,1,0.0,1.0,0.263158,0.225806,0.384615,0.2,0.25,0.0,0.333333,0.0
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,081ec178-99a1-4895-b96e-7c03cf8bbfdc,0,1,0,1,0.577778,1.0,0.210526,0.032258,0.230769,0.4,1.0,0.0,0.333333,0.142857
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,17b274b5-aa3f-43cc-96ae-21283b7d1ca5,0,1,0,1,0.588889,1.0,0.176471,0.032258,0.384615,0.2,0.25,0.0,0.333333,0.0
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,1f70d4cd-3106-4d15-af9f-1617a43ca83f,0,1,0,1,0.455556,1.0,0.235294,0.193548,0.307692,0.2,0.25,0.0,0.083333,0.142857
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,201c4dba-825a-42f2-b7a8-832b792af90b,0,0,0,0,0.577778,0.611111,0.083333,0.129032,0.153846,0.6,0.25,0.0,0.416667,0.571429


## Add Labels to Feature Vectors

In [25]:
# Convert the ground-truth flag to a float label: truthy -> 1.0, falsy -> 0.0.
# df_ground_truth itself is updated because later cells pass it on to the
# evaluation helper.
df_ground_truth["ground_truth"] = df_ground_truth["ground_truth"].apply(lambda x: 1.0 if x else 0.0)

# Left-join the labels onto the candidate-pair feature vectors so that every
# candidate pair is kept, whether or not it appears in the ground truth.
df_labeled_features = pd.merge(
    features,
    df_ground_truth,
    on=["person_id_A", "person_id_B"],
    how="left"
)

# Candidate pairs with no ground-truth row get label 0. The filled column is
# assigned back instead of using `fillna(..., inplace=True)` on the column
# selection: that chained in-place form is deprecated in pandas 2.x and does not
# update the parent frame under copy-on-write, which would leave NaN labels.
df_labeled_features["ground_truth"] = df_labeled_features["ground_truth"].fillna(0)
df_labeled_features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,soundex_surname,soundex_firstname,nysiis_surname,nysiis_firstname,last_name,first_name,address_1,address_2,suburb,postcode,state,date_of_birth,phone_number,ssn,ground_truth
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,061b9c3f-afbe-41e5-923b-3de29a4e5b82,0,1,0,1,0.0,1.0,0.263158,0.225806,0.384615,0.2,0.25,0.0,0.333333,0.0,0.0
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,081ec178-99a1-4895-b96e-7c03cf8bbfdc,0,1,0,1,0.577778,1.0,0.210526,0.032258,0.230769,0.4,1.0,0.0,0.333333,0.142857,0.0
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,17b274b5-aa3f-43cc-96ae-21283b7d1ca5,0,1,0,1,0.588889,1.0,0.176471,0.032258,0.384615,0.2,0.25,0.0,0.333333,0.0,0.0
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,1f70d4cd-3106-4d15-af9f-1617a43ca83f,0,1,0,1,0.455556,1.0,0.235294,0.193548,0.307692,0.2,0.25,0.0,0.083333,0.142857,0.0
002cf4ec-57d0-4ebf-a31b-88db4441ff2e,201c4dba-825a-42f2-b7a8-832b792af90b,0,0,0,0,0.577778,0.611111,0.083333,0.129032,0.153846,0.6,0.25,0.0,0.416667,0.571429,0.0


## Separate Candidate Links into Train/Test

In [26]:
# Separate the feature columns from the label column.
X = df_labeled_features.drop("ground_truth", axis=1)
y = df_labeled_features["ground_truth"]

# Hold out 20% of the candidate pairs for testing. Stratifying on the label keeps
# the link / non-link ratio the same in both splits, and the fixed random_state
# makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

## Train Neural Net Classifier

In [27]:
def create_nn_linking_classifier(n_features=14):
    """Return a simple, compiled NN classifier to train on candidate links.

    The network is a small stack of fully connected ReLU layers (16, 8 and 4
    units) followed by a single sigmoid unit, so each candidate pair is mapped
    to one score between 0 and 1.

    Args:
        n_features: Number of comparison features per candidate pair, i.e. the
            width of each input vector. Defaults to 14, the original
            hard-coded value.

    Returns:
        A `tf.keras` Sequential model compiled with the Adam optimizer,
        binary cross-entropy loss, and accuracy as the reported metric.
    """

    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(16, activation="relu", input_shape=(n_features,)),
        tf.keras.layers.Dense(8, activation="relu"),
        tf.keras.layers.Dense(4, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        # `metrics` is documented as a list; newer Keras versions reject a bare string.
        metrics=["accuracy"]
    )

    return model

In [28]:
# Build a fresh, untrained classifier and print its layer-by-layer parameter summary.
model = create_nn_linking_classifier()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                240       
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 5         
Total params: 417
Trainable params: 417
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Train on the training split for 8 passes over the data; all other fit options are
# left at their Keras defaults. The call's return value is shown as the cell output.
model.fit(X_train, y_train, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x1b7d24ca0>

## Predict Using NN Classifier

In [30]:
# Score the held-out candidate pairs with the trained model.
# NOTE(review): with a single sigmoid output unit, y_pred is presumably an array of
# shape (n_pairs, 1) holding scores in [0, 1] -- confirm.
y_pred = model.predict(X_test)

In [31]:
# Build the prediction frame from a copy of the test features, appending the
# model's score and then the true label (y_test is aligned on the shared index).
df_predictions = X_test.assign(model_score=y_pred, ground_truth=y_test)

## Choosing a Linking Model Score Threshold

### Model Score Distribution

In [32]:
# Plot how the model scores of the test-set predictions are distributed, using the
# shared plotting helper.
tutorial.plot_model_score_distribution(df_predictions)

### Precision and Recall vs. Model Score

In [33]:
# Evaluate the scored test pairs against the ground truth with the shared helper,
# passing the original datasets as the left/right frames. Of the four results,
# df_eval feeds the precision/recall plot and df_top_links / df_bottom_links are
# displayed in the sections that follow; blocking_eval is not used in the cells shown.
blocking_eval, df_eval, df_top_links, df_bottom_links = tutorial.evaluate_linking(
    df=df_predictions,
    df_true_links=df_ground_truth,
    df_left=df_A,
    df_right=df_B,
)

In [34]:
# Plot precision and recall against the model score threshold from the evaluation
# table, to help choose a linking cutoff.
tutorial.plot_precision_recall_vs_threshold(df_eval)

### Top Scoring `k` Links

In [35]:
# Attribute names to show for each linked pair, in display order.
display_cols = [
    "first_name", "surname",
    "street_number", "address_1", "address_2", "suburb", "postcode", "state",
    "date_of_birth", "age", "phone_number", "soc_sec_id",
    "soundex_surname", "soundex_firstname",
    "nysiis_surname", "nysiis_firstname",
]

# Expand every attribute into its adjacent "_x" / "_y" columns so the two records
# of a pair can be read side by side.
display_cols = [
    f"{col}{suffix}"
    for col in display_cols
    for suffix in ("_x", "_y")
]

In [36]:
# Temporarily remove the column display limit so that the ids, score, label and
# every side-by-side attribute column of the top-scoring links are all shown.
with pd.option_context('display.max_columns', None):
    display(df_top_links[["person_id_A", "person_id_B", "model_score", "ground_truth"] + display_cols])

Unnamed: 0,person_id_A,person_id_B,model_score,ground_truth,first_name_x,first_name_y,surname_x,surname_y,street_number_x,street_number_y,address_1_x,address_1_y,address_2_x,address_2_y,suburb_x,suburb_y,postcode_x,postcode_y,state_x,state_y,date_of_birth_x,date_of_birth_y,age_x,age_y,phone_number_x,phone_number_y,soc_sec_id_x,soc_sec_id_y,soundex_surname_x,soundex_surname_y,soundex_firstname_x,soundex_firstname_y,nysiis_surname_x,nysiis_surname_y,nysiis_firstname_x,nysiis_firstname_y
0,5dddffa3-f575-4b5b-9e2f-240c2e615baf,494d290b-4772-44a0-a5bd-3536884d949d,1.0,1.0,james,jimmke,finlay,fimlau,16,16,cashion court,cashion court,,,west wollongong,west wollongong,3012,3012,sa,sa,19620626.0,19620626.0,,,08 41796533,08 41796533,2116699,2116699,F540,F540,J520,J520,FANLY,FANL,JAN,JANC
1,bf36c20c-2695-4c29-8d34-13770bbd13f5,677dde0f-702b-4940-9d21-6f4db3a4a30e,1.0,1.0,samantha,samanyha,campbell,campbell,8,2,allman circuit,allman circuit,,,boulder,bousder,4217,4217,sa,sa,19990907.0,19990907.0,10.0,10.0,02 18386674,02 18386674,7950294,7950294,C514,C514,S553,S550,CANPBAL,CANPBAL,SANANT,SANANY
2,96d624bd-1fec-46cb-b02e-1a25ff4147f9,12fe190f-1370-413b-aea0-e7099e985639,1.0,1.0,dylan,dyln,garnett,garnett,36,36,,,,,mount torrens,mount torrens,3608,3680,vic,vic,19760504.0,19706004.0,38.0,38.0,07 05855936,07 05855936,8816524,8816524,G653,G653,D450,D450,GARNAT,GARNAT,DYLAN,DYLN
3,58226b7d-5c99-4e21-8d56-924459c250d8,143d9020-cf6b-4a8e-892f-6642791a8022,1.0,1.0,ryley,rlyry,garcia,garcia,73,73,blackbutt street,blackbutt street,,,villawood,villawood,6148,6148,nsw,nsw,,,,,08 16002307,08 76334766,2136157,2136157,G620,G620,R400,R460,GARC,GARC,RYLY,RLYRY
4,30f1b962-441a-41e2-82b0-723131ddf1c3,9f86b52a-09b8-4fa2-b059-cac534705345,1.0,1.0,liam,liam,mcgregor,mcgernor,27,27,kavel street,kavel street,,,leppington,leppington,4883,4883,vic,vic,19861115.0,19861115.0,,,08 43749772,08 43749772,7315231,4800278,M262,M265,L500,L500,MCGRAGAR,MCGARNAR,LAN,LAN
5,019cb351-74f6-4659-9d20-5294af088d3f,fba31d33-5f34-4c07-b012-6874edbabc08,1.0,1.0,alisha,rapson,rapson,aludha,1,1,callaway crescent,callaway crescent,merlewood,merlewood,currajong,currajong,2160,2160,tas,tas,,,28.0,28.0,04 41588682,04 41588682,5026102,5026102,R125,A430,A420,R125,RAPSAN,ALAD,ALAS,RAPSAN
6,d0c7073d-e643-4c46-a3db-c2adddebbac8,dbe67ebf-d0b5-43a7-828c-7a2aca3ec566,1.0,1.0,rhiannon,rhiannon,navarro,navarr,14,14,carnall close,carnalwclose,,,corio,corio,4873,4873,nsw,nsk,,,,,02 94999881,02 94999881,1170321,1170321,N160,N160,R550,R550,NAVAR,NAVAR,RANAN,RANAN
7,74e6e146-e0b2-4059-9f5a-89b603b3e209,de98090c-78de-49bc-bb64-854fcdfad2db,1.0,1.0,tiana,tiana,corbin,corvin,16,16,zox circuit,zox circuit,,,shellharbour,shellharbour,6066,3088,qld,qld,19220206.0,19220206.0,31.0,13.0,02 69598973,02 69538973,8178224,8178224,C615,C615,T500,T500,CARBAN,CARVAN,TAN,TAN
8,7d78981b-da9a-456d-83c2-dfa8a842a786,d76c3859-1fb3-4e8d-8e21-8aa31bb80cdf,1.0,1.0,aleisha,aleisha,kalnins,kalnns,14,13,fleay place,fleay plsce,,,murarrie,murarre,2655,2655,nsw,nsw,19540327.0,19540327.0,,,08 42674904,08 42674904,8567884,8567884,K455,K452,A420,A420,CALNAN,CALN,ALAS,ALAS
9,6200fe8c-49f9-4d89-9a98-c877dec5b778,a264bbe3-2bc0-4464-af35-2de52b4aebb5,1.0,1.0,jordan,jordan,maier,maiero,44,44,shumack street,shumack street,,colbara,orange,oranfe,5070,5070,,,19771128.0,19717128.0,6.0,6.0,04 89787858,04 89787858,8916980,8916980,M600,M600,J635,J635,MAR,MAR,JARDAN,JARDAN


### Bottom Scoring `k` Links

In [37]:
# Temporarily remove the column display limit so that the ids, score, label and
# every side-by-side attribute column of the bottom-scoring links are all shown.
with pd.option_context('display.max_columns', None):
    display(df_bottom_links[["person_id_A", "person_id_B", "model_score", "ground_truth"] + display_cols])

Unnamed: 0,person_id_A,person_id_B,model_score,ground_truth,first_name_x,first_name_y,surname_x,surname_y,street_number_x,street_number_y,address_1_x,address_1_y,address_2_x,address_2_y,suburb_x,suburb_y,postcode_x,postcode_y,state_x,state_y,date_of_birth_x,date_of_birth_y,age_x,age_y,phone_number_x,phone_number_y,soc_sec_id_x,soc_sec_id_y,soundex_surname_x,soundex_surname_y,soundex_firstname_x,soundex_firstname_y,nysiis_surname_x,nysiis_surname_y,nysiis_firstname_x,nysiis_firstname_y
0,fb114032-7f14-4efd-91c8-d3ca7cc56f25,76ffced6-c4ca-46de-a3fe-f8b164515bf5,4.6693610000000004e-17,0.0,madison,chloe,dugdale,wottro,6.0,93.0,,perrin cidcuit,balena farm,st francis vlge,aspendale gardens,,5065,2749,qld,,,,34,32.0,08 51160660,,7482626,6611480,D234,W360,M325,C400,DAGDAL,WATR,MADASAN,CL
1,f7ba245e-f13b-41da-968d-ad1db7b6de32,a34922f6-e96a-476c-9fd1-be582a8a2759,4.6984490000000006e-17,0.0,katherine,jaspev,orzelek,fabbro,42.0,187.0,moorehead place,,,,judbury,goonellabah,3141,2680,vic,sa,,,29,26.0,07 80406596,,6333363,1845086,O624,F160,K365,J211,ORSALAC,FABR,CATARAN,JASPAF
2,3e54266a-f481-442c-bc62-faae22053b37,f80c4b3e-5911-4f8e-b8e8-dae7aa841a5c,5.096307e-17,0.0,isabella,amy,burdin,leaver,21.0,34.0,clemenger street,,,,franklin,leichhxrdt,6163,2221,vic,qld,,,9,37.0,,07 33436744,8243607,9831949,B635,L160,I214,A500,BARDAN,LAVAR,ISABAL,ANY
3,477db918-ca95-4c5f-a4b4-c655552151dd,2cc1490a-9567-4356-9b81-a479ec0d39ce,5.151131e-17,0.0,jordan,jodl,,blackewll,10.0,27.0,valder place,,,,winmalee,kyabram,5049,3977,qld,vif,,,29,34.0,,03 82219196,5686855,2947694,,B424,J635,J340,,BLACAEL,JARDAN,JADL
4,40d2c28c-eb73-4f75-abe6-f56d262a4ee3,52681cb9-4323-4703-97a9-503e934d09c5,5.2804330000000006e-17,0.0,hayden,cameon,macha,bennwll,45.0,11.0,darby street,clement place,kinjibi,,thornlie,peakhurst,3230,6157,nsw,nsw,,,37,35.0,07 11997647,,5882785,9241100,M200,B540,H350,C550,MC,BANWL,HAYDAN,CANAN
5,96ea5263-823b-40a2-997c-01c17c9303b7,d112cf57-1d73-4f11-94bb-5faa7270698e,5.3547760000000005e-17,0.0,declen,jayde,kiss,tregoning,33.0,46.0,flecker place,,,,bonny hills,wahroonga,5025,2130,qld,nsw,,,25,21.0,,08 71715612,9377051,2825385,K200,T625,D245,J300,C,TRAGANANG,DACLAN,JAYD
6,34b64f7d-a03a-41ea-a92a-843519337049,69f6e2ff-f3ea-41e7-ae1e-052b0a46ba2d,5.4029330000000003e-17,0.0,talyah,emalene,roche,lewal,,19.0,de graaff street,,,,keiraville,burpengary,2281,2096,nsw,nsw,,,11,,03 87978274,,7416811,5838976,R200,L400,T400,E545,RAC,LAEAL,TALY,ENALAN
8,75e10765-371f-4fbe-b11c-83b898b3f399,69f6e2ff-f3ea-41e7-ae1e-052b0a46ba2d,6.524719e-17,0.0,courtney,emalene,white,lewal,13.0,19.0,druitt place,,,,concord west,burpengary,4051,2096,nsw,nsw,,,7,,08 12732429,,2601647,5838976,W300,L400,C635,E545,WAT,LAEAL,CARTNY,ENALAN
7,31213616-f2a9-40f8-a269-dabe17490718,a31b3b72-f66f-4a63-9e01-2b303c2d94ac,5.4163670000000006e-17,0.0,hamish,joxhua,lette,prodw,63.0,173.0,mcphail place,,,,scarborough,mona vale,7015,2672,vic,,,,35,29.0,03 65818771,,7655525,1978244,L300,P630,H520,J200,LAT,PRADW,HAN,JAX
9,d3514297-bbdc-469e-8f2d-29d3bf76fca4,488ba2fb-7b4e-4b88-bfaa-65f9d071c342,6.591766000000001e-17,0.0,kasey,calev,boss,weelr,14.0,,rennie street,vincent place,,,wongaling beach,,2170,3226,vic,vic,,,20,41.0,,04 63685334,2601961,5457685,B200,W460,K200,C410,B,WALR,CASY,CALAF
