# Create training data files based on FEBRL-generated dataset

In [1]:
import os
import pandas as pd
import pathlib
import re
import uuid

from typing import Tuple, Optional

## Path constants

In [2]:
# The data directory is a sibling of the directory the notebook runs from:
# resolve the current directory, step up one level, then descend into "data".
DATA_DIR = pathlib.Path(os.path.abspath('')).parent.joinpath("data")

# Input: the FEBRL-generated file read by this notebook.
FEBRL_FILE = DATA_DIR.joinpath("febrl_training_dataset.csv")

# Outputs: the two record sets and the labels that pair them.
TRAINING_DATASET_A = DATA_DIR.joinpath("febrl_training_a.csv")
TRAINING_DATASET_B = DATA_DIR.joinpath("febrl_training_b.csv")
TRAINING_LABELS = DATA_DIR.joinpath("febrl_training_labels.csv")

## Read in FEBRL file

In [3]:
df_febrl = pd.read_csv(FEBRL_FILE)

# Header names may carry surrounding whitespace; strip them so the columns
# can be addressed by their plain names below.
df_febrl.columns = [column_name.strip() for column_name in df_febrl.columns]

# Remove the blocking column and rename given_name to first_name.
# Reassigning (rather than mutating in place) keeps the cell easy to follow.
df_febrl = (
    df_febrl
    .drop(columns="blocking_number")
    .rename(columns={'given_name': 'first_name'})
)

# Trim whitespace around the name fields. The .str accessor leaves missing
# values as NaN instead of raising AttributeError, which a plain
# `x.strip()` applied to a NaN float would do.
df_febrl["first_name"] = df_febrl["first_name"].str.strip()
df_febrl["surname"] = df_febrl["surname"].str.strip()

# Generate a random uuid for each row.
df_febrl["person_id"] = [str(uuid.uuid4()) for _ in range(len(df_febrl))]

df_febrl.head()

Unnamed: 0,rec_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,person_id
0,rec-3714-dup-0,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142,49bf7b37-c6af-41c8-91d7-5eb64c496a6c
1,rec-3675-dup-0,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114,304b9d58-b06a-4d1c-970b-020e81efd1ff
2,rec-3399-dup-0,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665,c080c996-dbef-4ec8-aa0d-0150629cd367
3,rec-298-org,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995,f343cef9-cef0-445f-b688-972db2a029ca
4,rec-240-org,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518,2b5d49ca-06a5-468f-867e-0630bb7222f4


### Pull out id data to make training labels

In [4]:
# Patterns for the two rec_id shapes handled by parse_rec_id. In both, the
# first capture group is the numeric record id; the dupe pattern's second
# group is the dupe number.
original_record_pattern = r"rec-(\d+)-org"
dupe_record_pattern = r"rec-(\d+)-dup-(\d+)"

def parse_rec_id(rec_id: str) -> Tuple[int, str, Optional[str]]:
    """Split a rec_id into (record id, dataset label, dupe number).

    Args:
        rec_id: Identifier shaped like ``rec-<n>-org`` or ``rec-<n>-dup-<k>``.
            Leading/trailing whitespace is ignored.

    Returns:
        A 3-tuple ``(record_id, dataset, dupe_no)`` where ``record_id`` is the
        integer ``<n>``, ``dataset`` is ``"A"`` for an ``-org`` record or
        ``"B"`` for a ``-dup-`` record, and ``dupe_no`` is the digit string
        ``<k>`` for a dupe record or ``None`` for an original record.

    Raises:
        ValueError: If ``rec_id`` matches neither pattern in full.
    """
    candidate = rec_id.strip()

    # fullmatch (rather than match) rejects ids with trailing garbage such as
    # "rec-5-orgXYZ", which would otherwise be accepted silently.
    if m := re.fullmatch(original_record_pattern, candidate):
        return (int(m.group(1)), "A", None)
    if m := re.fullmatch(dupe_record_pattern, candidate):
        return (int(m.group(1)), "B", m.group(2))

    raise ValueError(f"Unable to parse rec_id: {rec_id}")
        
# Parse each rec_id once and spread the (febrl_id, dataset, dupe_no) triples
# over three new columns. Mapping over the single rec_id column avoids
# constructing a pd.Series for every row, as a row-wise DataFrame.apply does.
df_febrl[["febrl_id", "dataset", "dupe_no"]] = pd.DataFrame(
    df_febrl["rec_id"].map(parse_rec_id).tolist(),
    index=df_febrl.index,  # keep row alignment with df_febrl
)

In [5]:
# Inspect the parsed id columns next to the source rec_id, ordered by febrl_id.
df_febrl.loc[:, ["rec_id", "person_id", "febrl_id", "dataset", "dupe_no"]].sort_values(by="febrl_id")

Unnamed: 0,rec_id,person_id,febrl_id,dataset,dupe_no
1050,rec-0-dup-0,8daea3a6-e54d-4cdb-9962-985eaa6f6839,0,B,0
1460,rec-0-org,99945930-2ee8-4b4b-be7e-4f6e196b4ae4,0,A,
8187,rec-1-org,d18a65af-9cfc-46df-a8ee-565095125bf6,1,A,
3699,rec-1-dup-0,b0f8c021-c43d-436c-8236-a3623223d91c,1,B,0
4739,rec-2-dup-0,d2970d4f-1601-4aa6-99b0-c79898bab323,2,B,0
...,...,...,...,...,...
2635,rec-4997-org,e1a23f4e-e4de-486f-a049-c236baa67059,4997,A,
7720,rec-4998-dup-0,8d886fa4-c6d7-40ea-9f9a-a3a29b9e26b0,4998,B,0
997,rec-4998-org,fe8d397a-9735-4ea5-9cde-0ba97fe1f512,4998,A,
3729,rec-4999-dup-0,b5f06d26-8065-4f82-9518-60e74ee09b25,4999,B,0


In [6]:
unique_febrl_ids = sorted(df_febrl["febrl_id"].unique())

# For each dataset, map febrl_id -> person_id of the first row (in frame order)
# carrying that id. Building the lookup once per dataset replaces re-filtering
# the whole frame twice for every febrl_id.
person_id_by_dataset = {
    dataset: (
        df_febrl.loc[df_febrl["dataset"] == dataset]
        .drop_duplicates(subset="febrl_id", keep="first")
        .set_index("febrl_id")["person_id"]
        .to_dict()
    )
    for dataset in ("A", "B")
}

# Every febrl_id needs a record in both datasets to form a labelled pair;
# fail with the offending ids instead of an opaque IndexError.
unpaired_febrl_ids = [
    febrl_id
    for febrl_id in unique_febrl_ids
    if febrl_id not in person_id_by_dataset["A"]
    or febrl_id not in person_id_by_dataset["B"]
]
if unpaired_febrl_ids:
    raise ValueError(
        f"febrl_ids without both an A and a B record: {unpaired_febrl_ids[:10]}"
    )

# One positive (label = 1) pair per febrl_id, in ascending febrl_id order.
training_labels = [
    {
        "person_id_A": person_id_by_dataset["A"][febrl_id],
        "person_id_B": person_id_by_dataset["B"][febrl_id],
        "label": 1,
    }
    for febrl_id in unique_febrl_ids
]

In [7]:
# Turn the label records into a frame, write it out without the index column,
# and preview the first rows.
df_labels = pd.DataFrame(data=training_labels)
df_labels.to_csv(path_or_buf=TRAINING_LABELS, index=False)

df_labels.head(n=5)

Unnamed: 0,person_id_A,person_id_B,label
0,99945930-2ee8-4b4b-be7e-4f6e196b4ae4,8daea3a6-e54d-4cdb-9962-985eaa6f6839,1
1,d18a65af-9cfc-46df-a8ee-565095125bf6,b0f8c021-c43d-436c-8236-a3623223d91c,1
2,c4336ddb-8b50-4f8a-aa93-3e27478f909a,d2970d4f-1601-4aa6-99b0-c79898bab323,1
3,7cbb6367-5268-49fe-83a9-053ddfb0f2f8,e817653e-f486-4d70-9de5-5ce3fe1fac36,1
4,7622a53c-e004-48b5-87ab-9cdf4d84f186,2fe290ca-d919-463f-bd33-e60214ec2834,1


### Drop matching id fields from training data

In [8]:
# Columns, in order, that make up each exported dataset: the person id first,
# followed by the descriptive fields.
DATASET_COLUMNS = [
    "person_id",
    "first_name",
    "surname",
    "street_number",
    "address_1",
    "address_2",
    "suburb",
    "postcode",
    "state",
    "date_of_birth",
    "age",
    "phone_number",
    "soc_sec_id",
]

In [9]:
# Keep only the dataset label plus the exported columns. Selecting explicitly
# already leaves out rec_id, febrl_id and dupe_no, so no separate drop is
# needed, and the cell can be re-run without a KeyError on columns that are
# already gone.
df_febrl = df_febrl[["dataset"] + DATASET_COLUMNS]

In [10]:
# Preview the first five rows after the column selection.
df_febrl.head(n=5)

Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,B,49bf7b37-c6af-41c8-91d7-5eb64c496a6c,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
1,B,304b9d58-b06a-4d1c-970b-020e81efd1ff,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
2,B,c080c996-dbef-4ec8-aa0d-0150629cd367,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
3,A,f343cef9-cef0-445f-b688-972db2a029ca,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
4,A,2b5d49ca-06a5-468f-867e-0630bb7222f4,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518


In [11]:
# Show any rows with a missing first_name, then any with a missing surname;
# an empty frame means no such rows.
display(df_febrl.loc[df_febrl["first_name"].isna()])
display(df_febrl.loc[df_febrl["surname"].isna()])

Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id


Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id


## Separate dataset A (original records) and dataset B (dupes)

### Dataset A

In [12]:
# Dataset A: rows labelled "A", restricted to the export columns, re-indexed
# from zero, with the id column renamed to person_id_A.
df_A = (
    df_febrl.loc[df_febrl["dataset"] == "A", DATASET_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={'person_id': 'person_id_A'})
)

# Persist without the index column, then preview.
df_A.to_csv(TRAINING_DATASET_A, index=False)

df_A.head()

Unnamed: 0,person_id_A,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,f343cef9-cef0-445f-b688-972db2a029ca,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
1,2b5d49ca-06a5-468f-867e-0630bb7222f4,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518
2,fabd142c-9269-4f52-9899-3a82cccfe9e8,ruby,butt,103,,wollartukkee,east fremantle,4814,wa,19430120.0,30.0,02 88839517,3225206
3,1f719d2e-c842-49c0-ade9-c265f70288ae,marcus,rees,5,charlick place,lindoran,ballarat,4216,nsw,,27.0,08 17239266,7355062
4,7859d4fb-04fc-46fd-aa0f-5955603d35d9,jassim,belperio,36,john russell circuit,,eastwood,3131,nsw,19460129.0,20.0,02 61510457,9190750


### Dataset B

In [13]:
# Dataset B: rows labelled "B", restricted to the export columns, re-indexed
# from zero, with the id column renamed to person_id_B.
df_B = (
    df_febrl.loc[df_febrl["dataset"] == "B", DATASET_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={'person_id': 'person_id_B'})
)

# Persist without the index column, then preview.
df_B.to_csv(TRAINING_DATASET_B, index=False)

df_B.head()

Unnamed: 0,person_id_B,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,49bf7b37-c6af-41c8-91d7-5eb64c496a6c,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
1,304b9d58-b06a-4d1c-970b-020e81efd1ff,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
2,c080c996-dbef-4ec8-aa0d-0150629cd367,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
3,1c1d7e32-a925-47ab-9fa2-d6fe95f87de6,willjam,dud,83,purbrick street,glenveagh,muttabrra,6100,,19871212.0,23.0,07 54557966,7073899
4,80ea777d-6088-4d82-b839-2b566091d61a,lucy,baillie,34,hurley street,,glen iqnnes,5038,sa,19310448.0,,08 19431835,6880723
