# Create training data files based on FEBRL-generated dataset

In [1]:
import os
import pandas as pd
import pathlib
import re
import uuid

from typing import Tuple, Optional

## Path constants

In [2]:

# All locations hang off the parent of the current working directory
# (os.path.abspath('') resolves to the current working directory).
DATA_DIR = pathlib.Path(os.path.abspath('')).parent.joinpath("data", "training")
ORIGINALS_DATA_DIR = DATA_DIR.joinpath("originals")

# Input: the FEBRL-generated source file.
FEBRL_FILE = ORIGINALS_DATA_DIR.joinpath("febrl_training_uniform.csv")

# Outputs: the two training datasets and the labels that link them.
TRAINING_DATASET_A = DATA_DIR.joinpath("febrl_training_a.csv")
TRAINING_DATASET_B = DATA_DIR.joinpath("febrl_training_b.csv")
TRAINING_LABELS = DATA_DIR.joinpath("febrl_training_labels.csv")

In [3]:
import febrl_data_transform as transform

In [4]:
# Load the "extras" file through the project helper in febrl_data_transform.
# NOTE(review): judging by the helper's name this file holds records without
# duplicates — confirm against the helper's implementation.
df_test = transform.transform_febrl_dataset_without_dupes(ORIGINALS_DATA_DIR / "febrl_training_extras.csv")

In [None]:

# Sketch (not executed): split the extra records evenly between the two datasets.
#     df_A = pd.concat([df_A, df_extra.iloc[: df_extra.shape[0] // 2]])
#     df_B = pd.concat([df_B, df_extra.iloc[df_extra.shape[0] // 2 :]])

In [18]:
# Midpoint row count of df_test (rounded down), used to split it into two halves.
xtra_rows = df_test.shape[0] // 2

In [19]:
# Second half of df_test: every row from position xtra_rows to the end.
df_test.iloc[xtra_rows:]

Unnamed: 0,person_id_A,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
500,fab7ff66-a08d-4406-82c2-1ae4bdafe8db,kayla,neville,1,durack street,,taree,2776,nsw,19830619,32,,8177068
501,8fd4b0db-356b-4f6b-81e6-d431c2e3dba2,ashton,minchinton,3,conyers street,,raceview,2380,nsw,19210603,,03 46899033,9734769
502,4827111f-5624-46ae-a3d5-922527fa253c,casey,gerritsen,476,clisby close,,dianella,2121,vic,19510513,28,03 97204280,9983591
503,9df6bf3e-b41b-4fe0-85d9-493d29b13612,jasper,crook,127,knox street,,bonnyrigg,4300,vic,,,02 48147007,3684913
504,9bbc2e4e-2809-4dee-9c3f-01b7c91507c4,noah,ang,4,glenorchy street,,highgate,2904,wa,19741027,32,07 70140727,8669512
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,99b1ff78-d8fe-4008-8922-67bf4ba69289,hannah,ciotti,47,cazaly close,,orbost,4152,nsw,19210405,13,07 46352117,5397686
996,328732dc-c3d6-4bee-9f98-1fcd346ee742,jack,flores,9,schomburgk street,eulalia,old bar,2700,vic,19441114,8,07 02898680,4899792
997,8c9bb76f-ccdc-4193-936e-643a939ca423,xavier,sindicic,10,foxlow close,flt 17,camden,7015,qld,19830908,33,02 73588756,2900965
998,065f9c5f-34e4-4a52-898c-4e08ece110a2,sophie,robson,7,bertram street,,belmont,3095,wa,19361102,,04 78716646,9466267


In [16]:
# First half of df_test: rows before the midpoint (rounded down).
df_test.iloc[: df_test.shape[0] // 2]

Unnamed: 0,person_id_A,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,b30f670a-183a-4a7c-a85d-3f697e56c9a2,kayden,dixon,4,parsons street,,padstow,4228,qld,,28,08 74564411,4442264
1,0f465470-9bfb-4a0f-b512-de3f0972776c,charlotte,lund,12,bennelong crescent,,geraldton,3285,qld,19971016,32,04 58771484,7836271
2,6826eb32-6864-40ce-92a9-203d3a022a82,ethan,narayana,20,bourne street,,highton,3053,wa,,28,02 87436166,2232163
3,b16b18e9-23af-4a85-a656-187141d0d7ec,lynae,noble,1,kevin street,villa 2,kingston,3550,qld,19450703,26,07 08895983,8748377
4,cd8fcad0-3911-4815-a0dd-a4ea31031da3,charli,nguyen,16,blackman crescent,sea vista,ulladulla,3400,nsw,19311125,27,03 66171060,4547718
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,80d1bf28-c98a-4b77-924a-987f90ed9399,kyle,swoboda,45,jansz crescent,,hawthorn,2611,,19770720,25,03 66052192,7928376
496,ec997e6a-2806-4864-9875-8d2f58c78c54,brett,sherriff,527,moorhouse street,bulala,brookvale,2486,nsw,19310624,,07 18929737,6497354
497,5a92e16f-e85f-490e-8136-2e47c1f4414d,connor,,9,bussau close,villa 18,mosman,3165,nsw,,31,08 68311821,4345603
498,3d7d16a4-367f-4b03-b8d7-6dfbb8c8ef58,ella,white,7,britten-jones drive,,kurri kurri,4036,qld,19880909,10,02 84225519,1353202


In [7]:
# Run the project helper on the zipf file; its result is unpacked into three frames.
# NOTE: df_A, df_B and df_labels are reassigned by later cells in this notebook,
# so the values produced here only live until those cells run.
df_A, df_B, df_labels = transform.transform_febrl_dataset_with_dupes(ORIGINALS_DATA_DIR / "febrl_training_zipf.csv")

In [11]:
# Show the (rows, columns) shape of df_A, then preview its first rows.
display(df_A.shape)
df_A.head()

(1000, 13)

Unnamed: 0,person_id_A,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,88df5dce-0551-43e2-828d-ae24f1e39f08,emiily,kumaran,16,sue geh circuit,esk farm,sanctuary point,2150,vic,19861128,28,04 02622282,9556223
1,407419b8-572f-4715-9d75-87fb639ea910,joshua,bishop,17,mccabe crescent,,ringwood north,3218,vic,19670519,30,08 64584488,3397466
2,50bc4c39-a97e-4a26-91bd-1c2b2cc2cf3e,isabella,plane,22,lupus place,,mount fairy,3814,vic,19371023,25,02 16137900,8477984
3,8aec5046-5c17-4f4c-9921-5e3a26d4fcb1,lily,grainger,9,,,toowoomba,3280,nsw,19781222,27,07 22885956,6290962
4,b7cd00d2-a854-40f8-879f-0141bb1413ab,caleb,kerslake,14,heard street,,garbutt,3042,nsw,19470824,34,02 88628933,5565684


In [10]:
# Show the (rows, columns) shape of df_B, then preview its first rows.
display(df_B.shape)
df_B.head()

(1000, 13)

Unnamed: 0,person_id_B,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,3550be68-aee9-44f5-b771-f73ced50e289,anita,tib,78,spalding street,,wendouree,5108,qld,19841930.0,,08 14725591,2023285
1,9d517e68-9606-42db-9b40-a1aabe8e3388,ben,sherriff,1,woronora street,,richardson,3280,nsw,19691211.0,31.0,07 54556761,7107899
2,06a13070-7e72-4610-8f0d-9adfb47b7d30,wililaj,campbell,83,arthur circle,,st ives,2217,,19430405.0,,07 92121169,6901877
3,bf0878e0-2cb2-4f8c-9e3e-e14b0e64c000,olile,cambdll,7,von guerard crescent,central highlands medical centre,craigif,2027,nsw,,31.0,04 81186909,8914810
4,86aa6022-2ee5-4a40-8a6e-a736533eb97a,sarah,morridon,356,gibb place,garden settlement,noble park,2526,qld,19760526.0,44.0,04 03466376,5492033


In [12]:
# (rows, columns) of the labels frame.
df_labels.shape

(1000, 3)

## Read in FEBRL file

In [3]:
# Load the FEBRL file and normalise it into the frame used by the following cells.
df_febrl = pd.read_csv(FEBRL_FILE)

# Header names are stripped of surrounding whitespace before they are referenced below.
df_febrl.columns = [column.strip() for column in df_febrl.columns]
df_febrl = df_febrl.drop(columns="blocking_number").rename(columns={"given_name": "first_name"})

# Use the vectorised .str.strip(): it leaves missing names as NaN, whereas calling
# x.strip() through apply raises AttributeError as soon as one name is missing.
df_febrl["first_name"] = df_febrl["first_name"].str.strip()
df_febrl["surname"] = df_febrl["surname"].str.strip()

# Generate a random uuid for each row.
df_febrl["person_id"] = [str(uuid.uuid4()) for _ in range(len(df_febrl))]

df_febrl.head()

Unnamed: 0,rec_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,person_id
0,rec-3714-dup-0,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142,af1c5486-9593-46fa-a778-2a758811c693
1,rec-3675-dup-0,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114,c21ddbd5-45f0-4854-9c06-b6b8b95ab1ec
2,rec-3399-dup-0,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665,a2a86900-6dca-45e7-b847-dac0add9da90
3,rec-298-org,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995,fc543392-3ba7-4cd9-be43-83a9779bd629
4,rec-240-org,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518,8d12d625-9e7b-4b79-b587-d546a5a8fdc8


### Pull out id data to make training labels

In [4]:
original_record_pattern = r"rec-(\d+)-org"
dupe_record_pattern = r"rec-(\d+)-dup-(\d+)"


def parse_rec_id(rec_id: str) -> Tuple[int, str, Optional[str]]:
    """Split a record id into its numeric id, dataset label and dupe number.

    Args:
        rec_id: Identifier of the form ``rec-<n>-org`` (original record) or
            ``rec-<n>-dup-<m>`` (duplicate record). Surrounding whitespace is
            ignored.

    Returns:
        ``(febrl_id, dataset, dupe_no)`` where ``febrl_id`` is ``<n>`` as an int,
        ``dataset`` is ``"A"`` for an original and ``"B"`` for a duplicate, and
        ``dupe_no`` is ``<m>`` as a string for a duplicate or ``None`` for an
        original.

    Raises:
        ValueError: If ``rec_id`` has neither of the two forms.
    """
    candidate = rec_id.strip()

    # fullmatch, not match: match only anchors the start of the string, so ids
    # with trailing junk (e.g. "rec-1-organic") would otherwise parse silently.
    if m := re.fullmatch(original_record_pattern, candidate):
        return int(m.group(1)), "A", None
    if m := re.fullmatch(dupe_record_pattern, candidate):
        return int(m.group(1)), "B", m.group(2)

    raise ValueError(f"Unable to parse rec_id: {rec_id}")
        
# Parse every rec_id once and build the three derived columns as a single frame.
# This avoids DataFrame.apply(axis=1), which constructs a pd.Series for every row.
parsed_id_columns = ["febrl_id", "dataset", "dupe_no"]
df_febrl[parsed_id_columns] = pd.DataFrame(
    [parse_rec_id(rec_id) for rec_id in df_febrl["rec_id"]],
    index=df_febrl.index,  # keep the parsed rows aligned with their source rows
    columns=parsed_id_columns,
)

In [5]:
# Show the raw rec_id next to the columns derived from it, ordered by febrl_id.
df_febrl[["rec_id", "person_id", "febrl_id", "dataset", "dupe_no"]].sort_values("febrl_id")

Unnamed: 0,rec_id,person_id,febrl_id,dataset,dupe_no
1050,rec-0-dup-0,50653c69-d5e8-45e2-81e9-777b479f5cbd,0,B,0
1460,rec-0-org,2518f221-41ca-488a-8ae8-8292919c18a2,0,A,
8187,rec-1-org,63d67af9-887c-4f01-906e-3b02aaee6e05,1,A,
3699,rec-1-dup-0,247381c9-6fef-4c57-8fba-8d46b3d262f5,1,B,0
4739,rec-2-dup-0,7d8dfe0a-d742-4f19-a2c2-f5de66efdb0f,2,B,0
...,...,...,...,...,...
2635,rec-4997-org,e417effd-067f-4c47-b37c-10e34c4d737a,4997,A,
7720,rec-4998-dup-0,ca332255-9025-40cb-b107-c044251c1bd5,4998,B,0
997,rec-4998-org,bd5cd40c-1e48-42a6-b6df-1fbe91f9cbf3,4998,A,
3729,rec-4999-dup-0,8521b26e-2add-43c5-b4f8-fd2984c22d20,4999,B,0


In [6]:
unique_febrl_ids = sorted(list(df_febrl["febrl_id"].unique()))

# Pair each original (dataset "A") with its duplicate (dataset "B") in one merge
# rather than filtering the whole frame twice for every febrl_id.
# drop_duplicates keeps the first row per febrl_id on each side.
pair_columns = ["febrl_id", "person_id"]
first_originals = df_febrl.loc[df_febrl["dataset"] == "A", pair_columns].drop_duplicates("febrl_id")
first_dupes = df_febrl.loc[df_febrl["dataset"] == "B", pair_columns].drop_duplicates("febrl_id")

# The outer join keeps ids that are missing one side so they can be reported below.
label_pairs = first_originals.merge(
    first_dupes, on="febrl_id", how="outer", suffixes=("_A", "_B")
).sort_values("febrl_id")

unpaired_ids = label_pairs.loc[
    label_pairs[["person_id_A", "person_id_B"]].isna().any(axis=1), "febrl_id"
].tolist()
if unpaired_ids:
    raise ValueError(
        f"febrl_ids without both an original and a dupe record: {unpaired_ids}"
    )

# One positive label per febrl_id, in ascending febrl_id order.
training_labels = [
    {"person_id_A": person_id_A, "person_id_B": person_id_B, "ground_truth": 1}
    for person_id_A, person_id_B in zip(label_pairs["person_id_A"], label_pairs["person_id_B"])
]

In [7]:
# One row per label dict collected in training_labels.
df_labels = pd.DataFrame(training_labels)

# Persist the labels to TRAINING_LABELS, leaving the row index out of the file.
df_labels.to_csv(TRAINING_LABELS, index=False)
df_labels.head()

Unnamed: 0,person_id_A,person_id_B,ground_truth
0,2518f221-41ca-488a-8ae8-8292919c18a2,50653c69-d5e8-45e2-81e9-777b479f5cbd,1
1,63d67af9-887c-4f01-906e-3b02aaee6e05,247381c9-6fef-4c57-8fba-8d46b3d262f5,1
2,3ea59f35-9a5e-4348-a822-307eb565f75c,7d8dfe0a-d742-4f19-a2c2-f5de66efdb0f,1
3,1c7b9315-3ad0-42a0-8576-a95ac882e3da,126612a1-2bcc-42d5-9d18-a340970a4b47,1
4,4a7300de-184a-47f3-8bd1-7c9be8c10b15,2cafeb08-703a-4fb5-9e21-27a05cf32d19,1


### Drop matching id fields from training data

In [8]:
# Columns, in output order, that the dataset cells below select from df_febrl.
DATASET_COLUMNS = [
    "person_id",
    "first_name",
    "surname",
    "street_number",
    "address_1",
    "address_2",
    "suburb",
    "postcode",
    "state",
    "date_of_birth",
    "age",
    "phone_number",
    "soc_sec_id",
]

In [9]:
# Keep only the dataset marker plus the published columns. The selection already
# discards rec_id, febrl_id and dupe_no, so no explicit drop is needed. An explicit
# drop would raise KeyError if this cell were run a second time.
df_febrl = df_febrl[["dataset"] + DATASET_COLUMNS]

In [10]:
# Preview the first rows of df_febrl.
df_febrl.head()

Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,B,af1c5486-9593-46fa-a778-2a758811c693,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
1,B,c21ddbd5-45f0-4854-9c06-b6b8b95ab1ec,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
2,B,a2a86900-6dca-45e7-b847-dac0add9da90,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
3,A,fc543392-3ba7-4cd9-be43-83a9779bd629,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
4,A,8d12d625-9e7b-4b79-b587-d546a5a8fdc8,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518


## Separate dataset A (original records) and dataset B (dupes)

### Dataset A

In [11]:
# Dataset A: rows flagged "A", restricted to the dataset columns, re-indexed from 0,
# with the id column exposed as person_id_A.
df_A = (
    df_febrl.loc[df_febrl["dataset"] == "A", DATASET_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={'person_id': 'person_id_A'})
)

# Write the dataset to TRAINING_DATASET_A without the row index.
df_A.to_csv(TRAINING_DATASET_A, index=False)

df_A.head()

Unnamed: 0,person_id_A,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,fc543392-3ba7-4cd9-be43-83a9779bd629,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
1,8d12d625-9e7b-4b79-b587-d546a5a8fdc8,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518
2,a7fa0b25-26ce-4d9b-b8d7-ea629b756466,ruby,butt,103,,wollartukkee,east fremantle,4814,wa,19430120.0,30.0,02 88839517,3225206
3,55b20159-d966-4a21-9687-3264824a6666,marcus,rees,5,charlick place,lindoran,ballarat,4216,nsw,,27.0,08 17239266,7355062
4,0d4f67ac-5fbc-4e6d-9918-34571e04e203,jassim,belperio,36,john russell circuit,,eastwood,3131,nsw,19460129.0,20.0,02 61510457,9190750


### Dataset B

In [12]:
# Dataset B: rows flagged "B", restricted to the dataset columns, re-indexed from 0,
# with the id column exposed as person_id_B.
df_B = (
    df_febrl.loc[df_febrl["dataset"] == "B", DATASET_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={'person_id': 'person_id_B'})
)

# Write the dataset to TRAINING_DATASET_B without the row index.
df_B.to_csv(TRAINING_DATASET_B, index=False)

df_B.head()

Unnamed: 0,person_id_B,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,af1c5486-9593-46fa-a778-2a758811c693,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
1,c21ddbd5-45f0-4854-9c06-b6b8b95ab1ec,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
2,a2a86900-6dca-45e7-b847-dac0add9da90,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
3,93d34db6-c81b-41eb-8992-da3e2de83e88,willjam,dud,83,purbrick street,glenveagh,muttabrra,6100,,19871212.0,23.0,07 54557966,7073899
4,753b6dec-7b97-4f99-9979-810f347d058a,lucy,baillie,34,hurley street,,glen iqnnes,5038,sa,19310448.0,,08 19431835,6880723
