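# Create training data files based on a FEBRL-generated dataset

This notebook splits the FEBRL records into dataset A (original records) and dataset B (duplicates), and writes a labels file pairing each original `person_id` with its duplicate's `person_id`.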

In [1]:
import os
import pandas as pd
import pathlib
import re
import uuid

from typing import Tuple, Optional

## Path constants

In [2]:
# Every input and output file lives in the directory the notebook is run from.
WORKING_DIR = pathlib.Path.cwd()

# Input: the FEBRL-generated CSV.
FEBRL_FILE = WORKING_DIR.joinpath("test_dataset.csv")

# Outputs: the two record sets and the label file that links them.
TRAINING_DATASET_A = WORKING_DIR.joinpath("training_a.csv")
TRAINING_DATASET_B = WORKING_DIR.joinpath("training_b.csv")
TRAINING_LABELS = WORKING_DIR.joinpath("training_labels.csv")

## Read in FEBRL file

In [18]:
# Load the FEBRL CSV, tidy the header (column names carry stray whitespace),
# drop the unused blocking column and rename given_name -> first_name.
df_febrl = (
    pd.read_csv(FEBRL_FILE)
    .rename(columns=str.strip)
    .drop(columns="blocking_number")
    .rename(columns={"given_name": "first_name"})
)

# Trim whitespace around the name fields. The .str accessor passes missing
# values through as NaN, whereas calling .strip() on each element raises
# AttributeError as soon as a name is missing.
for name_column in ("first_name", "surname"):
    df_febrl[name_column] = df_febrl[name_column].str.strip()

# Generate a random uuid for each row.
# uuid4 is random, so the ids differ on every run of this cell.
df_febrl["person_id"] = [str(uuid.uuid4()) for _ in range(len(df_febrl))]

df_febrl.head()

Unnamed: 0,rec_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id,person_id
0,rec-3714-dup-0,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142,c954056b-a8a6-4e25-a7a5-2984bcbb874e
1,rec-3675-dup-0,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114,306a9027-bdb1-4cf9-ac34-07fbd744d34a
2,rec-3399-dup-0,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665,19b72493-6ebf-47d6-bbec-00eb1e343bde
3,rec-298-org,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995,c4a4d5c9-4bcd-47a4-b621-595638b76717
4,rec-240-org,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518,ce79609d-33b4-423a-bdb4-6a6f0a1572a0


### Pull out id data to make training labels

In [19]:
# rec_id formats handled below: "rec-<n>-org" for an original record and
# "rec-<n>-dup-<k>" for the k-th duplicate of record n.
original_record_pattern = r"rec-(\d+)-org"
dupe_record_pattern = r"rec-(\d+)-dup-(\d+)"

def parse_rec_id(rec_id: str) -> Tuple[int, str, Optional[int]]:
    """Split a FEBRL rec_id into its components.

    Args:
        rec_id: Identifier such as "rec-298-org" or "rec-3714-dup-0".
            Surrounding whitespace is ignored.

    Returns:
        A tuple (record number, dataset, dupe number). dataset is "A" for an
        original record and "B" for a duplicate. dupe number is the integer
        duplicate index, or None for an original record.

    Raises:
        ValueError: If rec_id matches neither format in full.
    """
    candidate = rec_id.strip()

    # fullmatch rejects ids with trailing text (e.g. "rec-1-org-x"), which a
    # prefix-only match would silently accept.
    if m := re.fullmatch(original_record_pattern, candidate):
        return int(m.group(1)), "A", None
    if m := re.fullmatch(dupe_record_pattern, candidate):
        return int(m.group(1)), "B", int(m.group(2))

    raise ValueError(f"Unable to parse rec_id: {rec_id}")
        
# Parse every rec_id row by row; each parsed result becomes one row of a
# three-column frame, which is then assigned to the new id columns.
parsed_id_columns = ["febrl_id", "dataset", "dupe_no"]
parsed_ids = df_febrl.apply(
    lambda row: pd.Series(parse_rec_id(row["rec_id"])),
    axis=1,
)
df_febrl[parsed_id_columns] = parsed_ids

In [20]:
# Show the parsed id columns next to the source rec_id, ordered by febrl_id.
id_preview_columns = ["rec_id", "person_id", "febrl_id", "dataset", "dupe_no"]
df_febrl.loc[:, id_preview_columns].sort_values(by="febrl_id")

Unnamed: 0,rec_id,person_id,febrl_id,dataset,dupe_no
1050,rec-0-dup-0,1a084097-3004-484f-ad0e-b84b06a5bcb9,0,B,0
1460,rec-0-org,65d84bcc-3422-4929-9351-585f47614979,0,A,
8187,rec-1-org,09eff19d-c6b1-455c-8162-8158c8beca0b,1,A,
3699,rec-1-dup-0,27eed78b-aaa3-401d-a701-763495873e1b,1,B,0
4739,rec-2-dup-0,99ffc3cf-53dd-410d-b2b3-5f18d4d31341,2,B,0
...,...,...,...,...,...
2635,rec-4997-org,a81bbd00-ffb6-4a3b-8b3a-1809768ba521,4997,A,
7720,rec-4998-dup-0,8e4c24ee-1f20-4fcd-9389-9e6ed8cfac6c,4998,B,0
997,rec-4998-org,6d57a301-00fb-4afd-a0da-b0c6fa14f9e7,4998,A,
3729,rec-4999-dup-0,d97fa29f-6eb7-4eb0-9bcf-f06ef57ca39d,4999,B,0


In [21]:
unique_febrl_ids = sorted(df_febrl["febrl_id"].unique())

# Keep one (febrl_id, person_id) row per id on each side. drop_duplicates
# keeps the first occurrence in row order, i.e. the same row a per-id
# `.iloc[0]` lookup would pick.
# NOTE(review): if a record has several duplicates, only the first one in row
# order is labelled - confirm that is intended.
pair_columns = ["febrl_id", "person_id"]
first_originals = (
    df_febrl.loc[df_febrl["dataset"] == "A", pair_columns]
    .drop_duplicates("febrl_id")
)
first_dupes = (
    df_febrl.loc[df_febrl["dataset"] == "B", pair_columns]
    .drop_duplicates("febrl_id")
)

# A single merge replaces two full-frame scans per febrl_id. The outer join
# keeps ids that are missing one side so they can be reported below.
matched_pairs = first_originals.merge(
    first_dupes, on="febrl_id", how="outer", suffixes=("_A", "_B")
).sort_values("febrl_id")

unpaired = matched_pairs[
    matched_pairs[["person_id_A", "person_id_B"]].isna().any(axis=1)
]
if not unpaired.empty:
    raise ValueError(
        "febrl_id values without both an original and a duplicate record: "
        f"{unpaired['febrl_id'].tolist()}"
    )

# One positive label per (original, duplicate) pair, ordered by febrl_id.
training_labels = [
    {"person_id_A": person_id_A, "person_id_B": person_id_B, "label": 1}
    for person_id_A, person_id_B in zip(
        matched_pairs["person_id_A"], matched_pairs["person_id_B"]
    )
]

In [22]:
# Turn the list of label dicts into a frame and write it out as the label file.
df_labels = pd.DataFrame(data=training_labels)
df_labels.to_csv(path_or_buf=TRAINING_LABELS, index=False)

df_labels.head()

Unnamed: 0,person_id_A,person_id_B,label
0,65d84bcc-3422-4929-9351-585f47614979,1a084097-3004-484f-ad0e-b84b06a5bcb9,1
1,09eff19d-c6b1-455c-8162-8158c8beca0b,27eed78b-aaa3-401d-a701-763495873e1b,1
2,308bc53a-e7e7-4150-81de-b5094b63c196,99ffc3cf-53dd-410d-b2b3-5f18d4d31341,1
3,53ff2d35-a75f-4586-a5a6-1585c1c4dedd,722d1749-9bfb-43f8-9ab0-cd1f87f6ae16,1
4,674b7869-11c8-4106-8dd7-b892a668b993,fe16458e-fc6d-45b4-8071-cab0b7213d3a,1


### Drop matching id fields from training data

In [23]:
# Columns written to the training datasets, in output order.
DATASET_COLUMNS = [
    "person_id",
    "first_name",
    "surname",
    "street_number",
    "address_1",
    "address_2",
    "suburb",
    "postcode",
    "state",
    "date_of_birth",
    "age",
    "phone_number",
    "soc_sec_id",
]

In [24]:
# Keep only the dataset marker and the training columns. Selecting by name
# already leaves out rec_id, febrl_id and dupe_no, so no separate drop is
# needed - and unlike an explicit drop, this cell can be re-run without a
# KeyError for columns that are already gone.
df_febrl = df_febrl[["dataset"] + DATASET_COLUMNS]

In [25]:
# Preview the first five rows of the trimmed frame.
df_febrl.head(n=5)

Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,B,c954056b-a8a6-4e25-a7a5-2984bcbb874e,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
1,B,306a9027-bdb1-4cf9-ac34-07fbd744d34a,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
2,B,19b72493-6ebf-47d6-bbec-00eb1e343bde,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
3,A,c4a4d5c9-4bcd-47a4-b621-595638b76717,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
4,A,ce79609d-33b4-423a-bdb4-6a6f0a1572a0,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518


In [26]:
# Show any rows with a missing first name, then any with a missing surname.
# NOTE(review): isna() does not flag empty strings, so a name that is blank
# rather than missing would not show up here - confirm that is acceptable.
for name_column in ("first_name", "surname"):
    display(df_febrl[df_febrl[name_column].isna()])

Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id


Unnamed: 0,dataset,person_id,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id


## Separate dataset A (original records) and dataset B (dupes)

### Dataset A

In [27]:
# Dataset A: rows marked "A", restricted to the training columns, with a
# fresh 0..n-1 index and the id column renamed to person_id_A.
is_dataset_a = df_febrl["dataset"] == "A"
df_A = (
    df_febrl.loc[is_dataset_a, DATASET_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={'person_id': 'person_id_A'})
)

df_A.to_csv(TRAINING_DATASET_A, index=False)

df_A.head()

Unnamed: 0,person_id_A,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,c4a4d5c9-4bcd-47a4-b621-595638b76717,dakota,geraghty,69,maclean street,skeers property,dandenong north,2529,nsw,19380417.0,31.0,03 01783133,6629995
1,ce79609d-33b4-423a-bdb4-6a6f0a1572a0,james,colquhoun,118,conlon crescent,,birkdale,5043,nsw,19680112.0,,07 14327140,5350518
2,77d0b5c5-0dc5-458f-9eff-af3384e1f86a,ruby,butt,103,,wollartukkee,east fremantle,4814,wa,19430120.0,30.0,02 88839517,3225206
3,ef952695-f044-4dc5-8d91-98692d0fb617,marcus,rees,5,charlick place,lindoran,ballarat,4216,nsw,,27.0,08 17239266,7355062
4,81534f3f-0a8d-4627-b815-4e30f3c44ffd,jassim,belperio,36,john russell circuit,,eastwood,3131,nsw,19460129.0,20.0,02 61510457,9190750


### Dataset B

In [28]:
# Dataset B: rows marked "B", restricted to the training columns, with a
# fresh 0..n-1 index and the id column renamed to person_id_B.
is_dataset_b = df_febrl["dataset"] == "B"
df_B = (
    df_febrl.loc[is_dataset_b, DATASET_COLUMNS]
    .reset_index(drop=True)
    .rename(columns={'person_id': 'person_id_B'})
)

df_B.to_csv(TRAINING_DATASET_B, index=False)

df_B.head()

Unnamed: 0,person_id_B,first_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,age,phone_number,soc_sec_id
0,c954056b-a8a6-4e25-a7a5-2984bcbb874e,charlotte,leukg,301,domain street,locn 1699,alma bay,2710,vic,,29.0,07 05109263,6356142
1,306a9027-bdb1-4cf9-ac34-07fbd744d34a,callie,heerscgap,23,dudi lzce,,mill park,2324,tas,19820623.0,9.0,02 82637596,6775114
2,19b72493-6ebf-47d6-bbec-00eb1e343bde,alanx,nguyen,6,callaghan street,,albury,4575,nsw,19220115.0,27.0,08 82171717,5275665
3,95c7927d-a9d4-4add-8e31-641507891771,willjam,dud,83,purbrick street,glenveagh,muttabrra,6100,,19871212.0,23.0,07 54557966,7073899
4,8b60bfca-ba49-451d-a35f-8d4e6822c8b0,lucy,baillie,34,hurley street,,glen iqnnes,5038,sa,19310448.0,,08 19431835,6880723
