In [4]:
import pandas as pd
import os
from splink import Linker, SettingsCreator, block_on, DuckDBAPI
import splink.comparison_library as cl
from splink.exploratory import completeness_chart, profile_columns

#Loading the sample dataset 
file_path = '../data/febrl_sample_dataset.csv'
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,unique_id
0,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209.0,6988048
1,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219.0,7364009
2,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210.0,2635962
3,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612.0,9004242
4,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024.0,8099933


In [None]:
# checking the completness of each column
db_api = DuckDBAPI()
completeness_chart(df, db_api=db_api)

In [6]:
#runing exploratory analysis for sample dataset 
profile_columns(df, db_api=DuckDBAPI(), top_n=10, bottom_n=5)

In [21]:
#choosing blocking rules to optimise runtime
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

blocking_rules_for_analysis = [
    block_on("substr(given_name, 1,1)", "surname"),
    block_on("surname"),
    block_on("postcode"),
    "l.given_name = r.given_name and levenshtein(l.surname, r.surname) < 2",
]


cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=df,
    blocking_rules=blocking_rules_for_analysis,
    db_api=db_api,
    link_type="dedupe_only",
)

In [22]:
# Configure Splink settings with choosen blocking rule
settings = SettingsCreator(
    link_type="dedupe_only",  # Deduplication mode
    blocking_rules_to_generate_predictions=[
        block_on("surname"),  
        block_on("postcode"),
        block_on("given_name")  
    ],
    comparisons=[
        cl.NameComparison("given_name").configure(term_frequency_adjustments=True),
        cl.NameComparison("surname").configure(term_frequency_adjustments=True),
        cl.ExactMatch("date_of_birth"),
        cl.ExactMatch("postcode").configure(term_frequency_adjustments=True),
        cl.ExactMatch("unique_id"), 
        cl.ExactMatch("state"),  
        cl.LevenshteinAtThresholds("address_1", [2, 5, 10]),
    ],
    retain_intermediate_calculation_columns=True,
)

In [12]:
#Initialize the Linker
db_api = DuckDBAPI()
linker = Linker(df, settings, db_api=db_api)

In [None]:
#  Train the Splink model to u and m parameter for the model
deterministic_rules = [
    "l.surname = r.surname AND l.postcode = r.postcode AND levenshtein(l.given_name, r.given_name) <= 1",
    "l.unique_id = r.unique_id",
]
linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.8)
linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)


linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("surname")
)
linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("postcode")
)



Probability two random records match is estimated to be  4.75e-05.
This means that amongst all possible pairwise record comparisons, one in 21,031.58 are expected to match.  With 499,500 total possible comparisons, we expect a total of around 23.75 matching pairs
You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.
----- Estimating u probabilities using random sampling -----
u probability not trained for unique_id - Exact match on unique_id (comparison vector value: 1). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - given_name (no m values are trained).
    - surname (no m values are trained).
    - date_of_birth (no m values are trained).
    -

<EMTrainingSession, blocking on l."postcode" = r."postcode", deactivating comparisons postcode>

In [None]:
# Run predictions and visuallising predictions with waterfall chart 
df_predictions = linker.inference.predict(threshold_match_probability=0.9)

records_to_view = df_predictions.as_record_dict(limit=5)
linker.visualisations.waterfall_chart(records_to_view, filter_nulls=False)

Blocking time: 0.02 seconds


Predict time: 0.25 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'unique_id':
    m values not fully trained
Comparison: 'unique_id':
    u values not fully trained


In [None]:
#Cluster duplicates 
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    df_predictions, threshold_match_probability=0.9
)

#Save deduplicated results to datafream
df_clusters = clusters.as_pandas_dataframe()
df_clusters

Completed iteration 1, num representatives needing updating: 0


Unnamed: 0,cluster_id,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,unique_id
0,7364009,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219.0,7364009
1,2635962,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210.0,2635962
2,2025650,darcie,turtur,10.0,blacket street,eureka,beverly hills,2263,nsw,,2025650
3,7661096,abbey,fit,13.0,kosciusko avenue,the wharf complex,yass,2594,nsw,19870510.0,7661096
4,2790666,isabella,lodder,156.0,messenger street,tongbong sanctuary,bayswater,4870,vic,19650714.0,2790666
...,...,...,...,...,...,...,...,...,...,...,...
995,9162672,,hefford,1.0,armstrong crescent,,payneham,5502,vic,19440805.0,9162672
996,9162672,,cogzell,1.0,armstrong crescent,cromdale,payneham,5502,vic,19440905.0,9162672
997,6794161,,durr,11.0,cochrane crescent,the rookery,south perth,2795,wa,19300521.0,6794611
998,9128171,nacoya,lowe,381.0,sherbrooke street,bright view,pimlico,2261,qld,19050204.0,9128171


In [23]:
#sorting clusterd datafream with cluster_id
df_clusters_sorted = df_clusters.sort_values(by='cluster_id', ascending=True)
df_clusters_sorted.head(40)



Unnamed: 0,cluster_id,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,unique_id
959,1023877,michael,belci,1.0,chisholm street,carrington garden,torquay,5085,sa,19300418.0,1023877
958,1023877,michael,belci,1.0,chisholmstreet,carrington garden,torquay,5085,sa,19300418.0,1023877
845,1025941,evangelia,,3335.0,naas road,wyncliffe,gwandalan,2060,act,19820125.0,1025941
842,1025941,evangelia,,4.0,naas rrad,wync liffe,gwandalan,2006,act,19820125.0,1025941
765,1030443,jacob,simmonds,54.0,jukes street,wee wilbertree,bunbury,7264,wa,19580404.0,1030443
754,1030443,jacob,simmonds,54.0,jukes stcreet,,bunbury,7264,wa,19580404.0,1030443
178,1041588,james,girdler,11.0,russell drysdale crescent,rowethorpe,mansfield,4551,tas,19420915.0,1041588
263,1041588,james,girdler,11.0,russell drysdale crescent,rowet hlrpe,mansfirld,4551,tas,19420915.0,1041588
180,1052176,caitlin,bishop,186.0,dolling crescent,clifden,tarana,3121,nsw,19721009.0,1052176
629,1052176,hollie,bishop,186.0,dolling crescent,clifden,tarana,3121,nsw,19721009.0,1052176
