In [None]:
# Optional setup: uncomment the next line to install Splink into the current environment.
#!pip install splink

In [None]:
# Optional setup: uncomment the next line to upgrade pip itself.
#!pip install --upgrade pip

In [44]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

import pandas as pd

In [45]:
# Load the demo dataset into a pandas DataFrame.
# The path is relative, so it resolves against the kernel's current working
# directory; launch the notebook from a directory where ./tests/datasets exists.
df = pd.read_csv(
    filepath_or_buffer="./tests/datasets/fake_1000_from_splink_demos.csv",
)

In [46]:
# Preview the input records. With a negative n, DataFrame.head returns every
# row except the last |n| (here: all but the last 5); the notebook display
# truncates the middle of the frame.
# NOTE(review): if only a short preview was intended, df.head(5) is the usual
# call - confirm which was meant.
df.head(-5)

Unnamed: 0,unique_id,first_name,surname,dob,city,email,group
0,0,Julia,,2015-10-29,London,hannah88@powers.com,0
1,1,Julia,Taylor,2015-07-31,London,hannah88@powers.com,0
2,2,Julia,Taylor,2016-01-27,London,hannah88@powers.com,0
3,3,Julia,Taylor,2015-10-29,,hannah88opowersc@m,0
4,4,oNah,Watson,2008-03-23,Bolton,matthew78@ballard-mcdonald.net,1
...,...,...,...,...,...,...,...
990,990,Sophi,Ellis,1981-10-21,Sheffield,danielle24@garcia-jones.net,178
991,991,liL,Young,1995-09-05,London,wesleykidd@miller.com,179
992,992,ily,Young,1995-09-05,London,wesleykidd@miller.com,179
993,993,,Young,1995-11-09,London,weslcykidd@miler.eom,179


In [47]:
# Splink model settings for this run.
#   - link_type: "dedupe_only", i.e. a single input table is compared with itself.
#   - blocking_rules_to_generate_predictions: SQL conditions on the left (l) and
#     right (r) record of a pair, used when generating the pairs scored by predict().
#   - comparisons: one entry per column. first_name uses a Levenshtein comparison
#     with a threshold of 2; the remaining columns use exact matching, with
#     term-frequency adjustments switched on for city.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        "l.first_name = r.first_name",
        "l.surname = r.surname",
    ],
    comparisons=[
        levenshtein_at_thresholds("first_name", 2),
        exact_match("surname"),
        exact_match("dob"),
        exact_match("city", term_frequency_adjustments=True),
        exact_match("email"),
    ],
)

In [48]:
# Create a DuckDB-backed linker over the input frame with the settings dict,
# then estimate the u probabilities from randomly sampled record pairs
# (max_pairs=1e6 is the sampling budget passed to Splink).
# After this step the m probabilities are still untrained, as the log below reports.
linker = DuckDBLinker(df, settings)
linker.estimate_u_using_random_sampling(max_pairs=1e6)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).


In [49]:
# First EM training pass: block on first_name AND surname.
# Comparisons that appear in the blocking rule cannot be estimated in the same
# pass (see the log below), so this pass yields m estimates for dob, city and
# email only.
blocking_rule_for_training = "l.first_name = r.first_name and l.surname = r.surname"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - dob
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was 0.244 in the m_probability of city, level `All other comparisons`
Iteration 2: Largest change in params was 0.0943 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0322 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0158 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.00919 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.00586 in probability_two_random_records_match
Iteration 7: Largest change in params was 0.

<EMTrainingSession, blocking on l.first_name = r.first_name and l.surname = r.surname, deactivating comparisons first_name, surname>

In [50]:
# Second EM training pass: block on dob instead, so that first_name and surname
# (skipped in the previous pass) receive m estimates; dob itself is excluded
# from estimation in this pass.
# Note: this reassigns blocking_rule_for_training from the previous cell.
blocking_rule_for_training = "l.dob = r.dob"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.dob = r.dob

Parameter estimates will be made for the following comparison(s):
    - first_name
    - surname
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - dob

Iteration 1: Largest change in params was 0.499 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.285 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0954 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0319 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.0145 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.00788 in probability_two_random_records_match
Iteration 7: Largest change in params was 0.00476 in probability_two_random_records_match
Iteration 8: Lar

<EMTrainingSession, blocking on l.dob = r.dob, deactivating comparisons dob>

In [51]:
# Score record pairs with the trained model, then preview the first 10 scored
# pairs as a pandas DataFrame (includes match_weight and match_probability).
pairwise_predictions = linker.predict()
pairwise_predictions.as_pandas_dataframe(limit=10)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,first_name_l,first_name_r,gamma_first_name,surname_l,surname_r,gamma_surname,dob_l,dob_r,gamma_dob,city_l,city_r,gamma_city,email_l,email_r,gamma_email,match_key
0,-1.313057,0.286971,0,3,Julia,Julia,2,,Taylor,-1,2015-10-29,2015-10-29,1,London,,-1,hannah88@powers.com,hannah88opowersc@m,0,0
1,-4.119211,0.054412,1,3,Julia,Julia,2,Taylor,Taylor,1,2015-07-31,2015-10-29,0,London,,-1,hannah88@powers.com,hannah88opowersc@m,0,0
2,-4.119211,0.054412,2,3,Julia,Julia,2,Taylor,Taylor,1,2016-01-27,2015-10-29,0,London,,-1,hannah88@powers.com,hannah88opowersc@m,0,0
3,-12.447415,0.000179,5,633,Noah,Noah,2,Watson,Gibson,0,2008-03-23,1987-08-16,0,Bolton,Ldnon,0,matthew78@ballard-mcdonald.net,avazquez@banks.com,0,0
4,-12.447415,0.000179,7,633,Noah,Noah,2,Watson,Gibson,0,2008-02-05,1987-08-16,0,tolon,Ldnon,0,matthew78@ballard-mcdonald.net,avazquez@banks.com,0,0
5,-10.791578,0.000564,9,633,Noah,Noah,2,Watson,Gibson,0,2008-01-19,1987-08-16,0,Bolton,Ldnon,0,,avazquez@banks.com,-1,0
6,-9.876103,0.001063,12,633,Noah,Noah,2,,Gibson,-1,2008-03-23,1987-08-16,0,Blotn,Ldnon,0,,avazquez@banks.com,-1,0
7,-9.371038,0.001508,16,948,Amelia,Amelia,2,Alexander,Johnson,0,1983-05-19,1985-05-04,0,Glaogw,,-1,,olsondanielle@martinez.com,-1,0
8,-11.026874,0.000479,17,948,Amelia,Amelia,2,Alexander,Johnson,0,1983-04-30,1985-05-04,0,Glasgow,,-1,icampbeal@lllen-lews.org,olsondanielle@martinez.com,0,0
9,-11.026874,0.000479,18,948,Amelia,Amelia,2,Alexander,Johnson,0,1983-05-19,1985-05-04,0,Glasgow,,-1,icampbell@allen-lewis.org,olsondanielle@martinez.com,0,0


In [52]:
# Group records into clusters from the pairwise predictions using a threshold
# of 0.95, then preview up to 50 rows; each record gets a cluster_id.
# NOTE(review): 0.95 is presumably a match-probability cut-off - confirm
# against the Splink API for this method.
clusters = linker.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.95)
clusters.as_pandas_dataframe(limit=50)

Completed iteration 1, root rows count 10
Completed iteration 2, root rows count 1
Completed iteration 3, root rows count 0


Unnamed: 0,cluster_id,unique_id,first_name,surname,dob,city,email,group,tf_city
0,0,0,Julia,,2015-10-29,London,hannah88@powers.com,0,0.28844
1,1,1,Julia,Taylor,2015-07-31,London,hannah88@powers.com,0,0.28844
2,1,2,Julia,Taylor,2016-01-27,London,hannah88@powers.com,0,0.28844
3,3,3,Julia,Taylor,2015-10-29,,hannah88opowersc@m,0,
4,4,4,oNah,Watson,2008-03-23,Bolton,matthew78@ballard-mcdonald.net,1,0.015713
5,4,5,Noah,Watson,2008-03-23,Bolton,matthew78@ballard-mcdonald.net,1,0.015713
6,6,6,Watson,Noah,2008-03-23,,matthew78@ballard-mcdonald.net,1,
7,7,7,Noah,Watson,2008-02-05,tolon,matthew78@ballard-mcdonald.net,1,0.001122
8,6,8,Watson,Noah,2008-06-15,Bolton,matthew78@ballard-mcdonald.net,1,0.015713
9,9,9,Noah,Watson,2008-01-19,Bolton,,1,0.015713
