In [9]:
import os
from getpass import getpass

import altair as alt
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from splink.datasets import splink_datasets
from splink.duckdb.linker import DuckDBLinker
# Allow up to 1000 rows when a pandas object is displayed.
pd.options.display.max_rows = 1000

# Connection details are taken from the environment so that no credentials are
# stored in the notebook. Anything missing is prompted for interactively; the
# password prompt does not echo what is typed.
jdbc_url = os.environ.get(
    'SPLINK_JDBC_URL', 'jdbc:sqlserver://localhost:47777;databaseName=splink'
)
jdbc_user = os.environ.get('SPLINK_DB_USER') or input('SQL Server user: ')
jdbc_password = os.environ.get('SPLINK_DB_PASSWORD') or getpass('SQL Server password: ')

# Local Spark session with the SQL Server JDBC driver requested as a package.
sparkdriver = (
    SparkSession.builder.master('local').appName('demoapp')
    .config('spark.jars.packages', 'com.microsoft.sqlserver:mssql-jdbc:9.4.1.jre8')
    .getOrCreate()
)

# Read the whole input table over JDBC into a Spark DataFrame.
df = (
    sparkdriver.read.format('jdbc')
    .option('url', jdbc_url)
    .option('driver', 'com.microsoft.sqlserver.jdbc.SQLServerDriver')
    .option('user', jdbc_user)
    .option('password', jdbc_password)
    .option('query', 'select * from dbo.inputfull')
    .load()
)

# Expose the Java Jaro-Winkler implementation to Spark SQL as `jaro_winkler`.
# NOTE(review): the jar providing this class is not configured in this cell -
# confirm it is on the Spark classpath.
sparkdriver.udf.registerJavaFunction('jaro_winkler', 'uk.gov.moj.dash.linkage.JaroWinklerSimilarity', DoubleType())


# Quick sanity check of the loaded rows.
df.show(5)


+----------+------------+--------------------+-----------------+----------+---------+----------+-----------+-------------+------+----------+
| unique_id|     cluster|           full_name|first_and_surname|first_name|  surname|       dob|birth_place|postcode_fake|gender|occupation|
+----------+------------+--------------------+-----------------+----------+---------+----------+-----------+-------------+------+----------+
|Q2296770-1|2296770.0000|thomas clifford, ...| thomas chudleigh|    thomas|chudleigh|1630-08-01|      devon|     tq13 8df|  male|politician|
|Q2296770-2|2296770.0000| thomas of chudleigh| thomas chudleigh|    thomas|chudleigh|1630-08-01|      devon|     tq13 8df|  male|politician|
|Q2296770-3|2296770.0000|tom 1st baron cli...|    tom chudleigh|       tom|chudleigh|1630-08-01|      devon|     tq13 8df|  male|politician|
|Q2296770-4|2296770.0000|thomas 1st chudleigh| thomas chudleigh|    thomas|chudleigh|1630-08-01|      devon|     tq13 8hu|  NULL|politician|
|Q2296770-5|2

In [10]:

from splink.spark.blocking_rule_library import block_on
from splink.spark.linker import SparkLinker

# Minimal settings for exploratory analysis only: a dedupe job plus the
# candidate blocking rules, with no comparisons defined yet.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on(list(columns))
        for columns in (
            ("first_name", "surname"),
            ("surname", "dob"),
            ("first_name", "dob"),
            ("postcode_fake", "first_name"),
        )
    ],
)

linker = SparkLinker(df, settings, spark=sparkdriver)

# Value distributions for two raw columns and the first four characters of dob.
columns_to_profile = ["first_name", "postcode_fake", "substr(dob, 1,4)"]
linker.profile_columns(columns_to_profile, top_n=10, bottom_n=5)

# Chart of the cumulative comparison count generated by the blocking rules.
linker.cumulative_num_comparisons_from_blocking_rules_chart()



In [11]:

import splink.spark.comparison_template_library as ctl
import splink.spark.comparison_library as cl

# Fuller model definition: the same four blocking rules as the exploratory
# settings plus six comparisons.
# NOTE(review): `settings` is assigned again in the next cell before a linker
# is built from it, so this dictionary is not the one the linker receives.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on(columns)
        for columns in [
            ["first_name", "surname"],
            ["surname", "dob"],
            ["first_name", "dob"],
            ["postcode_fake", "first_name"],
        ]
    ],
    comparisons=[
        # Name columns, with term-frequency adjustments.
        *[
            ctl.name_comparison(column, term_frequency_adjustments=True)
            for column in ("first_name", "surname")
        ],
        # dob is passed as a string: cast to date, unparseable values become null.
        ctl.date_comparison("dob", cast_strings_to_date=True, invalid_dates_as_null=True),
        ctl.postcode_comparison("postcode_fake"),
        # Exact-match columns, with term-frequency adjustments.
        *[
            cl.exact_match(column, term_frequency_adjustments=True)
            for column in ("birth_place", "occupation")
        ],
    ],
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    max_iterations=10,
    em_convergence=0.01,
)


In [12]:
import splink.spark.comparison_library as cl
import splink.spark.comparison_template_library as ctl
from splink.spark.blocking_rule_library import block_on

# Settings handed to the SparkLinker in the following cell: four comparisons
# and two blocking rules for generating predictions.
settings = {
    "link_type": "dedupe_only",
    "comparisons": [
        *(ctl.name_comparison(name_column) for name_column in ("first_name", "surname")),
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("birth_place", term_frequency_adjustments=True),
    ],
    "blocking_rules_to_generate_predictions": [
        block_on("first_name"),
        # Blocking rules may also be given directly as SQL.
        "l.surname = r.surname",
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    "em_convergence": 0.01,
}

In [13]:
from splink.spark.linker import SparkLinker

# Build the linker from the settings defined in the previous cell.
linker = SparkLinker(df, settings, spark=sparkdriver)

# Rules used to estimate the probability that two random records match.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
]

# Spark checkpoint directory. Set SPLINK_CHECKPOINT_DIR to run on another
# machine; the fallback is the original author's local folder.
sc = sparkdriver.sparkContext  # Access the SparkContext
checkpoint_dir = os.environ.get(
    "SPLINK_CHECKPOINT_DIR", "C:/Users/seanj/Documents/MyProjects/Splink"
)
sc.setCheckpointDir(checkpoint_dir)  # Set checkpoint directory


# recall=0.6 is the assumed share of true matches that the deterministic rules
# recover - TODO confirm this figure for the dataset.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)

# Estimate u probabilities from at most 5e5 randomly sampled record pairs.
linker.estimate_u_using_random_sampling(max_pairs=5e5)


--WARN-- 
 You are using datediff comparison
                        with str-casting and ANSI is not enabled. Bad dates
                        e.g. 1999-13-54 will not trigger an exception but will
                        classed as comparison level = "ELSE". Ensure date strings
                        are cleaned to remove bad dates 

Probability two random records match is estimated to be  0.000771.
This means that amongst all possible pairwise record comparisons, one in 1,297.74 are expected to match.  With 1,279,041,753 total possible comparisons, we expect a total of around 985,588.33 matching pairs
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - birth_place (no m values are trained).


In [14]:

# Pass 1: block on first name + surname. The recorded log shows that this
# estimates parameters for dob and birth_place, and that first_name and
# surname cannot be estimated because they are used in the blocking rule.
training_blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
training_session_fname_sname = (
    linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)

# Pass 2: block on dob instead.
# NOTE(review): presumably this is what yields estimates for first_name and
# surname - the recorded log is cut off before this pass, so confirm on re-run.
training_blocking_rule = "l.dob = r.dob"
training_session_dob = (
    linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - dob
    - birth_place

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was -0.512 in the m_probability of dob, level `Exact match`
Iteration 2: Largest change in params was -0.0845 in probability_two_random_records_match
Iteration 3: Largest change in params was -0.0569 in the m_probability of birth_place, level `All other comparisons`
Iteration 4: Largest change in params was 0.0555 in the m_probability of birth_place, level `Exact match`
Iteration 5: Largest change in params was -0.0508 in the m_probability of birth_place, level `All other comparisons`
Iteration 6: Largest change in params was 0.0422 in the m_probability of bir

In [15]:
# Score candidate pairs with the trained model, passing a match-probability
# threshold of 0.9 to the linker.
results = linker.predict(
    threshold_match_probability=0.9,
)


In [16]:
# Pull the predictions into pandas and write them to a CSV file.
df_e = results.as_pandas_dataframe()
output_csv = 'OutputFull.csv'
df_e.to_csv(output_csv)

# The message is built from the same variable as the path passed to to_csv,
# so it always names the file that was actually written.
print(f"Results written to {output_csv} successfully!")

Results written to results.csv successfully!
