# In [None]:  (notebook cell marker left over from the Jupyter export)
from splink.datasets import splink_datasets
from splink.duckdb.linker import DuckDBLinker
import altair as alt
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
import pandas as pd 
# Allow up to 1000 rows when pandas renders a DataFrame.
pd.options.display.max_rows = 1000

import os  # used only so the database credentials below can be overridden

# Local Spark session; the SQL Server JDBC driver is requested as a package.
sparkdriver = (
    SparkSession.builder.master('local').appName('demoapp')
    .config('spark.jars.packages', 'com.microsoft.sqlserver:mssql-jdbc:9.4.1.jre8')
    .getOrCreate()
)

# Connection options shared by every JDBC read. The user and password can be
# supplied through the SPLINK_DB_USER / SPLINK_DB_PASSWORD environment
# variables; when those are unset the previous hard-coded values are used, so
# existing runs behave exactly as before.
_JDBC_OPTIONS = {
    'url': 'jdbc:sqlserver://localhost:47777;databaseName=splink',
    'driver': 'com.microsoft.sqlserver.jdbc.SQLServerDriver',
    'user': os.environ.get('SPLINK_DB_USER', 'datahubadmin'),
    'password': os.environ.get('SPLINK_DB_PASSWORD', 'datahub'),
}


def _read_query(query):
    """Load the result of *query* over JDBC using the shared connection options.

    Args:
        query: SQL text passed to the JDBC reader's ``query`` option.

    Returns:
        Whatever ``DataFrameReader.load()`` returns for that query.
    """
    reader = sparkdriver.read.format('jdbc')
    for option_name, option_value in _JDBC_OPTIONS.items():
        reader = reader.option(option_name, option_value)
    return reader.option('query', query).load()


# The two input tables that are linked further down.
df_1 = _read_query('select * from dbo.inputfull_1')
df_2 = _read_query('select * from dbo.inputfull_2')

# Expose the Java class as a Spark SQL function named 'jaro_winkler' that
# returns a double.
# NOTE(review): the class must be on the Spark classpath, but only the
# mssql-jdbc package is configured above - confirm the UDF jar is supplied
# some other way.
sparkdriver.udf.registerJavaFunction(
    'jaro_winkler',
    'uk.gov.moj.dash.linkage.JaroWinklerSimilarity',
    DoubleType(),
)



from splink.spark.blocking_rule_library import block_on
from splink.spark.linker import SparkLinker

# Minimal settings for exploratory analysis: only the link type and the
# blocking rules are given, no comparisons yet.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on(list(column_pair))
        for column_pair in (
            ("first_name", "surname"),
            ("surname", "dob"),
            ("first_name", "dob"),
            ("postcode_fake", "first_name"),
        )
    ],
)

linker = SparkLinker([df_1, df_2], settings, spark=sparkdriver)

# Profile two raw columns plus the first four characters of dob.
linker.profile_columns(
    ["first_name", "postcode_fake", "substr(dob, 1,4)"],
    top_n=10,
    bottom_n=5,
)

# Chart the cumulative number of comparisons produced by the blocking rules.
linker.cumulative_num_comparisons_from_blocking_rules_chart()

import splink.spark.comparison_library as cl
import splink.spark.comparison_template_library as ctl

# Settings with the exploratory blocking rules plus a full set of comparisons.
# NOTE(review): `settings` is reassigned further down before any linker is
# built from this value, so this configuration is currently unused.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=[
        block_on(list(column_pair))
        for column_pair in (
            ("first_name", "surname"),
            ("surname", "dob"),
            ("first_name", "dob"),
            ("postcode_fake", "first_name"),
        )
    ],
    comparisons=[
        # Name columns: template comparison with term-frequency adjustments.
        *[
            ctl.name_comparison(name_column, term_frequency_adjustments=True)
            for name_column in ("first_name", "surname")
        ],
        ctl.date_comparison(
            "dob",
            cast_strings_to_date=True,
            invalid_dates_as_null=True,
        ),
        ctl.postcode_comparison("postcode_fake"),
        # Categorical columns: exact match with term-frequency adjustments.
        *[
            cl.exact_match(exact_column, term_frequency_adjustments=True)
            for exact_column in ("birth_place", "occupation")
        ],
    ],
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    max_iterations=10,
    em_convergence=0.01,
)
import splink.spark.comparison_library as cl
import splink.spark.comparison_template_library as ctl
from splink.spark.blocking_rule_library import block_on
from splink.spark.linker import SparkLinker

# Settings for the linker that the training and prediction steps below use.
# NOTE(review): link_type is "dedupe_only" while two input tables are passed
# to the linker - confirm Splink accepts this combination, or whether
# "link_only" / "link_and_dedupe" was intended.
settings = dict(
    link_type="dedupe_only",
    comparisons=[
        *[
            ctl.name_comparison(name_column)
            for name_column in ("first_name", "surname")
        ],
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("birth_place", term_frequency_adjustments=True),
    ],
    blocking_rules_to_generate_predictions=[
        block_on("first_name"),
        # Blocking rules can also be written directly as SQL.
        "l.surname = r.surname",
    ],
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    em_convergence=0.01,
)

linker = SparkLinker([df_1, df_2], settings, spark=sparkdriver)
# SQL conditions handed to estimate_probability_two_random_records_match
# below, together with an assumed recall of 0.6.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
]

# Point Spark's checkpointing at a local directory.
# NOTE(review): this is a machine-specific absolute path.
sc = sparkdriver.sparkContext
sc.setCheckpointDir("C:/Users/seanj/Documents/MyProjects/Splink")

# Estimate the prior match probability, then the u probabilities from a
# random sample of at most 5e5 record pairs.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)
linker.estimate_u_using_random_sampling(max_pairs=5e5)

# Two expectation-maximisation sessions, each blocked on different columns.
training_blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
training_session_fname_sname = linker.estimate_parameters_using_expectation_maximisation(
    training_blocking_rule
)

training_blocking_rule = "l.dob = r.dob"
training_session_dob = linker.estimate_parameters_using_expectation_maximisation(
    training_blocking_rule
)

# Keep only pairs scored at or above a 0.9 match probability and export them.
results = linker.predict(threshold_match_probability=0.9)
df_e = results.as_pandas_dataframe()

output_path = 'OutputFull.csv'
df_e.to_csv(output_path)

# The message previously named "results.csv", which is not the file written;
# build it from the real path so the two cannot drift apart again.
print(f"Results written to {output_path} successfully!")