In [1]:
from pipeline.util.platform import start_spark, start_spark_local
from pipeline.util.storage import read_parquet, write_parquet, write_repart_parquet
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover, Tokenizer, NGram, HashingTF, MinHashLSH, RegexTokenizer, SQLTransformer
from pyspark.sql import functions as F

In [2]:
# Bootstrap Spark through the project helper. It yields three values, bound
# here as the session, a logger handle and a config object (roles inferred
# from the names; the helper itself is not visible in this notebook).
# NOTE(review): `loggrer` and 'addessmatch_etl_job' look like typos; both are
# kept as-is so existing references and the registered app name do not change.
spark, loggrer, conf = start_spark(
    app_name='addessmatch_etl_job',
)

local


In [3]:
# Load the AddressBase reference data from the curated storage root, whose
# location comes from the 'storage.curated' Spark conf entry.
addressbase_path = "{storage_curated}/data/in/addressbase".format(
    storage_curated=spark.conf.get('storage.curated'),
)
df_abmatch = read_parquet(spark, addressbase_path)

In [4]:
# Fixed seed for MinHashLSH's random hash functions. Without an explicit seed
# PySpark falls back to a default that is not guaranteed to be stable between
# Python sessions, so pin it to keep the match set reproducible across runs.
LSH_SEED = 42

# Address-normalisation + MinHash pipeline, fitted on AddressBase:
#   lower-case -> word tokens -> drop stop words -> re-join the words ->
#   split into single characters -> character bigrams -> hashed term
#   frequencies -> MinHash signatures.
model = Pipeline(stages=[
    SQLTransformer(statement="SELECT *, lower(ADDRESS) lower FROM __THIS__"),
    Tokenizer(inputCol="lower", outputCol="token"),
    StopWordsRemover(inputCol="token", outputCol="stop"),
    SQLTransformer(statement="SELECT *, concat_ws(' ', stop) concat FROM __THIS__"),
    # An empty split pattern breaks the string into individual characters.
    RegexTokenizer(pattern="", inputCol="concat", outputCol="char", minTokenLength=1),
    NGram(n=2, inputCol="char", outputCol="ngram"),
    HashingTF(inputCol="ngram", outputCol="vector"),
    MinHashLSH(inputCol="vector", outputCol="lsh", numHashTables=3, seed=LSH_SEED),
]).fit(df_abmatch)

# Build the transformed frame in one expression (no in-place rebinding) and
# cache it: it is counted here and reused by the similarity join later, and
# without caching the whole pipeline would be recomputed for every action.
# Rows with no bigrams are dropped because they hash to an all-zero vector,
# which MinHashLSH cannot produce a signature for.
df_abmatchtrans = (
    model.transform(df_abmatch)
    .filter(F.size(F.col("ngram")) > 0)
    .cache()
)
print(f"Example transformation ({df_abmatchtrans.count()} addresses left):")

Example transformation (51813 addresses left):


In [5]:
# Load the building addresses that are to be matched against AddressBase,
# from the same curated storage root ('storage.curated' Spark conf entry).
buildingaddress_path = "{storage_curated}/data/in/buildingaddress".format(
    storage_curated=spark.conf.get('storage.curated'),
)
df_bldadr = read_parquet(spark, buildingaddress_path)

In [6]:
# Use the previously fitted pipeline so both datasets go through the same
# hashing and MinHash functions; signatures are only comparable when they
# come from the same fitted model.
# NOTE(review): the first pipeline stage reads a column named ADDRESS; this
# relies on df_bldadr exposing one (Spark resolves column names
# case-insensitively by default) - confirm against the input schema.
#
# Cached because the frame is counted here and then reused by the similarity
# join; rows without any bigram are dropped as MinHashLSH cannot hash them.
df_bldadrtrans = (
    model.transform(df_bldadr)
    .filter(F.size(F.col("ngram")) > 0)
    .cache()
)
print(f"Example transformation ({df_bldadrtrans.count()} buildings left):")

Example transformation (10 buildings left):


In [28]:
# Disabled debug aid: prints every column of the transformed building frame
# without truncation (very wide output, including the n-gram and vector columns).
#df_bldadrtrans.show(truncate=False)

+---+--------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+--------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------

In [7]:
# Largest Jaccard distance at which a building / AddressBase pair is still
# kept as a candidate match (0 = identical bigram sets, 1 = disjoint).
MAX_JACCARD_DIST = 0.5

# The final pipeline stage is the fitted MinHashLSH model; it owns the join.
lsh_model = model.stages[-1]

# datasetA = buildings, datasetB = AddressBase. The result is cached because
# it feeds several actions below (count, parquet write, show, distinct);
# without caching the approximate join would be recomputed for each of them.
result = lsh_model.approxSimilarityJoin(
    df_bldadrtrans,
    df_abmatchtrans,
    MAX_JACCARD_DIST,
    distCol="jaccardDist",
).cache()
print(f"{result.count()} matches")

299 matches


In [8]:
# Persist the candidate matches through the project's write_parquet helper,
# under the curated storage root ('storage.curated' Spark conf entry).
addressmatch_path = "{storage_curated}/data/in/addressmatch".format(
    storage_curated=spark.conf.get('storage.curated'),
)
write_parquet(spark, result, addressmatch_path)

In [9]:
# Show the candidate matches per building, closest first.
# datasetA is the building side of the join, datasetB the AddressBase side.
display_cols = ['datasetA.id', 'datasetA.address', 'datasetB.UPRN', 'datasetB.ADDRESS', 'jaccardDist']
ranked_matches = result.select(*display_cols).sort('datasetA.id', 'jaccardDist')
ranked_matches.show(n=302, truncate=False)

+---+--------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------------+-------------------+
|id |address                                                                   |UPRN        |ADDRESS                                                                                    |jaccardDist        |
+---+--------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------------+-------------------+
|1  |The South Lawn Medical Practice, S Lawn Terrace, Heavitree, Exeter EX1 2RX|100041046145|HEAVITREE HEALTH CENTRE SOUTH LAWN TERRACE EXETER DEVON EX1 2RX                            |0.39344262295081966|
|1  |The South Lawn Medical Practice, S Lawn Terrace, Heavitree, Exeter EX1 2RX|100041225627|HEAVITREE SCHOOL HOUSE DENTAL PRACTICE SCHOOL HOUSE SOUTH LAWN TERRACE EXETER DEVON

In [17]:
# List the building ids that received at least one candidate match.
# (Swap .show(...) for .count() to get just the number of matched buildings.)
matched_building_ids = result.select('datasetA.id').distinct()
matched_building_ids.show(truncate=False)

+---+
|id |
+---+
|9  |
|4  |
|7  |
|10 |
|1  |
|6  |
|5  |
+---+



In [18]:
# Inspect the column names and types of the AddressBase input frame.
df_abmatch.printSchema()

root
 |-- UPRN: long (nullable = true)
 |-- OUTCODE: string (nullable = true)
 |-- LOCALITY: string (nullable = true)
 |-- ADDRESS: string (nullable = true)



In [10]:
# Disabled scratch code: the same eight stages as the fitted Pipeline above,
# applied one at a time so each intermediate frame (df1..df7) can be inspected.
# NOTE: if re-enabled, the final `model` below is the MinHashLSH model alone
# and would overwrite the PipelineModel bound to `model` earlier, which the
# other cells call `.transform(...)` and `.stages[-1]` on.
# sqlTrans = SQLTransformer(statement="SELECT *, lower(ADDRESS) lower FROM __THIS__")
# df1 = sqlTrans.transform(df_abmatch)
# tokenizer = Tokenizer(inputCol="lower", outputCol="token")
# df2 = tokenizer.transform(df1)
# remover = StopWordsRemover(inputCol="token", outputCol="stop")
# df3 = remover.transform(df2)
# sqlTrans = SQLTransformer(statement="SELECT *, concat_ws(' ', stop) concat FROM __THIS__")
# df4 = sqlTrans.transform(df3)
# rtokenizer = RegexTokenizer(pattern="", inputCol="concat", outputCol="char", minTokenLength=1)
# df5 = rtokenizer.transform(df4)
# ngram = NGram(n=2, inputCol="char", outputCol="ngram")
# df6 = ngram.transform(df5)
# hashtf = HashingTF(inputCol="ngram", outputCol="vector")
# df7 = hashtf.transform(df6)
# minhash = MinHashLSH(inputCol="vector", outputCol="lsh", numHashTables=3)
# model = minhash.fit(df7)
# model.setInputCol("vector")

In [None]:
# Disabled debug aid: `df6` (the character-bigram frame) is only defined by the
# commented-out step-by-step cell, so that cell must be re-enabled first.
# df6.show(truncate=False)