In [21]:
from pipeline.util.platform import start_spark, start_spark_local
import transform
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover, Tokenizer, NGram, HashingTF, MinHashLSH, RegexTokenizer, SQLTransformer
from pyspark.sql import functions as F

In [2]:
# Create the `spark` handle used by every cell below, via the project's
# `start_spark_local` helper (presumably a local-mode SparkSession — confirm in
# pipeline.util.platform).
# NOTE(review): the app name reads 'addessmatch' (sic) — presumably 'addressmatch'
# was intended; confirm nothing keys off this name before renaming it.
spark = start_spark_local(app_name='addessmatch_etl_job')

In [9]:
# Load the AddressBase input (the reference set that building addresses are
# matched against) through the project's parquet reader.
# NOTE: the path is relative, so this cell depends on where the kernel was started.
df_abmatch = transform.read_parquet(spark, '../../../data/in/addressbase')

In [22]:
# Sanity check on the load: number of AddressBase rows read.
# (Call df_abmatch.printSchema() here instead to inspect the columns.)
df_abmatch.count()

51813

In [24]:
# Fixed seed for the MinHash hash functions. Without an explicit seed the
# default can differ from one Python process to the next, so the LSH signatures
# (and the approximate-join results built on them) would not be reproducible
# after a kernel restart.
LSH_SEED = 42

# Address-normalisation + MinHash LSH pipeline, fitted on the AddressBase rows.
# Stages, in order:
#   1. lower-case the ADDRESS column                      -> "lower"
#   2. split into word tokens                             -> "token"
#   3. drop stop words (StopWordsRemover's default list)  -> "stop"
#   4. re-join the remaining words with single spaces     -> "concat"
#   5. split on the empty pattern, i.e. into characters   -> "char"
#   6. build character bigrams                            -> "ngram"
#   7. hash the bigrams into a term-frequency vector      -> "vector"
#   8. MinHash the vector using 3 hash tables             -> "lsh"
model = Pipeline(stages=[
    SQLTransformer(statement="SELECT *, lower(ADDRESS) lower FROM __THIS__"),
    Tokenizer(inputCol="lower", outputCol="token"),
    StopWordsRemover(inputCol="token", outputCol="stop"),
    SQLTransformer(statement="SELECT *, concat_ws(' ', stop) concat FROM __THIS__"),
    RegexTokenizer(pattern="", inputCol="concat", outputCol="char", minTokenLength=1),
    NGram(n=2, inputCol="char", outputCol="ngram"),
    HashingTF(inputCol="ngram", outputCol="vector"),
    MinHashLSH(inputCol="vector", outputCol="lsh", numHashTables=3, seed=LSH_SEED)
]).fit(df_abmatch)

# Apply the fitted pipeline and drop rows that produced no bigrams: MinHash
# needs at least one non-zero entry in the input vector.
df_abmatchtrans = (
    model.transform(df_abmatch)
    .filter(F.size(F.col("ngram")) > 0)
)
print(f"Example transformation ({df_abmatchtrans.count()} addresses left):")

Example transformation (51813 addresses left):


In [25]:
# Load the building-address input (the records to be matched to AddressBase)
# through the project's parquet reader.
# NOTE: the path is relative, so this cell depends on where the kernel was started.
df_bldadr = transform.read_parquet(spark, '../../../data/in/buildingaddress')

In [26]:
# Sanity check on the load: number of building-address rows read.
df_bldadr.count()

10

In [27]:
# Reuse the previously fitted pipeline (`model`) so the building addresses are
# normalised and hashed exactly like the AddressBase rows; the similarity join
# only makes sense when both sides share the same MinHash functions.
df_bldadrtrans = model.transform(df_bldadr)
# Drop rows that produced no bigrams: MinHash needs at least one non-zero entry.
df_bldadrtrans = df_bldadrtrans.filter(F.size(F.col("ngram")) > 0)
# These rows are addresses; the old message said "movies" (copy-paste leftover).
print(f"Example transformation ({df_bldadrtrans.count()} addresses left):")

Example transformation (10 movies left):


In [28]:
# Print the transformed building addresses with every pipeline column shown in
# full (truncate=False), to eyeball each stage's output. show() prints at most
# 20 rows by default.
df_bldadrtrans.show(truncate=False)

+---+--------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+--------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------

In [29]:
# The fitted MinHashLSH model is the last stage of the fitted pipeline.
lsh_model = model.stages[-1]

# Approximate similarity join: building addresses are datasetA, AddressBase rows
# are datasetB. Pairs within the 0.5 distance threshold are kept, and the
# distance is written to the "jaccardDist" column.
result = lsh_model.approxSimilarityJoin(
    df_bldadrtrans,
    df_abmatchtrans,
    threshold=0.5,
    distCol="jaccardDist",
)
match_count = result.count()
print(f"{match_count} matches")

190 matches


In [38]:
# Report each building address (datasetA) next to its AddressBase candidates
# (datasetB), ordered by building id and then by ascending distance.
match_cols = ['datasetA.id', 'datasetA.address', 'datasetB.UPRN', 'datasetB.ADDRESS', 'jaccardDist']

matches_ranked = result.select(*match_cols).sort('datasetA.id', 'jaccardDist')
matches_ranked.show(n=200, truncate=False)

+---+--------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------------+-------------------+
|id |address                                                                   |UPRN        |ADDRESS                                                                                    |jaccardDist        |
+---+--------------------------------------------------------------------------+------------+-------------------------------------------------------------------------------------------+-------------------+
|1  |The South Lawn Medical Practice, S Lawn Terrace, Heavitree, Exeter EX1 2RX|100041046145|HEAVITREE HEALTH CENTRE SOUTH LAWN TERRACE EXETER DEVON EX1 2RX                            |0.39344262295081966|
|1  |The South Lawn Medical Practice, S Lawn Terrace, Heavitree, Exeter EX1 2RX|100041225627|HEAVITREE SCHOOL HOUSE DENTAL PRACTICE SCHOOL HOUSE SOUTH LAWN TERRACE EXETER DEVON

In [33]:
# Show the structure of the join output: two structs, `datasetA` and `datasetB`,
# each carrying the original columns plus every intermediate pipeline column.
result.printSchema()

root
 |-- datasetA: struct (nullable = false)
 |    |-- id: integer (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- lower: string (nullable = true)
 |    |-- token: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- stop: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- concat: string (nullable = false)
 |    |-- char: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- ngram: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- vector: vector (nullable = true)
 |    |-- lsh: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- datasetB: struct (nullable = false)
 |    |-- UPRN: long (nullable = true)
 |    |-- OUTCODE: string (nullable = true)
 |    |-- LOCALITY: string (nullable = true)
 |    |-- ADDRESS: string (nullable = true)
 |    |-- lower: string (nullable = true)
 |    |-- token: arr

In [10]:
# Step-by-step version of the matching pipeline: each stage is applied on its
# own so the intermediate frames (df1 … df7) can be inspected individually.
# NOTE(review): this cell rebinds `model` to the fitted MinHashLSH model returned
# by minhash.fit. Cells that call model.transform(<raw frame>) or
# model.stages[-1] expect the fitted Pipeline, so re-running them after this
# cell will likely fail — consider a distinct name here.

# 1. Lower-case ADDRESS into a new "lower" column.
sqlTrans = SQLTransformer(statement="SELECT *, lower(ADDRESS) lower FROM __THIS__")
df1 = sqlTrans.transform(df_abmatch)
# 2. Split the lower-cased address into word tokens.
tokenizer = Tokenizer(inputCol="lower", outputCol="token")
df2 = tokenizer.transform(df1)
# 3. Remove stop words (StopWordsRemover's default list).
remover = StopWordsRemover(inputCol="token", outputCol="stop")
df3 = remover.transform(df2)
# 4. Re-join the remaining words with single spaces. `sqlTrans` is rebound here,
#    so after this line it no longer refers to the lower-casing transformer.
sqlTrans = SQLTransformer(statement="SELECT *, concat_ws(' ', stop) concat FROM __THIS__")
df4 = sqlTrans.transform(df3)
# 5. Split on the empty pattern, i.e. into individual characters.
rtokenizer = RegexTokenizer(pattern="", inputCol="concat", outputCol="char", minTokenLength=1)
df5 = rtokenizer.transform(df4)
# 6. Build character bigrams.
ngram = NGram(n=2, inputCol="char", outputCol="ngram")
df6 = ngram.transform(df5)
# 7. Hash the bigrams into a term-frequency vector.
hashtf = HashingTF(inputCol="ngram", outputCol="vector")
df7 = hashtf.transform(df6)
# 8. Fit MinHash LSH with 3 hash tables on the vectors.
minhash = MinHashLSH(inputCol="vector", outputCol="lsh", numHashTables=3)
model = minhash.fit(df7)
# inputCol was already "vector" on the estimator above, so this call appears
# redundant; as the last expression it also makes the cell display the model.
model.setInputCol("vector")

In [20]:
# Inspect the step-by-step pipeline after the bigram stage (df6), with columns
# shown in full. Depends on the cell that builds df1 … df7 having been run.
df6.show(truncate=False)

+------------+-------+---------+----------------------------------------------+----------------------------------------------+-------------------------------------------------------+-------------------------------------------------------+----------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|UPRN        |OUTCODE|LOCALITY |ADDRESS                                       |lower                                         |token                                                  |stop                                                   |concat                                        |char                                                            