In [1]:
from pipeline.util.platform import start_spark, start_spark_local
from pipeline.util.storage import read_parquet, write_parquet, write_repart_parquet
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover, Tokenizer, NGram, HashingTF, MinHashLSH, RegexTokenizer, SQLTransformer
from pyspark.sql import functions as F
from pyspark.sql import window as W

In [2]:
# Start the Spark session for this job; the project helper returns the
# session together with a logger object and a configuration object.
# NOTE(review): `loggrer` and the app name 'addessmatch_etl_job' both look
# like typos ("logger" / "addressmatch"); they are left unchanged here in
# case other code or monitoring relies on the current spellings — confirm.
spark, loggrer, conf = start_spark(
    app_name='addessmatch_etl_job',
)

local


In [3]:
# Load the candidate address list ("addressbase") from the curated storage
# root, whose location is read from the Spark conf key 'storage.curated'.
df_abmatch = read_parquet(
    spark,
    "{storage_curated}/data/in/addressbase".format(
        storage_curated=spark.conf.get('storage.curated'),
    ),
)

In [4]:
# Build and fit the address-normalisation + MinHash LSH pipeline on the
# addressbase records. Stage by stage:
#   1. lower-case the ADDRESS column                      -> "lower"
#   2. split into whitespace tokens                       -> "token"
#   3. drop stop words                                    -> "stop"
#   4. re-join the remaining tokens with single spaces    -> "concat"
#   5. split the string on an empty pattern (intended to yield one token
#      per character)                                     -> "char"
#   6. build 2-grams over those tokens                    -> "ngram"
#   7. hash the 2-grams into a term-frequency vector      -> "vector"
#   8. MinHash LSH over the vector with 3 hash tables     -> "lsh"
model = Pipeline(stages=[
    SQLTransformer(statement="SELECT *, lower(ADDRESS) lower FROM __THIS__"),
    Tokenizer(inputCol="lower", outputCol="token"),
    StopWordsRemover(inputCol="token", outputCol="stop"),
    SQLTransformer(statement="SELECT *, concat_ws(' ', stop) concat FROM __THIS__"),
    RegexTokenizer(pattern="", inputCol="concat", outputCol="char", minTokenLength=1),
    NGram(n=2, inputCol="char", outputCol="ngram"),
    HashingTF(inputCol="ngram", outputCol="vector"),
    # An explicit seed pins the random hash functions chosen at fit time.
    # Without it PySpark falls back to a default derived from Python's
    # hash(), which may differ between driver processes, so the approximate
    # matches would not be reproducible across kernel restarts.
    MinHashLSH(inputCol="vector", outputCol="lsh", numHashTables=3, seed=42)
]).fit(df_abmatch)

# Apply the fitted pipeline and drop rows that produced no 2-grams at all
# (presumably because MinHash cannot hash an all-zero vector — confirm).
df_abmatchtrans = (
    model
    .transform(df_abmatch)
    .filter(F.size(F.col("ngram")) > 0)
)
print(f"Example transformation ({df_abmatchtrans.count()} addresses left):")

Example transformation (51813 addresses left):


In [5]:
# Load the building addresses that need to be matched, from the same curated
# storage root (Spark conf key 'storage.curated').
df_bldadr = read_parquet(
    spark,
    "{storage_curated}/data/in/buildingaddress".format(
        storage_curated=spark.conf.get('storage.curated'),
    ),
)

In [6]:
# Reuse the previously fitted pipeline so building addresses are normalised
# and hashed exactly like the addressbase records, then keep only the rows
# that produced at least one 2-gram.
df_bldadrtrans = (
    model
    .transform(df_bldadr)
    .filter(F.size("ngram") > 0)
)
print(f"Example transformation ({df_bldadrtrans.count()} buildings left):")

Example transformation (331 buildings left):


In [7]:
# The last pipeline stage is the fitted MinHash LSH model. Use it to
# approximately join buildings (datasetA) against addressbase records
# (datasetB), keeping pairs whose Jaccard distance is below 0.5; the
# distance is written to the "jaccardDist" column.
df_match = model.stages[-1].approxSimilarityJoin(
    datasetA=df_bldadrtrans,
    datasetB=df_abmatchtrans,
    threshold=0.5,
    distCol="jaccardDist",
)
print(f"{df_match.count()} matches")

4810 matches


In [8]:
# Rank every building's candidate matches from closest to farthest.
# Ordering by jaccardDist alone leaves the rank of equally distant
# candidates up to Spark, so the "best" match (oid == 1) could change from
# run to run. The candidate's UPRN is added as a tie-breaker to make the
# ranking repeatable (assumes UPRN identifies an addressbase record —
# TODO confirm it is unique).
owin = (
    W.Window
    .partitionBy(df_match.datasetA.id)
    .orderBy(df_match.jaccardDist, df_match.datasetB.UPRN)
)
# "oid" is the 1-based rank of the candidate within its building.
df_match = df_match.withColumn("oid", F.row_number().over(owin))

In [9]:
# Keep only the top-ranked candidate (oid == 1) for each building and
# flatten the join output to: building id, building address, matched UPRN,
# distance and rank.
df_matchres = df_match.select(
    'datasetA.id',
    F.col('datasetA.address').alias("buildingaddress"),
    'datasetB.UPRN',
    'jaccardDist',
    'oid',
).filter(F.col("oid") == 1)

In [12]:
# Persist the best-match table under the curated storage root
# (Spark conf key 'storage.curated').
write_parquet(
    spark,
    df_matchres,
    "{storage_curated}/data/in/addressmatchres".format(
        storage_curated=spark.conf.get('storage.curated'),
    ),
)

In [11]:
#df_abmatch.printSchema()
#df_matchres.count()
#dft = spark.table("mytable")
#spark.sql("show tables").show()
#df_matchres.show(truncate=False)
#df_match.filter(F.col("datasetA.id")==4).sort(F.col("jaccardDist").desc()).show(n=200,truncate=False)

+---+--------------------------------------------------------------------------+------------+-------------------+---+
|id |buildingaddress                                                           |UPRN        |jaccardDist        |oid|
+---+--------------------------------------------------------------------------+------------+-------------------+---+
|1  |The South Lawn Medical Practice, S Lawn Terrace, Heavitree, Exeter EX1 2RX|100041046145|0.39344262295081966|1  |
|4  |FLAT 52D MORTIMER HOUSE GRENDON ROAD EX1 2NL                              |100040215247|0.1428571428571429 |1  |
|5  |Unit 3 The Exebridge Centre Exeter EX4 1AH                                |10013043126 |0.40476190476190477|1  |
|6  |30, Guildhall Shopping Centre, Exeter EX4 3HJ                             |10013043352 |0.375              |1  |
|7  |Market St, Exeter EX1 1BW                                                 |100041123913|0.3666666666666667 |1  |
|9  |36 Southernhay E, Exeter EX1 1NX                   

In [10]:
# sqlTrans = SQLTransformer(statement="SELECT *, lower(ADDRESS) lower FROM __THIS__")
# df1 = sqlTrans.transform(df_abmatch)
# tokenizer = Tokenizer(inputCol="lower", outputCol="token")
# df2 = tokenizer.transform(df1)
# remover = StopWordsRemover(inputCol="token", outputCol="stop")
# df3 = remover.transform(df2)
# sqlTrans = SQLTransformer(statement="SELECT *, concat_ws(' ', stop) concat FROM __THIS__")
# df4 = sqlTrans.transform(df3)
# rtokenizer = RegexTokenizer(pattern="", inputCol="concat", outputCol="char", minTokenLength=1)
# df5 = rtokenizer.transform(df4)
# ngram = NGram(n=2, inputCol="char", outputCol="ngram")
# df6 = ngram.transform(df5)
# hashtf = HashingTF(inputCol="ngram", outputCol="vector")
# df7 = hashtf.transform(df6)
# minhash = MinHashLSH(inputCol="vector", outputCol="lsh", numHashTables=3)
# model = minhash.fit(df7)
# model.setInputCol("vector")