In [5]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("Lab3")
    .getOrCreate()
)

In [6]:
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer

# Load the raw records; only the header option is set, so no schema inference is requested.
df = spark.read.csv('data.csv', header=True)

# Lower-case both text columns, then drop every character of the address that is
# not a lowercase letter, a digit or whitespace.
df = (
    df
    .withColumn("name", lower(col("name")))
    .withColumn("address", lower(col("address")))
    .withColumn("address", regexp_replace(col("address"), "[^a-z0-9\\s]", ""))
)

# One Tokenizer per text column; each appends a "<column>_tokens" column.
tokenizer = Tokenizer(inputCol="name", outputCol="name_tokens")
tokenizer_address = Tokenizer(inputCol="address", outputCol="address_tokens")

# Apply the name tokenizer first so the output column order is unchanged.
df = tokenizer_address.transform(tokenizer.transform(df))

# Print a preview of the cleaned and tokenized frame.
df.show()

+---+------------------+--------------------+--------------------+--------------------+
| id|              name|             address|         name_tokens|      address_tokens|
+---+------------------+--------------------+--------------------+--------------------+
|  1|     kellia corwin|    9920 welch place|    [kellia, corwin]|[9920, welch, place]|
|  2|     brock sealove|  496 warbler avenue|    [brock, sealove]|[496, warbler, av...|
|  3|    alberta leguay|    72 sommers court|   [alberta, leguay]|[72, sommers, court]|
|  4|     wendell mobbs|54569 manufacture...|    [wendell, mobbs]|[54569, manufactu...|
|  5|   marjie allpress|96999 hoard crossing|  [marjie, allpress]|[96999, hoard, cr...|
|  6|  jeremiah belliss|      6 manley place| [jeremiah, belliss]|  [6, manley, place]|
|  7|  allison bretelle|    1 towne crossing| [allison, bretelle]|[1, towne, crossing]|
|  8| allistir berthome|23 lake view terrace|[allistir, berthome]|[23, lake, view, ...|
|  9|       dulcie line|      5 

In [7]:
from pyspark.sql.functions import expr

# Read the block file with a header row, "?" treated as the null marker,
# and schema inference enabled.
parsed = (
    spark.read
    .option("header", "true")
    .option("nullValue", "?")
    .option("inferSchema", "true")
    .csv("donation/block_1.csv")
)

# Comparison columns that are added together to form the score.
good_features = ["cmp_lname_c1", "cmp_plz", "cmp_by", "cmp_bd", "cmp_bm"]
sum_expression = " + ".join(good_features)

# Replace missing feature values with 0, compute the summed score, and keep
# only the score alongside the is_match label.
scored = (
    parsed
    .fillna(0, subset=good_features)
    .withColumn('score', expr(sum_expression))
    .select('score', 'is_match')
)

scored.show()

+-----+--------+
|score|is_match|
+-----+--------+
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
+-----+--------+
only showing top 20 rows



In [9]:
def crossTabs(scored, t):
    """Cross-tabulate `score >= t` against `is_match`.

    Args:
        scored: frame exposing `score` and `is_match` columns.
        t: threshold interpolated into the SQL expression `score >= t`.

    Returns:
        The result of grouping on the boolean `above` column, pivoting on
        `is_match` with the values ("true", "false"), and counting rows.
    """
    # Build the same SQL snippet the f-string would produce.
    threshold_expr = "score >= {} as above".format(t)
    flagged = scored.selectExpr(threshold_expr, "is_match")
    grouped = flagged.groupBy("above")
    return grouped.pivot("is_match", ("true", "false")).count()

# Confusion matrix for the decision rule "score >= 4.0 means match".
res = crossTabs(scored, 4.0).collect()

# groupBy() gives no guarantee about row order, so look rows up by the value of
# their `above` flag instead of by position. A pivot cell with no rows comes
# back as None, and a whole row may be absent; both are treated as a count of 0.
counts_by_above = {
    row["above"]: (row["true"] or 0, row["false"] or 0)
    for row in res
}

# above=True  -> predicted match:     (true positives, false positives)
# above=False -> predicted non-match: (false negatives, true negatives)
TP, FP = counts_by_above.get(True, (0, 0))
FN, TN = counts_by_above.get(False, (0, 0))

# Guard each ratio so an empty denominator yields 0.0 instead of ZeroDivisionError.
p = TP / (TP + FP) if (TP + FP) else 0.0
r = TP / (TP + FN) if (TP + FN) else 0.0
f1 = 2 * (p * r / (p + r)) if (p + r) else 0.0
print('Precision = ',p,'Recall = ',r,'F1 Score =',f1)

Precision =  0.9693450998606595 Recall =  0.9971333014811276 F1 Score = 0.9830428638718794
