## Q1 

In [22]:
# Import necessary libraries and set up Spark session
import pyspark
import os
import sys
from pyspark import SparkContext
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.sql import SparkSession
spark = SparkSession.builder.config("spark.driver.memory", "16g").appName('chapter_2').getOrCreate()

# Read data from CSV files
prev = spark.read.option("recursiveFileLookup", "true").csv("donation/*.csv")

# Show a sample of the data
prev.show(2)

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
|  _c0|  _c1|              _c2|         _c3|         _c4|         _c5|    _c6|   _c7|   _c8|   _c9|   _c10|    _c11|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|     cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
|37291|53113|0.833333333333333|           ?|           1|           ?|      1|     1|     1|     1|      0|    TRUE|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
only showing top 2 rows



In [24]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# Define a UDF for lowercasing each element in the array
lowercase_udf = udf(lambda tokens: [token.lower() for token in tokens], ArrayType(StringType()))

# Apply the UDF to the lname_tokens column
parsed = parsed.withColumn("lname_tokens", lowercase_udf(col("lname_tokens")))

# Display the updated DataFrame
parsed.show(5)

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+------------+
| id_1| id_2|     cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|lname_tokens|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+------------+
|37291|53113|0.833333333333333|        null|           1|        null|      1|     1|     1|     1|      0|    true|         [1]|
|39086|47614|                1|        null|           1|        null|      1|     1|     1|     1|      1|    true|         [1]|
|70031|70237|                1|        null|           1|        null|      1|     1|     1|     1|      1|    true|         [1]|
|84795|97439|                1|        null|           1|        null|      1|     1|     1|     1|      1|    true|         [1]|
|36950|42116|                1|        null|           1|           1|      1|     1|     

## Q2 

In [25]:
# Perform data analysis and compute similarity scores
from pyspark.sql.functions import col
parsed.groupBy("is_match").count().orderBy(col("count").desc()).show()

parsed.createOrReplaceTempView("linkage")

spark.sql("""
    SELECT is_match, COUNT(*) cnt
    FROM linkage
    GROUP BY is_match
    ORDER BY cnt DESC
""").show()

summary = parsed.describe()
summary.select("summary", "cmp_fname_c1", "cmp_fname_c2").show()

matches = parsed.where("is_match = true")
match_summary = matches.describe()
misses = parsed.filter(col("is_match") == False)
miss_summary = misses.describe()

+--------+-------+
|is_match|  count|
+--------+-------+
|   false|5728201|
|    true|  20931|
|    null|      1|
+--------+-------+

+--------+-------+
|is_match|    cnt|
+--------+-------+
|   false|5728201|
|    true|  20931|
|    null|      1|
+--------+-------+

+-------+--------------------+------------------+
|summary|        cmp_fname_c1|      cmp_fname_c2|
+-------+--------------------+------------------+
|  count|             5748126|            103699|
|   mean|  0.7129023464241683|0.9000089989364238|
| stddev|  0.3887584395082916|0.2713306768152377|
|    min|                   0|                 0|
|    max|2.68694413843136e-05|                 1|
+-------+--------------------+------------------+



## Q3 

In [30]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, expr
from pyspark.sql.types import DoubleType

# Assuming 'parsed' DataFrame is already defined

# Function for pivoting summary data
def pivot_summary(desc):
    desc_p = desc.toPandas()
    desc_p = desc_p.set_index('summary').transpose().reset_index()
    desc_p = desc_p.rename(columns={'index':'field'})
    desc_p = desc_p.rename_axis(None, axis=1)
    descT = spark.createDataFrame(desc_p)
    for c in descT.columns:
        if c == 'field':
            continue
        else:
            descT = descT.withColumn(c, descT[c].cast(DoubleType()))
    return descT

match_summaryT = pivot_summary(match_summary)
miss_summaryT = pivot_summary(miss_summary)
good_features = ["cmp_lname_c1", "cmp_plz", "cmp_by", "cmp_bd", "cmp_bm"]
sum_expression = " + ".join(good_features)

# Evaluate precision, recall, and F1-score
scored = parsed.fillna(0, subset=good_features).\
                withColumn('score', expr(sum_expression)).\
                select('score', 'is_match')

scored.show()

def crossTabs(scored: DataFrame, t: float) -> DataFrame:
    return scored.selectExpr(f"score >= {t} as above", "is_match").\
        groupBy("above").pivot("is_match", ("true", "false")).\
        count()

# Confusion matrix and evaluation metrics
confused = crossTabs(scored, 4.0)
confused.show()

confused2 = crossTabs(scored, 2.0)
confused2.show()

tp = confused.filter("above = true").select("true").collect()[0].true
fp = confused.filter("above = true").select("false").collect()[0].false
fn = confused.filter("above = false").select("true").fillna(0).collect()[0].true
tn = confused.filter("above = false").select("false").collect()[0].false

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

precision, recall, f1

+-----+--------+
|score|is_match|
+-----+--------+
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  4.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
|  5.0|    true|
+-----+--------+
only showing top 20 rows

+-----+-----+-------+
|above| true|  false|
+-----+-----+-------+
| null|    6|    789|
| true|20871|    637|
|false|   54|5726775|
+-----+-----+-------+

+-----+-----+-------+
|above| true|  false|
+-----+-----+-------+
| null|    6|    789|
| true|20925| 596413|
|false| null|5130999|
+-----+-----+-------+



(0.9703831132601822, 0.9974193548387097, 0.9837155044422973)