In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType

# Create a Spark session
spark = SparkSession.builder.appName("EntityResolutionPreprocessing").getOrCreate()

# Read CSV data
data = spark.read.csv("block_1.csv", header=True, inferSchema=True, sep='\t')

# Display the original data
print("Original Data:")
data.show(truncate=False)

# Select relevant columns for preprocessing
selected_columns = ['cmp_fname_c1', 'cmp_fname_c2', 'cmp_lname_c1', 'cmp_lname_c2', 'cmp_sex', 'cmp_bd', 'cmp_bm', 'cmp_by', 'cmp_plz']
data_to_preprocess = data.select(['id_1', 'id_2'] + selected_columns)

# Tokenization
for column in selected_columns:
    tokenizer = Tokenizer(inputCol=column, outputCol=column+'_tokens')
    data_to_preprocess = tokenizer.transform(data_to_preprocess)

# Define a UDF for normalization
def normalize_text(tokens):
    return [str(token).lower() if token is not None else None for token in tokens]

# Register the UDF
normalize_text_udf = udf(normalize_text, ArrayType(StringType()))

# Apply normalization UDF to each tokenized column
for column in selected_columns:
    data_to_preprocess = data_to_preprocess.withColumn(column+'_normalized_tokens', normalize_text_udf(col(column+'_tokens')))

# Display the preprocessed data
print("Preprocessed Data:")
data_to_preprocess.show(truncate=False)

# Stop the Spark session
spark.stop()


Original Data:
+-----------------------------------------------------------------------------------------------------------------------------------+
|"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"|
+-----------------------------------------------------------------------------------------------------------------------------------+
|37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE                                                                                 |
|39086,47614,1,?,1,?,1,1,1,1,1,TRUE                                                                                                 |
|70031,70237,1,?,1,?,1,1,1,1,1,TRUE                                                                                                 |
|84795,97439,1,?,1,?,1,1,1,1,1,TRUE                                                                                                 |
|36950,42116,1,?,1,1,1,1,1,1,1,TRUE            

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `id_1` cannot be resolved. Did you mean one of the following? [`"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"`].;
'Project ['id_1, 'id_2, 'cmp_fname_c1, 'cmp_fname_c2, 'cmp_lname_c1, 'cmp_lname_c2, 'cmp_sex, 'cmp_bd, 'cmp_bm, 'cmp_by, 'cmp_plz]
+- Relation ["id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"#162] csv
