In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, split, trim, explode

# --- Initialize Spark ---
spark = SparkSession.builder.appName("Task6_DataPreprocessing").getOrCreate()

# --- Sample Raw Data (simulating dirty entity names) ---
data = [
    (1, "Alice   Smith!!"),
    (2, "bob-smith"),
    (3, "CHARLIE   Johnson"),
    (4, "david johnson"),
    (5, "Alice SMITH"),
]

columns = ["id", "raw_name"]
df = spark.createDataFrame(data, columns)

print("✅ Original Raw Data:")
df.show(truncate=False)

# --- Step 1: Normalization ---
# - Convert to lowercase
# - Remove punctuation/special characters
# - Trim spaces
df_clean = df.withColumn("clean_name",
    lower(trim(regexp_replace(col("raw_name"), "[^a-zA-Z\\s]", "")))
)

print("✅ Normalized Data:")
df_clean.show(truncate=False)

# --- Step 2: Tokenization ---
# Split names into tokens (words)
df_tokens = df_clean.withColumn("tokens", split(col("clean_name"), "\\s+"))

print("✅ Tokenized Data:")
df_tokens.show(truncate=False)

# --- Step 3: Explode tokens (optional for entity resolution) ---
df_exploded = df_tokens.withColumn("token", explode(col("tokens")))

print("✅ Exploded Tokens (one word per row):")
df_exploded.show(truncate=False)

# Stop Spark
spark.stop()


✅ Original Raw Data:
+---+-----------------+
|id |raw_name         |
+---+-----------------+
|1  |Alice   Smith!!  |
|2  |bob-smith        |
|3  |CHARLIE   Johnson|
|4  |david johnson    |
|5  |Alice SMITH      |
+---+-----------------+

✅ Normalized Data:
+---+-----------------+-----------------+
|id |raw_name         |clean_name       |
+---+-----------------+-----------------+
|1  |Alice   Smith!!  |alice   smith    |
|2  |bob-smith        |bobsmith         |
|3  |CHARLIE   Johnson|charlie   johnson|
|4  |david johnson    |david johnson    |
|5  |Alice SMITH      |alice smith      |
+---+-----------------+-----------------+

✅ Tokenized Data:
+---+-----------------+-----------------+------------------+
|id |raw_name         |clean_name       |tokens            |
+---+-----------------+-----------------+------------------+
|1  |Alice   Smith!!  |alice   smith    |[alice, smith]    |
|2  |bob-smith        |bobsmith         |[bobsmith]        |
|3  |CHARLIE   Johnson|charlie   johnson|