### **REGEXP_REPLACE and REGEXP_EXTRACT in PySpark**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, regexp_extract

# Reuse the active Spark session when one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("RegexpExample")
    .getOrCreate()
)

# Sample records: one tuple per person, in the same order as `schema` below.
data = [
    (1, "Alice Smith", "123-45-6789", "alice.smith@example.com"),
    (2, "Bob Brown", "987-65-4321", "bob.brown@example.com"),
    (3, "Charlie Clark", "111-22-3333", "charlie.clark@example.com"),
]

# Column names only; Spark infers each column's type from the sample values.
schema = ["ID", "Name", "SSN", "Email"]

# Build the DataFrame from the in-memory records.
df = spark.createDataFrame(data, schema=schema)

# Print every column at full width (no truncation of long strings).
df.show(truncate=False)


StatementMeta(, a0fe634b-58da-4b9b-8e64-214bb6f45843, 3, Finished, Available, Finished)

+---+-------------+-----------+-------------------------+
|ID |Name         |SSN        |Email                    |
+---+-------------+-----------+-------------------------+
|1  |Alice Smith  |123-45-6789|alice.smith@example.com  |
|2  |Bob Brown    |987-65-4321|bob.brown@example.com    |
|3  |Charlie Clark|111-22-3333|charlie.clark@example.com|
+---+-------------+-----------+-------------------------+



In [2]:
# Mask the SSN with regexp_replace: the leading "ddd-dd" part is replaced by
# asterisks, so only the last 4 digits remain readable in "Masked_SSN".
# The raw "SSN" column is kept in this demo purely for side-by-side comparison.
df_masked = df.withColumn(
    "Masked_SSN",
    regexp_replace("SSN", r"\d{3}-\d{2}", "***-**"),
)

# Extract the email domain with regexp_extract. Capture group 1 is everything
# after the "@" up to the end of the value, so hyphenated and multi-level
# domains (e.g. "my-company.co.uk") are returned in full. A narrower pattern
# such as r"@(\w+\.\w+)" would cut "mail.example.co.uk" down to "mail.example"
# and fail to match hyphenated domains at all.
# regexp_extract returns an empty string when the pattern does not match.
df_extracted = df_masked.withColumn(
    "Email_Domain",
    regexp_extract("Email", r"@([^@\s]+)$", 1),
)

# Show the resulting DataFrame
df_extracted.show(truncate=False)

StatementMeta(, a0fe634b-58da-4b9b-8e64-214bb6f45843, 4, Finished, Available, Finished)

+---+-------------+-----------+-------------------------+-----------+------------+
|ID |Name         |SSN        |Email                    |Masked_SSN |Email_Domain|
+---+-------------+-----------+-------------------------+-----------+------------+
|1  |Alice Smith  |123-45-6789|alice.smith@example.com  |***-**-6789|example.com |
|2  |Bob Brown    |987-65-4321|bob.brown@example.com    |***-**-4321|example.com |
|3  |Charlie Clark|111-22-3333|charlie.clark@example.com|***-**-3333|example.com |
+---+-------------+-----------+-------------------------+-----------+------------+

