In [0]:
from pyspark.sql.functions import col, concat, lit, substring_index

# Sample user records, one tuple per row in the order given by `columns`.
columns = ["id", "email", "phone"]

data = [
    (1, "john.doe@gmail.com", "8956756457"),
    (2, "jane.smith@outlook.com", "9845671234"),
    (3, "alan.turing@yahoo.com", "9123456789"),
]

# Only column names are supplied as the schema; Spark infers the column types
# from the rows themselves.
df = spark.createDataFrame(data, schema=columns)
df.display()

id,email,phone
1,john.doe@gmail.com,8956756457
2,jane.smith@outlook.com,9845671234
3,alan.turing@yahoo.com,9123456789


In [0]:
# Email mask: first character + "****" + "@" + domain.
# The first character is read from the text preceding the first "."; the
# domain is everything after the last "@".
email_initial = substring_index(col("email"), ".", 1).substr(1, 1)
email_domain = substring_index(col("email"), "@", -1)
masked_email = concat(
    email_initial,
    lit("****"),
    lit("@"),
    email_domain,
).alias("masked_email")

# Phone mask: a fixed "*****" prefix followed by the last 4 characters.
# The prefix is always five stars, however many leading digits it hides
# (six for the 10-digit sample numbers).
phone_tail = col("phone").substr(-4, 4)
masked_phone = concat(lit("*****"), phone_tail).alias("masked_phone")

# Keep the id and expose only the masked versions of the sensitive columns.
result_df = df.select("id", masked_email, masked_phone)

result_df.display()

id,masked_email,masked_phone
1,j****@gmail.com,*****6457
2,j****@outlook.com,*****1234
3,a****@yahoo.com,*****6789


In [0]:
# Register df as the temporary view "users" (replacing any existing view of
# that name) so that it can be referenced by name from Spark SQL.
df.createOrReplaceTempView("users")

In [0]:
# The same masking rules expressed in Spark SQL against the "users" view.
# NOTE(review): SPLIT(email, '@')[1] takes the second '@'-separated piece,
# whereas the DataFrame version takes the text after the last '@'. The two
# are only guaranteed to agree when an address contains exactly one '@'.
masking_query = """
    SELECT 
        id,
        CONCAT(
            SUBSTR(email, 1, 1),                -- first letter
            '****',
            '@',
            SPLIT(email, '@')[1]                -- domain part
        ) AS masked_email,
        CONCAT(
            '*****',
            SUBSTR(phone, -4)                   -- last 4 digits
        ) AS masked_phone
    FROM users
"""
result_df = spark.sql(masking_query)

# truncate=False prints the full column values instead of shortening them.
result_df.show(truncate=False)

+---+-----------------+------------+
|id |masked_email     |masked_phone|
+---+-----------------+------------+
|1  |j****@gmail.com  |*****6457   |
|2  |j****@outlook.com|*****1234   |
|3  |a****@yahoo.com  |*****6789   |
+---+-----------------+------------+

