In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType

# Obtain the Spark session used by every cell below (getOrCreate returns an
# already-running session when one exists, otherwise it starts a new one).
spark = (
    SparkSession.builder
    .appName("PySparkExample")
    .getOrCreate()
)

# 'cards' table: a single nullable 64-bit integer column holding the card number.
cards_schema = StructType().add("card_number", LongType(), True)

# Sample rows for the 'cards' table; each row is a one-element tuple.
cards_data = [
    (card_number,)
    for card_number in (1234567812345678, 2345678923456789, 3456789034567890)
]

# Materialise the 'cards' DataFrame with the explicit schema and print it.
cards_df = spark.createDataFrame(cards_data, schema=cards_schema)
cards_df.show()

# 'Employee' table: schema assembled field by field; every column is nullable.
employee_schema = (
    StructType()
    .add("employee_id", IntegerType(), True)
    .add("ename", StringType(), True)
    .add("salary", IntegerType(), True)
)

# Sample rows for the 'Employee' table as (employee_id, ename, salary) tuples.
# Salaries 60000 and 70000 each appear twice in this data.
employee_data = [
    (3, "Bob", 60000),
    (4, "Diana", 70000),
    (5, "Eve", 60000),
    (6, "Frank", 80000),
    (7, "Grace", 70000),
    (8, "Henry", 90000),
]

# Materialise the 'Employee' DataFrame with the explicit schema and print it.
employee_df = spark.createDataFrame(employee_data, schema=employee_schema)
employee_df.show()


+----------------+
|     card_number|
+----------------+
|1234567812345678|
|2345678923456789|
|3456789034567890|
+----------------+

+-----------+-----+------+
|employee_id|ename|salary|
+-----------+-----+------+
|          3|  Bob| 60000|
|          4|Diana| 70000|
|          5|  Eve| 60000|
|          6|Frank| 80000|
|          7|Grace| 70000|
|          8|Henry| 90000|
+-----------+-----+------+



In [0]:
from pyspark.sql.functions import col, expr

# Add a "masked_card_number" column in which the first 12 digits are replaced
# by '*' and only the last four digits stay readable.
#
# card_number is declared as LongType, so it is cast to a string explicitly
# before slicing rather than depending on Spark's implicit type coercion
# inside substring().
#
# NOTE(review): the fixed mask of 12 '*' and the start offset 13 assume that
# every card number has exactly 16 digits, as in the sample data. Numbers of
# another length would need a length-aware mask - confirm against real input.
# NOTE(review): the original, unmasked card_number column is still part of
# masked_cards_df and is therefore shown by display(); drop it before sharing
# the result if the clear value must not be exposed.
masked_cards_df = cards_df.withColumn(
    "masked_card_number",
    expr("concat('************', substring(cast(card_number as string), 13, 4))")
)

masked_cards_df.display()


card_number,masked_card_number
1234567812345678,************5678
2345678923456789,************6789
3456789034567890,************7890


In [0]:
from pyspark.sql.functions import count

# Count how many employees share each distinct salary; the aggregate column
# is named "count".
salary_count_df = employee_df.groupBy("salary").agg(count("*").alias("count"))

# Keep only the salaries that are held by more than one employee.
same_salary_df = salary_count_df.where(col("count") > 1)

# Inner join (the default join type) on "salary" brings back the details of
# every employee whose salary survived the filter, plus the shared count.
result_df = employee_df.join(same_salary_df, ["salary"])

result_df.display()


salary,employee_id,ename,count
60000,3,Bob,2
70000,4,Diana,2
60000,5,Eve,2
70000,7,Grace,2
