Use Case Scenario:

Given a dataset of employee records, remove duplicate rows that share the same emp_id, keeping only the most recent record for each employee as determined by update_date.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

In [0]:
# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("RemoDuplicateRecords")
    .getOrCreate()
)

Sample Dataset

In [0]:
# Sample employee records as (emp_id, emp_name, update_date) tuples.
# Several emp_id values appear more than once with different update dates.
data = [
    (101, "Sedhu Madhavan", "2025-01-01"),
    (101, "Sedhu Madhavan", "2025-05-12"),
    (102, "Santhosh Kumar", "2025-02-15"),
    (103, "Logeswaran", "2025-01-20"),
    (103, "Logeswaran", "2025-06-30"),
    (104, "Vignesh Dharuman", "2025-05-24"),
    (104, "Vignesh Dharuman", "2025-07-01"),
    (105, "Vishnu", "2025-03-10"),
    (105, "Vishnu", "2025-07-09"),
]

Creating the DataFrame

In [0]:
columns = ["emp_id", "emp_name", "update_date"]

# Build the frame from the raw tuples; update_date is still a string here.
raw_df = spark.createDataFrame(data, columns)

# Cast update_date to a date type so that "most recent" is decided by
# chronological order rather than by lexicographic string comparison.
df = raw_df.withColumn("update_date", raw_df["update_date"].cast("date"))
df.show()

Defining a window to identify duplicate records (one partition per emp_id, newest update_date first)

In [0]:
# One partition per employee, ordered with the newest update_date first.
window_spec = (
    Window
    .partitionBy("emp_id")
    .orderBy(df["update_date"].desc())
)

Assigning row numbers

In [0]:
# Number the rows inside each emp_id partition following the window's
# ordering, so row_num == 1 marks the newest record of every employee.
rn = df.withColumn(
    "row_num",
    row_number().over(window_spec),
)

Keeping only the latest record per employee

In [0]:
# Keep the top-ranked row of each partition, then discard the helper column.
final_df = (
    rn
    .where("row_num = 1")
    .drop("row_num")
)

Output:

In [0]:
# Print the deduplicated rows; truncate=False shows full column values
# instead of shortening long strings.
final_df.show(truncate=False)