#### **Removing duplicates in a dataframe using spark**

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank

# Initialize Spark session
spark = SparkSession.builder.appName("RemoveDuplicates").getOrCreate()

# Sample data with duplicates
data = [
    (1, "Alice", 29),
    (2, "Bob", 34),
    (3, "Charlie", 30),
    (1, "Alice", 29),  # Duplicate record
    (4, "David", 23),
    (2, "Bob", 34),    # Duplicate record
    (5, "Eve", 40)
]

# Define the schema for the DataFrame
schema = ["ID", "Name", "Age"]

# Create DataFrame
df = spark.createDataFrame(data, schema)

df.show()

# Define the window specification
windowSpec = Window.partitionBy("ID", "Name", "Age").orderBy("ID")

# Assign rank to each row within the group
df_with_rank = df.withColumn("rank", rank().over(windowSpec))

# Filter out rows where rank is greater than 1 (i.e., duplicates)
df_without_duplicates = df_with_rank.filter(col("rank") == 1).drop("rank")

# Show the DataFrame without duplicates
df_without_duplicates.show()


StatementMeta(, bececec8-6d08-4167-a8b7-c3cd04d6a569, 5, Finished, Available, Finished)

+---+-------+---+
| ID|   Name|Age|
+---+-------+---+
|  1|  Alice| 29|
|  2|    Bob| 34|
|  3|Charlie| 30|
|  1|  Alice| 29|
|  4|  David| 23|
|  2|    Bob| 34|
|  5|    Eve| 40|
+---+-------+---+

+---+-------+---+
| ID|   Name|Age|
+---+-------+---+
|  1|  Alice| 29|
|  1|  Alice| 29|
|  2|    Bob| 34|
|  2|    Bob| 34|
|  3|Charlie| 30|
|  4|  David| 23|
|  5|    Eve| 40|
+---+-------+---+

