In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("RemoveDuplicates")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each tuple in `data`.
columns = ["id", "name", "color", "brand"]

# Small in-memory sample. Ids 1/2 and 5/6 carry identical
# (name, color, brand) values, so they are the duplicate pairs that the
# following cells collapse.
data = [
    (1, "leaf", "black", "nissan"),
    (2, "leaf", "black", "nissan"),
    (3, "model S", "black", "tesla"),
    (4, "model X", "white", "tesla"),
    (5, "ioniq 5", "black", "hyundai"),
    (6, "ioniq 5", "black", "hyundai"),
    (7, "ioniq 6", "white", "hyundai"),
]

# Only column names are supplied as the schema, so Spark infers each
# column's type from the tuples themselves.
df = spark.createDataFrame(data, schema=columns)

# Render the full frame as a table.
# NOTE(review): DataFrame.display() is presumably the Databricks notebook
# helper -- confirm the runtime before running this elsewhere.
df.display()


id,name,color,brand
1,leaf,black,nissan
2,leaf,black,nissan
3,model S,black,tesla
4,model X,white,tesla
5,ioniq 5,black,hyundai
6,ioniq 5,black,hyundai
7,ioniq 6,white,hyundai


In [0]:
# Columns that decide whether two rows count as the same car. `id` is left
# out, so rows that differ only by id are treated as duplicates.
dedup_keys = ["name", "color", "brand"]

# Keep a single row per distinct (name, color, brand) combination.
# NOTE(review): dropDuplicates has no parameter for choosing WHICH row of a
# duplicate group survives; the ids in the saved output happen to be the
# lowest of each group -- presumably not something to rely on, verify if the
# surviving id matters.
df_unique = df.dropDuplicates(dedup_keys)
df_unique.display()


id,name,color,brand
1,leaf,black,nissan
3,model S,black,tesla
4,model X,white,tesla
5,ioniq 5,black,hyundai
7,ioniq 6,white,hyundai


In [0]:
# Register the DataFrame under the view name "cars" so that Spark SQL
# statements can refer to it by that name (replaces any existing temp view
# with the same name).
df.createOrReplaceTempView("cars")

In [0]:
# Deduplicate with Spark SQL instead of the DataFrame API.
# The CTE numbers the rows inside each (name, color, brand) group in
# ascending id order, so rn = 1 marks the row with the smallest id of its
# group. The outer SELECT keeps only those rows and leaves the helper
# column rn out of the result.
# The outer query has no ORDER BY, so the row order of the result is not
# fixed (the saved output below is not sorted by id).
query = """
WITH CTE_Duplicates AS (
    SELECT 
        id, 
        name, 
        color, 
        brand,
        ROW_NUMBER() OVER (PARTITION BY name, color, brand ORDER BY id) AS rn
    FROM cars
)
SELECT id, name, color, brand
FROM CTE_Duplicates
WHERE rn = 1
"""

# Run the query against the `cars` view; spark.sql returns the result as a
# DataFrame. Note that this rebinds df_unique, replacing the dropDuplicates
# result assigned to the same name in the earlier cell.
df_unique = spark.sql(query)
df_unique.display()



id,name,color,brand
5,ioniq 5,black,hyundai
7,ioniq 6,white,hyundai
1,leaf,black,nissan
3,model S,black,tesla
4,model X,white,tesla
