In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType, IntegerType, StructField,StructType
from pyspark.sql.functions import col

In [0]:
spark = SparkSession.builder.appName("app").master("local[2]").getOrCreate()

In [0]:
data = [
    (1,"a@b.com"),
    (2,"c@d.com"),
    (3,"a@b.com")
]

schema = StructType([
    StructField("id", IntegerType(),True),
    StructField("email",StringType(),True)
])

emails = spark.createDataFrame(data,schema)
emails.show()

+---+-------+
| id|  email|
+---+-------+
|  1|a@b.com|
|  2|c@d.com|
|  3|a@b.com|
+---+-------+



In [0]:
# For PySpark users, drop duplicate email and keep email with small id
emails.drop_duplicates(["email"]).show()


+---+-------+
| id|  email|
+---+-------+
|  1|a@b.com|
|  2|c@d.com|
+---+-------+

+---+-------+
| id|  email|
+---+-------+
|  3|a@b.com|
|  2|c@d.com|
+---+-------+



In [0]:
# Write a solution to delete all duplicate emails, keeping only one unique email with the biggest id.
emails.orderBy(col("id").desc()).dropDuplicates(["email"]).show()

+---+-------+
| id|  email|
+---+-------+
|  3|a@b.com|
|  2|c@d.com|
+---+-------+



In [0]:
# write query to drop duplicate email
emails.createOrReplaceTempView("dup_email")
spark.sql("with cte as (select id,email,row_number() over(partition by email order by id) as num from dup_email) select id,email from cte where num=1;").show()

+---+-------+
| id|  email|
+---+-------+
|  1|a@b.com|
|  2|c@d.com|
+---+-------+



In [0]:
spark.stop()