One of the SQL questions recently asked in a Hexaware interview.

Given an Employees table, find the maximum ID among the IDs that appear exactly once (i.e., ignore every ID that has duplicates).


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StructType, StructField
from pyspark.sql.window import Window
from pyspark.sql.functions import col, count, max as max_

# Reuse the active Spark session if there is one; otherwise start one named "Employees".
spark = (
    SparkSession.builder
    .appName("Employees")
    .getOrCreate()
)

# The table has a single nullable integer column, `id`.
schema = StructType([StructField("id", IntegerType(), nullable=True)])

# Sample rows standing in for the SQL INSERT statements.
# 6 and 8 each appear twice; 2, 5 and 7 appear once.
data = [(employee_id,) for employee_id in (2, 5, 6, 6, 7, 8, 8)]

# Build the DataFrame from the rows and the explicit schema.
df = spark.createDataFrame(data, schema=schema)

# Render the input so the duplicates are visible before querying.
df.display()


id
2
5
6
6
7
8
8


In [0]:
# Expose the DataFrame to Spark SQL under the name `employees` so the %sql cells can query it.
df.createOrReplaceTempView("employees")

In [0]:
%sql
-- Tag every row with the number of rows that share its id.
-- PARTITION BY alone is sufficient: all rows in a partition have the same id,
-- so an ORDER BY id inside the window cannot change the count and only adds
-- an unnecessary ordering step.
SELECT
  id,
  COUNT(*) OVER (PARTITION BY id) AS a
FROM employees

id,a
2,1
5,1
6,2
6,2
7,1
8,2
8,2


In [0]:
%sql
-- Largest id that occurs exactly once in employees.
-- Step 1 (cte): count how many rows share each id. No ORDER BY is needed in the
--               window because every row in a partition has the same id.
-- Step 2:       keep only ids whose count is 1 (COUNT(*) over a partition is
--               never below 1, so "= 1" is the same set as "<= 1") and take the max.
-- Returns NULL when no id is unique.
WITH cte AS (
  SELECT
    id,
    COUNT(*) OVER (PARTITION BY id) AS a
  FROM employees
)
SELECT max(id)
FROM cte
WHERE a = 1

max(id)
7


In [0]:
# Window over all rows that share the same 'id'.
# No orderBy: every row in a partition has an identical id, so ordering by id
# cannot change the count and would only add an unnecessary ordering step.
window_spec = Window.partitionBy("id")

# Column 'a' = number of rows carrying this id (1 means the id is not duplicated).
df_with_count = df.withColumn("a", count("id").over(window_spec))

# Keep only ids that occur exactly once.
filtered_df = df_with_count.filter(col("a") == 1)

# Largest non-duplicated id; the aggregate yields null when no id is unique.
result = filtered_df.agg(max_("id").alias("max_id"))
result.display()

max_id
7
