In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Start (or reuse) the Spark session used by every cell in this notebook
spark = SparkSession.builder.appName("example").getOrCreate()

# --- Schemas -----------------------------------------------------------------
# Every column is declared non-nullable (nullable=False).

# emp: one row per employee, carrying the id of the department they belong to
emp_schema = (
    StructType()
    .add("empid", IntegerType(), nullable=False)
    .add("empname", StringType(), nullable=False)
    .add("salary", IntegerType(), nullable=False)
    .add("deptid", IntegerType(), nullable=False)
)

# dept: one row per department
dept_schema = (
    StructType()
    .add("deptid", IntegerType(), nullable=False)
    .add("deptname", StringType(), nullable=False)
)

# --- Sample rows -------------------------------------------------------------
# Tuples follow the column order of the matching schema.
# Note that department 207 has two employees tied on the highest salary.
emp_data = [
    (1, 'Nikitha', 45000, 206),
    (2, 'Ashish', 42000, 207),
    (3, 'David', 40000, 206),
    (4, 'Ram', 50000, 207),
    (5, 'John', 35000, 208),
    (6, 'Mark', 50000, 207),
    (7, 'Aravind', 39000, 208),
]
dept_data = [
    (206, 'HR'),
    (207, 'IT'),
    (208, 'Finance'),
]

# --- DataFrames --------------------------------------------------------------
emp_df = spark.createDataFrame(emp_data, emp_schema)
dept_df = spark.createDataFrame(dept_data, dept_schema)

# Render both tables so the inputs are visible before any transformation
emp_df.display()
dept_df.display()


empid,empname,salary,deptid
1,Nikitha,45000,206
2,Ashish,42000,207
3,David,40000,206
4,Ram,50000,207
5,John,35000,208
6,Mark,50000,207
7,Aravind,39000,208


deptid,deptname
206,HR
207,IT
208,Finance


In [0]:
# Register each DataFrame under the view name that matches its contents, so SQL
# that refers to `emp` / `dept` reads the table it actually names.
# (Previously the two names were swapped; the join below only worked because an
# inner join on deptid is symmetric.)
emp_df.createOrReplaceTempView('emp')
dept_df.createOrReplaceTempView('dept')

In [0]:
# Highest-paid employee(s) per department; employees tied on the top salary are
# joined into a single comma-separated string.
#
# COLLECT_LIST gives no ordering guarantee after a shuffle, so the tied names are
# collected together with empid, sorted (structs compare field by field, so empid
# decides the order), and only then reduced to the names. The final ORDER BY makes
# the row order of the result reproducible as well.
sql_query = """
WITH cte AS (
    SELECT empid, empname, deptname,
           RANK() OVER (PARTITION BY deptname ORDER BY salary DESC) AS rnk
    FROM emp
    INNER JOIN dept
    USING (deptid)
)
SELECT deptname,
       CONCAT_WS(',',
                 TRANSFORM(SORT_ARRAY(COLLECT_LIST(STRUCT(empid, empname))),
                           e -> e.empname)) AS name
FROM cte
WHERE rnk = 1
GROUP BY deptname
ORDER BY deptname
"""

# Execute the SQL query
result_df = spark.sql(sql_query)

# Show the result
result_df.display()

deptname,name
Finance,Aravind
HR,Nikitha
IT,"Ram,Mark"


In [0]:
# DataFrame-API version of the "top earner(s) per department" query.

# Attach the department name to every employee row
joined_df = emp_df.join(dept_df, on="deptid")

# Rank employees inside each department, highest salary first.
# rank() gives tied salaries the same rank, so all co-leaders get rnk == 1.
window_spec = Window.partitionBy("deptname").orderBy(F.desc("salary"))
ranked_df = joined_df.withColumn("rnk", F.rank().over(window_spec))

# Keep only the top-ranked employee(s) of each department
top_salaries_df = ranked_df.filter(F.col("rnk") == 1)

# collect_list gives no ordering guarantee after a shuffle, so collect
# (empid, empname) pairs, sort them (structs compare field by field, so empid
# decides the order) and then keep just the names before joining them.
sorted_top_earners = F.sort_array(F.collect_list(F.struct("empid", "empname")))
result_df = (
    top_salaries_df.groupBy("deptname")
    .agg(
        F.concat_ws(
            ",",
            F.transform(sorted_top_earners, lambda emp: emp["empname"]),
        ).alias("name")
    )
    # Fix the row order too, so repeated runs display identically
    .orderBy("deptname")
)

# Show the result
result_df.display()

deptname,name
Finance,Aravind
HR,Nikitha
IT,"Ram,Mark"
