In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import Window
from pyspark.sql.functions import col, rank, max as spark_max, min as spark_min, when

# Obtain the Spark session for this notebook under the app name "CityPopulation"
spark = (
    SparkSession.builder
    .appName("CityPopulation")
    .getOrCreate()
)

# Schema of the city table: (column name, Spark type) pairs, every column nullable
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("state", StringType()),
        ("city", StringType()),
        ("population", IntegerType()),
    )
])

# Sample rows, written per state as (city, population) pairs.
# Dict insertion order and list order are kept, so `data` has the same rows
# in the same order as a flat literal list would.
cities_by_state = {
    "haryana": [("ambala", 100), ("panipat", 200), ("gurgaon", 300)],
    "punjab": [("amritsar", 150), ("ludhiana", 400), ("jalandhar", 250)],
    "maharashtra": [("mumbai", 1000), ("pune", 600), ("nagpur", 300)],
    "karnataka": [("bangalore", 900), ("mysore", 400), ("mangalore", 200)],
}

# Flatten into (state, city, population) tuples matching the schema's column order
data = [
    (state_name, city_name, city_population)
    for state_name, city_rows in cities_by_state.items()
    for city_name, city_population in city_rows
]

# Build the DataFrame from the in-memory rows, applying the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# Render the DataFrame in the notebook output
df.display()


state,city,population
haryana,ambala,100
haryana,panipat,200
haryana,gurgaon,300
punjab,amritsar,150
punjab,ludhiana,400
punjab,jalandhar,250
maharashtra,mumbai,1000
maharashtra,pune,600
maharashtra,nagpur,300
karnataka,bangalore,900


In [0]:
# Window specifications that rank the cities of each state by population,
# once from largest to smallest and once from smallest to largest.
# The schema declares population as nullable, and Spark's default ascending
# order sorts NULLs first; ordering with NULLs last in both windows keeps a
# city with an unknown population from taking rank 1 whenever the state has
# at least one city with a known population.
window_desc = Window.partitionBy("state").orderBy(col("population").desc_nulls_last())
window_asc = Window.partitionBy("state").orderBy(col("population").asc_nulls_last())

# Add both rank columns; rank() assigns the same rank to tied populations,
# so several cities of one state can share rank 1.
df_with_ranks = (
    df.withColumn("rno_desc", rank().over(window_desc))
      .withColumn("rno_asc", rank().over(window_asc))
)

# Keep only the rank-1 rows, then collapse them to one row per state.
# when() without otherwise() yields NULL for non-matching rows, which the
# max/min aggregates skip. Note the output columns hold city NAMES: the
# largest city in max_population and the smallest city in min_population.
# If several cities tie, max/min pick one of them by string comparison.
result_df = (
    df_with_ranks
    .filter((col("rno_desc") == 1) | (col("rno_asc") == 1))
    .groupBy("state")
    .agg(
        spark_max(when(col("rno_desc") == 1, col("city"))).alias("max_population"),
        spark_min(when(col("rno_asc") == 1, col("city"))).alias("min_population"),
    )
)

# Show the result
result_df.display()


state,max_population,min_population
haryana,gurgaon,ambala
karnataka,bangalore,mangalore
maharashtra,mumbai,nagpur
punjab,ludhiana,amritsar


In [0]:
# Register the DataFrame as the temporary view "city_population" so it can be queried by name from SQL
df.createOrReplaceTempView("city_population")

In [0]:
%sql
-- For each state, return the name of the most populous city (max_population)
-- and of the least populous city (min_population).
-- The cte ranks every city within its state in both directions. NULLS LAST is
-- spelled out because ascending order puts NULLs first by default, which
-- would give a city with an unknown population rno_asc = 1.
with cte as (
  select
    *,
    rank() over (partition by state order by population desc nulls last) as rno_desc,
    rank() over (partition by state order by population asc nulls last) as rno_asc
  from city_population
)
-- Only rank-1 rows survive the filter; the CASE expressions yield NULL for the
-- row that belongs to the other extreme, and max/min skip NULLs.
select
  state,
  max(case when rno_desc = 1 then city else null end) as max_population,
  min(case when rno_asc = 1 then city else null end) as min_population
from cte
where rno_asc = 1 or rno_desc = 1
group by state

state,max_population,min_population
haryana,gurgaon,ambala
karnataka,bangalore,mangalore
maharashtra,mumbai,nagpur
punjab,ludhiana,amritsar
