In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.functions import col, when, avg, count, rand, lit, round as spark_round

# Column layout of the sample table, given as (name, Spark type) pairs in
# column order. Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("score", FloatType()),
        ("city", StringType()),
    )
])

# Ten sample rows; tuple positions follow the schema: id, name, age, score, city.
data = [
    (1, "Alice", 23, 88.5, "New York"),
    (2, "Bob", 31, 76.0, "Los Angeles"),
    (3, "Charlie", 29, 92.3, "Chicago"),
    (4, "David", 35, 67.8, "Houston"),
    (5, "Eva", 22, 81.2, "Phoenix"),
    (6, "Frank", 28, 73.4, "Philadelphia"),
    (7, "Grace", 27, 89.1, "San Antonio"),
    (8, "Helen", 24, 95.0, "San Diego"),
    (9, "Ian", 30, 78.6, "Dallas"),
    (10, "Jane", 26, 85.7, "San Jose"),
]

# Materialize the rows as a DataFrame with the explicit schema above.
# `spark` is not defined in this cell; presumably the notebook-provided session.
df = spark.createDataFrame(data=data, schema=schema)

# Add a pseudo-random bonus: rand() draws from [0.0, 1.0), which is scaled by
# 10 and rounded to 2 decimals.
# The seed is fixed so the bonuses -- and therefore the high-performer filter
# below and everything derived from it -- are repeatable from run to run.
# NOTE: Spark documents rand() as non-deterministic in the general case, so
# values may still differ if the DataFrame's partitioning changes.
df = df.withColumn("bonus", spark_round(rand(seed=42) * 10, 2))

# Categorize age groups: < 25 -> "Young", 25-29 -> "Adult", >= 30 -> "Senior".
# Branches are tried in order, so "Adult" only needs its upper bound.
# "Senior" is an explicit condition rather than a catch-all `otherwise`:
# with no `otherwise`, rows matching no branch get null, so a null age yields
# a null age_group instead of being mislabelled "Senior".
df = df.withColumn(
    "age_group",
    when(col("age") < 25, "Young")
    .when(col("age") < 30, "Adult")
    .when(col("age") >= 30, "Senior"),
)

# Calculate adjusted score: base score plus bonus, rounded to 2 decimals.
df = df.withColumn("adjusted_score", spark_round(col("score") + col("bonus"), 2))

# Filter for high performers: adjusted score strictly above 90.
high_performers = df.filter(col("adjusted_score") > 90)

# Per-city statistics computed over every row of df (not only the high
# performers): mean adjusted score and number of rows.
city_avg = (
    df.groupBy("city")
    .agg(
        avg("adjusted_score").alias("avg_adjusted_score"),
        count("*").alias("num_people"),
    )
)

# Attach the city statistics to each high performer and flag cities that have
# more than one person. The left join keeps every high performer even when no
# city row matches; num_people is then null and `otherwise` makes the flag
# False rather than null.
joined = (
    high_performers
    .join(city_avg, on="city", how="left")
    .withColumn(
        "large_city",
        when(col("num_people") > 1, lit(True)).otherwise(lit(False)),
    )
)

# Keep only the reporting columns, then sort with the best adjusted score first.
final = joined.select(
    "id", "name", "city", "adjusted_score", "avg_adjusted_score", "large_city"
)
final = final.orderBy(col("adjusted_score").desc())

# `display` is not defined in this cell; presumably the notebook's rendering helper.
display(final)