In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, avg, min, max

# Make PySpark use the interpreter from the "bigdataenv" conda environment.
# The path below is specific to this machine; both variables get the same value.
bigdataenv_python = r"C:\Users\subba\anaconda3\envs\bigdataenv\python.exe"
os.environ.update(
    PYSPARK_PYTHON=bigdataenv_python,
    PYSPARK_DRIVER_PYTHON=bigdataenv_python,
)

# Build (or reuse) the Spark session for this script.
# Configured with "2g" of driver memory and Python worker reuse set to "false".
spark = (
    SparkSession.builder
    .appName("BasicAggregations")
    .config("spark.driver.memory", "2g")
    .config("spark.python.worker.reuse", "false")
    .getOrCreate()
)

# Sample data: one (name, age, department) tuple per person.
data = [
    ("Alice", 23, "HR"),
    ("Bob", 30, "IT"),
    ("Charlie", 28, "IT"),
    ("David", 35, "HR"),
    ("Eva", 40, "Finance"),
]
columns = ["name", "age", "department"]

# Everything that uses the session runs inside try/finally so that
# spark.stop() is always reached, even if DataFrame creation or one of the
# actions below raises. Otherwise a failure would leave the session running.
try:
    df = spark.createDataFrame(data, columns)

    print("✅ Original DataFrame:")
    df.show()

    # --- Aggregations ---
    # NOTE: `sum` and `avg` here are the pyspark.sql.functions aggregates
    # imported at the top of the file (that import shadows the builtin `sum`).
    print("\n✅ Calculate SUM and AVERAGE of 'age':")
    df.agg(
        sum("age").alias("total_age"),
        avg("age").alias("average_age"),
    ).show()

    # --- Grouped Aggregations ---
    print("\n✅ Average age per department:")
    df.groupBy("department").agg(
        avg("age").alias("avg_age"),
    ).show()
finally:
    # Release the session's resources regardless of success or failure.
    spark.stop()


✅ Original DataFrame:
+-------+---+----------+
|   name|age|department|
+-------+---+----------+
|  Alice| 23|        HR|
|    Bob| 30|        IT|
|Charlie| 28|        IT|
|  David| 35|        HR|
|    Eva| 40|   Finance|
+-------+---+----------+


✅ Calculate SUM and AVERAGE of 'age':
+---------+-----------+
|total_age|average_age|
+---------+-----------+
|      156|       31.2|
+---------+-----------+


✅ Average age per department:
+----------+-------+
|department|avg_age|
+----------+-------+
|        HR|   29.0|
|        IT|   29.0|
|   Finance|   40.0|
+----------+-------+

