In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg
# Build a SparkSession for the application "demo" (or reuse one that already
# exists), with the Spark web UI port configured as 4041.
spark = (
    SparkSession.builder
    .appName("demo")
    .config("spark.ui.port", "4041")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark

In [10]:
# Build a small in-memory Spark DataFrame: one (first_name, age) tuple per row,
# with the column names supplied as the schema.
print("Spark DataFrame")
df = spark.createDataFrame(
    data=[
        ("Franzi", 25),
        ("Srirag", 36),
        ("Nishtha", 26),
        ("Nico", 24),
        ("David", 26),
    ],
    schema=["first_name", "age"],
)
df.show()

Spark DataFrame
+----------+---+
|first_name|age|
+----------+---+
|    Franzi| 25|
|    Srirag| 36|
|   Nishtha| 26|
|      Nico| 24|
|     David| 26|
+----------+---+



In [11]:
# Derive a "life_stage" label from "age" and attach it as a new column.
# Rules are checked in order: age < 13 -> "child"; 13 <= age <= 25 (both ends
# inclusive) -> "teenager"; any row matching neither rule -> "adult".
# NOTE(review): the "teenager" range runs up to 25 — confirm this is intentional.
print("Spark DataFrame with new column Life Stage")
df1 = df.withColumn(
    "life_stage",
    when(col("age") < 13, "child")
    .when((col("age") >= 13) & (col("age") <= 25), "teenager")
    .otherwise("adult"),
)
df1.show()

Spark DataFrame with new column Life Stage
+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|    Franzi| 25|  teenager|
|    Srirag| 36|     adult|
|   Nishtha| 26|     adult|
|      Nico| 24|  teenager|
|     David| 26|     adult|
+----------+---+----------+



In [12]:
# Keep only the rows whose life_stage is "teenager" or "adult" and display them.
print("Filtered DataFrame with only teenagers and adults")
df1.filter(col("life_stage").isin("teenager", "adult")).show()

Filtered DataFrame with only teenagers and adults
+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|    Franzi| 25|  teenager|
|    Srirag| 36|     adult|
|   Nishtha| 26|     adult|
|      Nico| 24|  teenager|
|     David| 26|     adult|
+----------+---+----------+



In [13]:
# Aggregate over the whole DataFrame (no grouping): mean of the "age" column.
print("Compute the average age")
df1.agg(avg("age")).show()

Compute the average age
+--------+
|avg(age)|
+--------+
|    27.4|
+--------+



In [14]:
# Group rows by life_stage and average "age" within each group.
print("Compute average age for life stage")
df1.groupBy("life_stage").avg("age").show()

Compute average age for life stage
+----------+------------------+
|life_stage|          avg(age)|
+----------+------------------+
|  teenager|              24.5|
|     adult|29.333333333333332|
+----------+------------------+



In [15]:
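# Left disabled so the session stays available for further cells.
# Uncomment to stop the Spark session once you are done with the notebook.
# spark.stop()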