In [28]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg

In [29]:
# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("PySpark Tutorial")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

24/11/28 14:44:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [30]:
# Print the Python executable recorded on the session's SparkContext.
print(spark.sparkContext.pythonExec)

python3


In [31]:
# Column names for the sample records, followed by the records themselves
# as (name, age) tuples.
columns = ["Name", "Age"]
data = [
    ("John", 28),
    ("Anna", 23),
    ("Peter", 34),
]

In [32]:
# Build the base DataFrame from the in-memory records; column types are
# inferred from the Python values and the names come from `columns`.
df = spark.createDataFrame(data=data, schema=columns)

In [33]:
# Show the column names, types and nullability of the base DataFrame.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [34]:
# Display the rows of the base DataFrame as a text table.
df.show()

                                                                                

+-----+---+
| Name|Age|
+-----+---+
| John| 28|
| Anna| 23|
|Peter| 34|
+-----+---+



In [35]:
# Derive a "life_stage" label from Age:
#   Age < 13             -> "child"
#   13 <= Age <= 19      -> "teenager"  (between() includes both bounds)
#   anything else        -> "adult"
# The column is referenced as "Age", exactly as declared in the schema, so the
# lookup does not rely on Spark's case-insensitive column resolution.
# withColumn returns a new DataFrame; `df` itself is left unchanged.
df1 = df.withColumn(
    "life_stage",
    when(col("Age") < 13, "child")
    .when(col("Age").between(13, 19), "teenager")
    .otherwise("adult"),
)

In [36]:
# Show the schema of the derived DataFrame, including the new life_stage column.
df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- life_stage: string (nullable = false)



In [37]:
# Display the derived DataFrame with the computed life_stage values.
df1.show()

+-----+---+----------+
| Name|Age|life_stage|
+-----+---+----------+
| John| 28|     adult|
| Anna| 23|     adult|
|Peter| 34|     adult|
+-----+---+----------+



In [38]:
# Display the original DataFrame again: df1 was built as a separate DataFrame,
# so df still has only its original columns.
df.show()

+-----+---+
| Name|Age|
+-----+---+
| John| 28|
| Anna| 23|
|Peter| 34|
+-----+---+



In [39]:
# Keep only the rows whose life_stage is "teenager" or "adult" and display them.
df1.where(
    col("life_stage").isin("teenager", "adult")
).show()

+-----+---+----------+
| Name|Age|life_stage|
+-----+---+----------+
| John| 28|     adult|
| Anna| 23|     adult|
|Peter| 34|     adult|
+-----+---+----------+



In [40]:
# Extra records to append later; each row is [name, age, life_stage].
new_rows = [
    ['Sue', 11, 'child'],
    ['Adam', 15, 'teenager'],
]

In [41]:
# Each row in new_rows carries three fields (name, age, life_stage), so all
# three columns are named by reusing df1's column list. Passing the two-element
# `columns` list would leave the third field with an auto-generated placeholder
# name instead of "life_stage".
new_df = spark.createDataFrame(new_rows, df1.columns)

In [42]:
# Append the new rows to the existing DataFrame and display the result.
# NOTE: union() pairs columns by position, not by name, so new_df must list
# its fields in the same order as df1.
# NOTE: df1 is rebound to the combined result, so running this cell a second
# time appends the same rows again.
df1 = df1.union(new_df)
df1.show()

+-----+---+----------+
| Name|Age|life_stage|
+-----+---+----------+
| John| 28|     adult|
| Anna| 23|     adult|
|Peter| 34|     adult|
|  Sue| 11|     child|
| Adam| 15|  teenager|
+-----+---+----------+



In [43]:
# Repeat the life_stage filter now that df1 also contains the appended rows;
# only "teenager" and "adult" rows are displayed.
df1.where(
    col("life_stage").isin("teenager", "adult")
).show()

+-----+---+----------+
| Name|Age|life_stage|
+-----+---+----------+
| John| 28|     adult|
| Anna| 23|     adult|
|Peter| 34|     adult|
| Adam| 15|  teenager|
+-----+---+----------+



In [44]:
# Aggregate over the whole DataFrame: average of the age column across all rows.
# Without an alias the result column takes the default name "avg(age)".
df1.select(avg('age')).show()



+--------+
|avg(age)|
+--------+
|    22.2|
+--------+



                                                                                

In [45]:
# Same overall average as before, with the result column renamed through alias().
df1.select(
    avg('age').alias('age_average')
).show()

+-----------+
|age_average|
+-----------+
|       22.2|
+-----------+



In [46]:
# Group the rows by life_stage and compute the average age within each group.
(
    df1
    .groupBy("life_stage")
    .agg(avg('age').alias('age_average'))
    .show()
)

+----------+------------------+
|life_stage|       age_average|
+----------+------------------+
|     adult|28.333333333333332|
|     child|              11.0|
|  teenager|              15.0|
+----------+------------------+



In [47]:
# SQL form of the grouped average. The {df1} placeholder in the query text is
# bound to the DataFrame supplied through the df1 keyword argument.
spark.sql(
    "select life_stage, avg(age) as age_average from {df1} group by life_stage",
    df1=df1,
).show()

+----------+------------------+
|life_stage|       age_average|
+----------+------------------+
|     adult|28.333333333333332|
|     child|              11.0|
|  teenager|              15.0|
+----------+------------------+



In [48]:
# Count how many rows fall into each life_stage group.
df1.groupBy("life_stage").count().show()

+----------+-----+
|life_stage|count|
+----------+-----+
|     adult|    3|
|     child|    1|
|  teenager|    1|
+----------+-----+



In [49]:
# Sort by life_stage in descending order, referencing the column as an
# attribute of df1. Only life_stage is a sort key, so the relative order of
# rows that share the same life_stage is not fixed and may vary between runs.
df1.sort(df1.life_stage.desc()).show()

                                                                                

+-----+---+----------+
| Name|Age|life_stage|
+-----+---+----------+
| Adam| 15|  teenager|
|  Sue| 11|     child|
| Anna| 23|     adult|
|Peter| 34|     adult|
| John| 28|     adult|
+-----+---+----------+



In [50]:
# Same descending sort on life_stage, written with orderBy() and col().
# Rows with equal life_stage values have no guaranteed relative order.
df1.orderBy(col('life_stage').desc()).show()

+-----+---+----------+
| Name|Age|life_stage|
+-----+---+----------+
| Adam| 15|  teenager|
|  Sue| 11|     child|
|Peter| 34|     adult|
| Anna| 23|     adult|
| John| 28|     adult|
+-----+---+----------+



In [51]:
# Overall average age computed through SQL; {df1} is bound to the df1 DataFrame
# passed as a keyword argument.
spark.sql(
    "select avg(age) as age_average from {df1}",
    df1=df1,
).show()

+-----------+
|age_average|
+-----------+
|       22.2|
+-----------+



In [52]:
# Average age per life_stage through SQL, without an alias: the aggregate
# column keeps its default name "avg(age)".
spark.sql(
    "select life_stage, avg(age) from {df1} group by life_stage",
    df1=df1,
).show()

+----------+------------------+
|life_stage|          avg(age)|
+----------+------------------+
|     adult|28.333333333333332|
|     child|              11.0|
|  teenager|              15.0|
+----------+------------------+



In [53]:
# Aliasing: the same grouped query, with the aggregate column renamed to
# age_average via "as".
spark.sql(
    "select life_stage, avg(age) as age_average from {df1} group by life_stage",
    df1=df1,
).show()

+----------+------------------+
|life_stage|       age_average|
+----------+------------------+
|     adult|28.333333333333332|
|     child|              11.0|
|  teenager|              15.0|
+----------+------------------+



In [54]:
# End the session: stop the Spark session created at the top of the notebook.
# DataFrames built from it (df, df1, new_df) should not be used after this point.
spark.stop()