In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import col,sum

# Start (or reuse) a Spark session for this notebook, running locally with
# master "local[2]".
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Users table: integer id plus name, both declared non-nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=False),
    ]
)

# Sample rows as (id, name) tuples.
data = [
    (1, "Alice"),
    (2, "Bob"),
    (3, "Alex"),
    (4, "Donald"),
    (7, "Lee"),
    (13, "Jonathan"),
    (19, "Elvis"),
]

users = spark.createDataFrame(data, schema=schema)
users.show()


+---+--------+
| id|    name|
+---+--------+
|  1|   Alice|
|  2|     Bob|
|  3|    Alex|
|  4|  Donald|
|  7|     Lee|
| 13|Jonathan|
| 19|   Elvis|
+---+--------+



In [0]:
# Trips table: trip id, the id of the user who made the trip, and its distance.
# All three columns are integers and declared non-nullable.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("user_id", IntegerType(), nullable=False),
        StructField("distance", IntegerType(), nullable=False),
    ]
)

# Sample rows as (id, user_id, distance) tuples; a user may appear several times.
data = [
    (1, 1, 120),
    (2, 2, 317),
    (3, 3, 222),
    (4, 7, 100),
    (5, 13, 312),
    (6, 19, 50),
    (7, 7, 120),
    (8, 19, 400),
    (9, 7, 230),
]

trips = spark.createDataFrame(data, schema=schema)
trips.show()


+---+-------+--------+
| id|user_id|distance|
+---+-------+--------+
|  1|      1|     120|
|  2|      2|     317|
|  3|      3|     222|
|  4|      7|     100|
|  5|     13|     312|
|  6|     19|      50|
|  7|      7|     120|
|  8|     19|     400|
|  9|      7|     230|
+---+-------+--------+



In [0]:
# Task: report the total distance travelled by each user.
# Sort the result by travelled_distance in descending order; when two or more
# users travelled the same distance, sort them by name in ascending order.

# Total distance per user_id. `sum` here is pyspark.sql.functions.sum
# (imported at the top of the notebook), not the Python builtin.
distance_per_user = trips.groupBy("user_id").agg(
    sum("distance").alias("travelled_distance")
)

# The right join keeps every row of `users`, including users with no trips.
users_with_distance = distance_per_user.join(
    users,
    distance_per_user["user_id"] == users["id"],
    "right",
)

# Users without trips have a null total after the join; report it as 0.
report = (
    users_with_distance
    .select(users["name"], "travelled_distance")
    .fillna(0, ["travelled_distance"])
    .orderBy(col("travelled_distance").desc(), col("name"))
)
report.show()

+--------+------------------+
|    name|travelled_distance|
+--------+------------------+
|   Elvis|               450|
|     Lee|               450|
|     Bob|               317|
|Jonathan|               312|
|    Alex|               222|
|   Alice|               120|
|  Donald|                 0|
+--------+------------------+



In [0]:
# SQL version of the report: total distance per user, with 0 for users that
# have no trips, sorted by distance (descending) and then name (ascending).
users.createOrReplaceTempView("u")
trips.createOrReplaceTempView("t")

# The sort keys are written by name instead of by select-list position, so the
# ordering does not depend on the spark.sql.orderByOrdinal session setting and
# stays correct if the select list is ever reordered. The right join keeps
# users without trips; coalesce turns their null sum into 0. Grouping by u.id
# as well as u.name keeps users that share a name on separate rows.
travelled_distance_sql = """
    select u.name,
           coalesce(sum(t.distance), 0) as travelled_distance
    from t
    right join u on t.user_id = u.id
    group by u.id, u.name
    order by travelled_distance desc, name
"""
spark.sql(travelled_distance_sql).show()

+--------+------------------+
|    name|travelled_distance|
+--------+------------------+
|   Elvis|               450|
|     Lee|               450|
|     Bob|               317|
|Jonathan|               312|
|    Alex|               222|
|   Alice|               120|
|  Donald|                 0|
+--------+------------------+



In [0]:
# Stop the Spark session created in the first cell; cells that use `spark`
# cannot be re-run after this without creating the session again.
spark.stop()