In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [10]:
# Column layout applied when reading the orders CSV: two integer ids and a
# float amount, all declared nullable (third StructField argument is True).
schema = StructType(
    [
        StructField("customerID", IntegerType(), True),
        StructField("orderID", IntegerType(), True),
        StructField("amount", FloatType(), True),
    ]
)

In [11]:
# Reuse an already-running session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

# Load the orders file using the explicit schema instead of inferring types.
customers = spark.read.csv("./customer-orders.csv", schema=schema)

In [12]:
# Sanity check: print the column names, types and nullability of the loaded frame.
customers.printSchema()

root
 |-- customerID: integer (nullable = true)
 |-- orderID: integer (nullable = true)
 |-- amount: float (nullable = true)



In [18]:
# Keep only the two columns the aggregation needs; orderID is dropped here.
amount_spent_by_customers = customers.select(
    func.col("customerID"),
    func.col("amount"),
)

In [19]:
# Total spend per customer, rounded to 2 decimal places.
#
# The aggregation is built from `customers` rather than from
# `amount_spent_by_customers` itself. Rebinding the name from its own previous
# value made this cell fail on a second run, because after the first run the
# frame no longer has an `amount` column to sum. Grouping `customers` directly
# yields the same rows and keeps the cell safe to re-execute.
amount_spent_by_customers = (
    customers
    .groupBy("customerID")
    .agg(func.round(func.sum("amount"), 2).alias("total_amount"))
)

In [20]:
# Order customers from the smallest total spend to the largest.
amount_spent_by_customers = amount_spent_by_customers.orderBy(
    "total_amount", ascending=True
)

In [21]:
# Materialise the sorted totals as a list of Row objects.
finalResult = amount_spent_by_customers.collect()

# Emit one Row per line, in the sorted order.
for customer_total in finalResult:
    print(customer_total)

Row(customerID=45, total_amount=3309.38)
Row(customerID=79, total_amount=3790.57)
Row(customerID=96, total_amount=3924.23)
Row(customerID=23, total_amount=4042.65)
Row(customerID=99, total_amount=4172.29)
Row(customerID=75, total_amount=4178.5)
Row(customerID=36, total_amount=4278.05)
Row(customerID=98, total_amount=4297.26)
Row(customerID=47, total_amount=4316.3)
Row(customerID=77, total_amount=4327.73)
Row(customerID=13, total_amount=4367.62)
Row(customerID=48, total_amount=4384.33)
Row(customerID=49, total_amount=4394.6)
Row(customerID=94, total_amount=4475.57)
Row(customerID=67, total_amount=4505.79)
Row(customerID=50, total_amount=4517.27)
Row(customerID=78, total_amount=4524.51)
Row(customerID=5, total_amount=4561.07)
Row(customerID=57, total_amount=4628.4)
Row(customerID=83, total_amount=4635.8)
Row(customerID=91, total_amount=4642.26)
Row(customerID=74, total_amount=4647.13)
Row(customerID=84, total_amount=4652.94)
Row(customerID=3, total_amount=4659.63)
Row(customerID=12, tota

In [22]:
# Shut down the Spark session now that the results have been printed.
spark.stop()