In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession \
    .builder \
    .appName('Spark++ Application') \
    .getOrCreate()

## Load datasets

In [3]:
df_products = spark.read.parquet('../data/spark_data/DatasetToCompleteTheSixSparkExercises/products_parquet/')
df_sellers = spark.read.parquet('../data/spark_data/DatasetToCompleteTheSixSparkExercises/sellers_parquet/')
df_sales = spark.read.parquet('../data/spark_data/DatasetToCompleteTheSixSparkExercises/sales_parquet/')

In [4]:
datasets = [df_products, df_sellers, df_sales]

In [5]:
for dataset in datasets:
    dataset.printSchema()
    dataset.show(3)

root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: string (nullable = true)

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
+----------+------------+-----+
only showing top 3 rows

root
 |-- seller_id: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- daily_target: string (nullable = true)

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|     2500000|
|        1|   seller_1|      257237|
|        2|   seller_2|      754188|
+---------+-----------+------------+
only showing top 3 rows

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- num_pieces_sold: string (nullable =

## How many distinct products are sold each day?

In [6]:
df_distinct_products = df_sales.groupby(col("date")).agg(countDistinct(col("product_id")).alias("distinct_products_sold")).orderBy(col("distinct_products_sold").desc())

In [7]:
df_distinct_products.printSchema()
df_distinct_products.show()

root
 |-- date: string (nullable = true)
 |-- distinct_products_sold: long (nullable = false)

+----------+----------------------+
|      date|distinct_products_sold|
+----------+----------------------+
|2020-07-06|                100765|
|2020-07-09|                100501|
|2020-07-01|                100337|
|2020-07-03|                100017|
|2020-07-02|                 99807|
|2020-07-05|                 99796|
|2020-07-04|                 99791|
|2020-07-07|                 99756|
|2020-07-08|                 99662|
|2020-07-10|                 98973|
+----------+----------------------+



In [8]:
df_sales.createOrReplaceTempView("PRODUCT_DATA")

In [9]:
products_sale = spark.sql('SELECT date, Count(DISTINCT product_Id) from PRODUCT_DATA GROUP BY date ORDER BY Count(DISTINCT product_Id)')
products_sale.printSchema()
# products_sale.show(3)

root
 |-- date: string (nullable = true)
 |-- count(DISTINCT product_Id): long (nullable = false)



__Number of distinct products sold each day: ???__

## What is the average revenue of the orders?

In [10]:
joinExpression = df_products["product_id"] == df_sales['product_id']
joinType = 'inner'
df_join_products = df_products.join(df_sales, joinExpression, joinType).drop(df_products['product_id'])

In [11]:
df_join_products.printSchema()
# df_join_products.show(3)

root
 |-- product_name: string (nullable = true)
 |-- price: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- num_pieces_sold: string (nullable = true)
 |-- bill_raw_text: string (nullable = true)



In [12]:
df_revenue_per_product = df_join_products.withColumn('revenue_per_product', df_join_products['price'] * df_join_products['num_pieces_sold'])

In [13]:
df_revenue_per_product.printSchema()
# df_revenue_per_product.show(3)

root
 |-- product_name: string (nullable = true)
 |-- price: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- num_pieces_sold: string (nullable = true)
 |-- bill_raw_text: string (nullable = true)
 |-- revenue_per_product: double (nullable = true)



In [16]:
df_total_revenue = df_revenue_per_product.groupby(col('order_id')).agg(sum(col('revenue_per_product')).alias("total_revenue"))

In [17]:
df_total_revenue.printSchema()
# df_total_revenue.show(3)

root
 |-- order_id: string (nullable = true)
 |-- total_revenue: double (nullable = true)



In [18]:
sum_total_revenue = df_total_revenue.groupby().agg(sum('total_revenue'))

In [19]:
sum_total_revenue.printSchema()
# sum_total_revenue.show()

root
 |-- sum(total_revenue): double (nullable = true)



In [23]:
order_count = sum_total_revenue.groupby().agg(count('total_order'))

AnalysisException: cannot resolve '`total_order`' given input columns: [sum(total_revenue)];;
'Aggregate [count('total_order) AS count(total_order)#193]
+- Aggregate [sum(total_revenue#183) AS sum(total_revenue)#189]
   +- Aggregate [order_id#12], [order_id#12, sum(revenue_per_product#139) AS total_revenue#183]
      +- Project [product_name#1, price#2, order_id#12, product_id#13, seller_id#14, date#15, num_pieces_sold#16, bill_raw_text#17, (cast(price#2 as double) * cast(num_pieces_sold#16 as double)) AS revenue_per_product#139]
         +- Project [product_name#1, price#2, order_id#12, product_id#13, seller_id#14, date#15, num_pieces_sold#16, bill_raw_text#17]
            +- Join Inner, (product_id#0 = product_id#13)
               :- Relation[product_id#0,product_name#1,price#2] parquet
               +- Relation[order_id#12,product_id#13,seller_id#14,date#15,num_pieces_sold#16,bill_raw_text#17] parquet


In [None]:
order_count.printSchema()
# total_revenue.show()

In [21]:
avg_revenue = df_total_order.groupBy().mean('total_order')

NameError: name 'df_total_order' is not defined

In [22]:
avg_revenue.printSchema()

NameError: name 'avg_revenue' is not defined

__Average revenue per order: $1246.13__

## What is the average daily revenue of each product?