In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Row, Window

In [50]:
spark = SparkSession.builder \
    .master("local") \
    .appName("t") \
    .getOrCreate()

In [51]:
 product_df = spark.read.parquet("DatasetToCompleteTheSixSparkExercises/products_parquet")

In [52]:
product_df.show()

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
|         5|   product_5|  128|
|         6|   product_6|   66|
|         7|   product_7|  145|
|         8|   product_8|   51|
|         9|   product_9|   44|
|        10|  product_10|   53|
|        11|  product_11|   13|
|        12|  product_12|  104|
|        13|  product_13|  102|
|        14|  product_14|   24|
|        15|  product_15|   14|
|        16|  product_16|   38|
|        17|  product_17|   72|
|        18|  product_18|   16|
|        19|  product_19|   46|
+----------+------------+-----+
only showing top 20 rows



In [53]:
 sale_df = spark.read.parquet("DatasetToCompleteTheSixSparkExercises/sales_parquet")

In [54]:
sale_df.show()

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|
|       8|         0|        0|2020-07-08|             79|sgldfgtcxufasnvsc...|
|       9|         0|        0|2020-07-10|             25|jnykelwjjebgkwgmu...|
|      10|         0|        0|2020-07-0

In [55]:
 saller_df = spark.read.parquet("DatasetToCompleteTheSixSparkExercises/sellers_parquet")

In [56]:
saller_df.show()

+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|     2500000|
|        1|   seller_1|      257237|
|        2|   seller_2|      754188|
|        3|   seller_3|      310462|
|        4|   seller_4|     1532808|
|        5|   seller_5|     1199693|
|        6|   seller_6|     1055915|
|        7|   seller_7|     1946998|
|        8|   seller_8|      547320|
|        9|   seller_9|     1318051|
+---------+-----------+------------+



### Task-2: How many distinct products are sold each day?

In [58]:
 g_df = sale_df.groupby(col("date")).agg(countDistinct(col("product_id")).alias("distinct_sold_products")).orderBy(
    col("distinct_sold_products").desc()).show()

+----------+----------------------+
|      date|distinct_sold_products|
+----------+----------------------+
|2020-07-06|                100765|
|2020-07-09|                100501|
|2020-07-01|                100337|
|2020-07-03|                100017|
|2020-07-02|                 99807|
|2020-07-05|                 99796|
|2020-07-04|                 99791|
|2020-07-07|                 99756|
|2020-07-08|                 99662|
|2020-07-10|                 98973|
+----------+----------------------+



### Task-3: What is the average revenue of the orders?

In [63]:
sale_df.join(product_df, sale_df["product_id"] == product_df["product_id"], "inner").agg(avg(product_df["price"] * sale_df["num_pieces_sold"])).alias("average_revenue_orders").show()

+------------------------------+
|avg((price * num_pieces_sold))|
+------------------------------+
|            1246.1338560822878|
+------------------------------+



### Task-4: What is the average daily revenue of each product?

In [66]:
sale_df.join(product_df, sale_df["product_id"] == product_df["product_id"], "inner").groupby(product_df['product_id'],sale_df["date"]).agg(avg(product_df["price"] * sale_df["num_pieces_sold"]).alias('Average revenue of orders by Product ID')).show()

+----------+----------+---------------------------------------+
|product_id|      date|Average revenue of orders by Product ID|
+----------+----------+---------------------------------------+
|  10000047|2020-07-07|                                  551.0|
|  10000715|2020-07-04|                                 3825.0|
|  10002110|2020-07-09|                                12125.0|
|  10004929|2020-07-03|                                 1394.0|
|  10005243|2020-07-04|                                 4312.0|
|  10005267|2020-07-06|                                 1387.0|
|  10005605|2020-07-08|                                   67.0|
|  10007641|2020-07-08|                                 4664.0|
|   1000879|2020-07-09|                                 1400.0|
|  10009135|2020-07-08|                                 1944.0|
|  10010167|2020-07-05|                                   69.0|
|  10010700|2020-07-06|                                 6210.0|
|  10011268|2020-07-03|                 

### Task-5: For each seller, what is the average % contribution of an order to the sellers daily quota?

In [67]:
print(sale_df.join(broadcast(saller_df), sale_df["seller_id"] == saller_df["seller_id"], "inner").withColumn(
    "ratio", sale_df["num_pieces_sold"]/saller_df["daily_target"]
).groupBy(sale_df["seller_id"]).agg(avg("ratio")).show())

+---------+--------------------+
|seller_id|          avg(ratio)|
+---------+--------------------+
|        0|2.019885898946922...|
|        7|2.595228787788170...|
|        3| 1.62888537056594E-4|
|        8|9.213030375408861E-5|
|        5|4.211073965904022E-5|
|        6|4.782147194369122E-5|
|        9|3.837913136180238E-5|
|        1|1.964233366461014...|
|        4|3.296428039825817E-5|
|        2|6.690408001060484E-5|
+---------+--------------------+

None


### Task-6: Who are the second most selling and the least selling persons (sellers) for each product? 

In [40]:
sales_table = sale_df.groupby(col("product_id"), col("seller_id")).agg(sum("num_pieces_sold").alias("num_pieces_sold")) 

In [41]:
window_desc = Window.partitionBy(col("product_id")).orderBy(col("num_pieces_sold").desc())
window_asc = Window.partitionBy(col("product_id")).orderBy(col("num_pieces_sold").asc())

In [42]:
sales_table = sales_table.withColumn("rank_asc", dense_rank().over(window_asc)). \
    withColumn("rank_desc", dense_rank().over(window_desc))

single_seller = sales_table.where(col("rank_asc") == col("rank_desc")).select(
    col("product_id").alias("single_seller_product_id"), col("seller_id").alias("single_seller_seller_id"),
    lit("Only seller or multiple sellers with the same results").alias("type")
)

In [69]:

second_seller = sales_table.where(col("rank_desc") == 2).select(
    col("product_id").alias("second_seller_product_id"), col("seller_id").alias("second_seller_seller_id"),
    lit("Second top seller").alias("type")
)





In [70]:
# Get the least sellers and exclude those rows that are already included in the first piece

least_seller = sales_table.where(col("rank_asc") == 1).select(
    col("product_id"), col("seller_id"),
    lit("Least Seller").alias("type")).join(single_seller, (sales_table["seller_id"] == single_seller["single_seller_seller_id"]) & (
        sales_table["product_id"] == single_seller["single_seller_product_id"]), "left_anti"). \
    join(second_seller, (sales_table["seller_id"] == second_seller["second_seller_seller_id"]) & (
        sales_table["product_id"] == second_seller["second_seller_product_id"]), "left_anti")

In [71]:
second_seller.show()


+------------------------+-----------------------+-----------------+
|second_seller_product_id|second_seller_seller_id|             type|
+------------------------+-----------------------+-----------------+
|                  100142|                      9|Second top seller|
|                10030330|                      5|Second top seller|
|                10031766|                      5|Second top seller|
|                10059280|                      7|Second top seller|
|                10063288|                      7|Second top seller|
|                10067998|                      9|Second top seller|
|                10074531|                      9|Second top seller|
|                10079600|                      9|Second top seller|
|                10090404|                      6|Second top seller|
|                10099820|                      3|Second top seller|
|                 1011418|                      4|Second top seller|
|                10140476|        

In [72]:
least_seller.show()

+----------+---------+------------+
|product_id|seller_id|        type|
+----------+---------+------------+
|  19986717|        1|Least Seller|
|  72017876|        1|Least Seller|
|   3534470|        3|Least Seller|
|  35669461|        4|Least Seller|
|  14542470|        5|Least Seller|
|  28592106|        5|Least Seller|
|  34681047|        5|Least Seller|
|  40496308|        5|Least Seller|
|  56011040|        5|Least Seller|
|  67723231|        5|Least Seller|
|  69790381|        5|Least Seller|
|  10978356|        7|Least Seller|
|  18182299|        7|Least Seller|
|  52606213|        7|Least Seller|
|  61475460|        7|Least Seller|
|  17944574|        8|Least Seller|
|  36269838|        8|Least Seller|
|  20774718|        9|Least Seller|
|  31136332|        9|Least Seller|
|  32602520|        9|Least Seller|
+----------+---------+------------+
only showing top 20 rows

