In [13]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [14]:
spark

In [15]:
orders_df = spark.read.\
format("csv").\
option("header","true").\
option("inferSchema", "True").\
load("/public/trendytech/orders_wh/*")

In [16]:
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [17]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [22]:
orders_df.createOrReplaceTempView("orders")

## Higher Level API's Demo

#### 1. Top 15 customers who placed the most number of orders

In [18]:
result = orders_df.groupBy("customer_id").count().sort("count", ascending = False).limit(15)

In [19]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|       5624|   15|
|      12284|   15|
|       4320|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|        791|   14|
|       4517|   14|
|       3708|   14|
|       8652|   14|
|       4249|   14|
+-----------+-----+



In [23]:
result = spark.sql("select customer_id, count(order_id) as count from orders group by customer_id order by count desc limit 15")

In [25]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|        569|   16|
|       5897|   16|
|      12431|   16|
|       6316|   16|
|      12284|   15|
|       5654|   15|
|        221|   15|
|       5624|   15|
|       4320|   15|
|       5283|   15|
|       4517|   14|
|       3708|   14|
|       4249|   14|
|       3710|   14|
|        791|   14|
+-----------+-----+



#### 2. Find the number of of orders under each order status

In [None]:
result = orders_df.groupBy("order_status").count()

In [26]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|      12431|   16|
|       5897|   16|
|        569|   16|
|       6316|   16|
|       4320|   15|
|        221|   15|
|      12284|   15|
|       5624|   15|
|       5283|   15|
|       5654|   15|
|       4249|   14|
|       3708|   14|
|      11689|   14|
|       3710|   14|
|        791|   14|
+-----------+-----+



In [31]:
result = spark.sql("select order_status, count(order_id) as count from orders group by order_status")

In [32]:
result.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



#### 3. No of active customers who placed atleast one order

In [33]:
result = orders_df.select("customer_id").distinct().count()

In [35]:
print(result)

12405


In [37]:
result = spark.sql("select count(distinct (customer_id)) as active_customers from orders")

In [38]:
print(result)

+----------------+
|active_customers|
+----------------+
|           12405|
+----------------+



#### 4. Customer with most number of closed orders

In [42]:
result = orders_df.filter("order_status = 'CLOSED'").groupBy ('customer_id').count().sort("count", ascending = False)

In [43]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7948|    4|
|       2768|    4|
|      10263|    4|
|       7850|    4|
|       2403|    4|
|        437|    4|
|       4573|    4|
|       3631|    4|
|      12431|    4|
|       1521|    4|
|      10111|    4|
|       7879|    4|
|      10018|    4|
|       5319|    4|
|       2236|    4|
|       2774|    4|
+-----------+-----+
only showing top 20 rows



In [45]:
result= spark.sql("select customer_id, count(order_id) as count from orders where order_status = 'CLOSED' group by customer_id order by count desc")

In [46]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7948|    4|
|       2768|    4|
|      10263|    4|
|       2236|    4|
|       2403|    4|
|        437|    4|
|       4573|    4|
|       7850|    4|
|      12431|    4|
|       1521|    4|
|      10111|    4|
|       7879|    4|
|      10018|    4|
|       5319|    4|
|       2774|    4|
|       3631|    4|
+-----------+-----+
only showing top 20 rows

