In [39]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession.builder \
    .appName("Shubham-M") \
    .master("yarn") \
    .config("spark.ui.port", "0") \
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()

In [40]:
spark

In [41]:
orders_df = spark.read.csv("/public/trendytech/orders_wh/orders_wh.csv", header = "true", inferSchema = "true")
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [42]:
spark.sql("use malaiDB")

In [43]:
orders_df.createOrReplaceTempView("orders_table")

In [44]:
spark.sql("show tables like 'orders*'")

database,tableName,isTemporary
malaidb,orders,False
malaidb,orders_external,False
,orders_table,True


In [14]:
spark.sql("select * from orders_table")

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:...,11599,CLOSED
2,2013-07-25 00:00:...,256,PENDING_PAYMENT
3,2013-07-25 00:00:...,12111,COMPLETE
4,2013-07-25 00:00:...,8827,CLOSED
5,2013-07-25 00:00:...,11318,COMPLETE
6,2013-07-25 00:00:...,7130,COMPLETE
7,2013-07-25 00:00:...,4530,COMPLETE
8,2013-07-25 00:00:...,2911,PROCESSING
9,2013-07-25 00:00:...,5657,PENDING_PAYMENT
10,2013-07-25 00:00:...,5648,PENDING_PAYMENT


#### 1.Fetch top 15 customers who placed most no of orders

In [21]:
orders_df.groupBy('customer_id').count().sort('count', ascending = False).show(15)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|        569|   16|
|       5897|   16|
|      12431|   16|
|       6316|   16|
|        221|   15|
|       4320|   15|
|       5654|   15|
|      12284|   15|
|       5283|   15|
|       5624|   15|
|       3708|   14|
|       4517|   14|
|       6248|   14|
|       3710|   14|
|        791|   14|
+-----------+-----+
only showing top 15 rows



In [24]:
spark.sql("select customer_id, count(order_id) as count from orders_table group by customer_id order by count  desc").show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|        569|   16|
|      12431|   16|
|       6316|   16|
|       5654|   15|
|       5624|   15|
|      12284|   15|
|        221|   15|
|       4320|   15|
|       5283|   15|
|       4116|   14|
|       3710|   14|
|       4517|   14|
|       3708|   14|
|       6248|   14|
|      10591|   14|
|       8652|   14|
|      11689|   14|
|        791|   14|
|       5821|   14|
+-----------+-----+
only showing top 20 rows



#### 2. Find the no of orders under each status

In [26]:
orders_df.select('order_status').groupBy('order_status').count().show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



In [28]:
spark.sql("select order_status, count(order_status) from orders_table group by order_status").show()

+---------------+-------------------+
|   order_status|count(order_status)|
+---------------+-------------------+
|PENDING_PAYMENT|              15030|
|       COMPLETE|              22899|
|        ON_HOLD|               3798|
| PAYMENT_REVIEW|                729|
|     PROCESSING|               8275|
|         CLOSED|               7556|
|SUSPECTED_FRAUD|               1558|
|        PENDING|               7610|
|       CANCELED|               1428|
+---------------+-------------------+



#### 3. No of active customers (who have placed atleast one order)

In [29]:
orders_df.select('customer_id').distinct().count()

12405

In [31]:
spark.sql("select count(distinct(customer_id)) from orders_table").show()

+---------------------------+
|count(DISTINCT customer_id)|
+---------------------------+
|                      12405|
+---------------------------+



#### 4. Customers with most number of CLOSED orders.

In [53]:
result = orders_df.filter("order_status = 'CLOSED'").groupBy('customer_id').count().sort('count', ascending =False).show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7948|    4|
|       2768|    4|
|      10263|    4|
|       2236|    4|
|       2403|    4|
|       7879|    4|
|       4573|    4|
|       7850|    4|
|      12431|    4|
|       1521|    4|
|      10111|    4|
|        437|    4|
|      10018|    4|
|       5319|    4|
|       2774|    4|
|       3631|    4|
+-----------+-----+
only showing top 20 rows



In [58]:
spark.sql("select customer_id, count(order_status) as count from orders_table where order_status = 'CLOSED' group by customer_id order by count desc").show()

customer_id,count
1833,6
1363,5
1687,5
5493,5
7948,4
2768,4
10263,4
2236,4
2403,4
7879,4
