In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/itv020649/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

In [3]:
orderDf = spark.read \
.format("csv") \
.option("inferSchema","true") \
.option("header","true") \
.load("/public/trendytech/orders_wh/*")

In [4]:
orderDf.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [5]:
orderDf.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



## Higher Level API demo

In [6]:
orderDf.createOrReplaceTempView("orders")

In [7]:
orderDf.createOrReplaceGlobalTempView("orders")

#### 1. find top 15 customer who place most no. of order

In [8]:
result = orderDf.groupBy("customer_id").count().sort("count",ascending = False).limit(15)

In [9]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



In [10]:
result = spark.sql("select customer_id, count(order_id) as count from orders group by customer_id order by count desc limit 15")

In [11]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



#### 2. no. of orders under esch order status

In [15]:
orderDf.groupby("order_status").count()

order_status,count
PENDING_PAYMENT,15030
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PROCESSING,8275
CLOSED,7556
SUSPECTED_FRAUD,1558
PENDING,7610
CANCELED,1428


#### 3. no. of active customer(who place atleast one order)

In [24]:
orderDf.select("customer_id").distinct().count()
#select is like MAP
# count is transformation or action groupby+count = xfr, count=action

12405

#### 4. customers with most no. of closed orders

In [19]:
orderDf.filter("order_status = 'CLOSED'").groupby("customer_id").count().sort("count",ascending = False).limit(10)

customer_id,count
1833,6
1363,5
1687,5
5493,5
9804,4
2236,4
7850,4
12431,4
2403,4
1443,4


In [22]:
spark.sql("select customer_id,count(*) as count from orders where order_status = 'CLOSED' group by 1 order by count desc limit 10")

customer_id,count
1833,6
5493,5
1363,5
1687,5
8630,4
7879,4
7948,4
5319,4
9804,4
9830,4
