In [1]:
from pyspark.sql import SparkSession
import getpass

username = getpass.getuser()

spark = SparkSession.builder \
    .appName("Shubham-Malai") \
    .master("yarn") \
    .config("spark.ui.port", "0") \
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()


In [2]:
spark

In [3]:
base_rdd  = spark.sparkContext.textFile("/public/trendytech/orders/orders.csv")

In [4]:
base_rdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [5]:
"quetion 1 : count the orders in each order_status category"

'quetion 1 : count the orders in each order_status category'

In [6]:
mapped_rdd = base_rdd.map(lambda x:(x.split(",")[3],1))

In [7]:
mapped_rdd.take(5)

[('CLOSED', 1),
 ('PENDING_PAYMENT', 1),
 ('COMPLETE', 1),
 ('CLOSED', 1),
 ('COMPLETE', 1)]

In [8]:
reduced_rdd = mapped_rdd.reduceByKey(lambda x,y:x+y)

In [9]:
reduced_rdd.take(5)

[('PENDING', 9512500),
 ('PROCESSING', 10343750),
 ('CANCELED', 1785000),
 ('COMPLETE', 28623750),
 ('SUSPECTED_FRAUD', 1947500)]

In [10]:
"quetion 2 : Find top 10 customers with most number of orders"

'quetion 2 : Find top 10 customers with most number of orders'

In [11]:
mapped_rdd_q2 = base_rdd.map(lambda x:(x.split(",")[2],1))

In [12]:
mapped_rdd_q2.take(5)

[('11599', 1), ('256', 1), ('12111', 1), ('8827', 1), ('11318', 1)]

In [13]:
reduced_rdd_q2 = mapped_rdd_q2.reduceByKey(lambda x,y:x+y)

In [14]:
reduced_rdd_q2.take(5)

[('113', 8750), ('4534', 6250), ('57', 7500), ('2971', 2500), ('3126', 5000)]

In [15]:
sorted_cust_list = reduced_rdd_q2.sortBy(lambda x:x[1], False)

In [16]:
print('Top 10 customers according to the number of orderes placed')
sorted_cust_list.take(10)

Top 10 customers according to the number of orderes placed


[('5897', 20000),
 ('12431', 20000),
 ('6316', 20000),
 ('569', 20000),
 ('4320', 18750),
 ('5283', 18750),
 ('5654', 18750),
 ('5624', 18750),
 ('12284', 18750),
 ('221', 18750)]

In [17]:
"quetion 3 : Distinct count of customers who have placed atleast one order"

'quetion 3 : Distinct count of customers who have placed atleast one order'

In [18]:
mapped_rdd_q3 = base_rdd.map(lambda x:(x.split(",")[2])).distinct()

In [19]:
mapped_rdd_q3.take(5)

['656', '11644', '4320', '8763', '10655']

In [20]:
print(f'Distinct count of customers who have placed atleast one order : {mapped_rdd_q3.count()} ')

Distinct count of customers who have placed atleast one order : 12405 


In [21]:
"quetion 4 : Get the list of customers who have max no of CLOSED orders"

'quetion 4 : Get the list of customers who have max no of CLOSED orders'

In [32]:
mapped_rdd_q4 = base_rdd.map(lambda x:(x.split(",")))

In [35]:
closed_rdd = mapped_rdd_q4.filter(lambda x:x[3] == 'CLOSED')

In [36]:
closed_rdd.take(5)

[['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED'],
 ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED'],
 ['12', '2013-07-25 00:00:00.0', '1837', 'CLOSED'],
 ['18', '2013-07-25 00:00:00.0', '1205', 'CLOSED'],
 ['24', '2013-07-25 00:00:00.0', '11441', 'CLOSED']]

In [43]:
mapped_closed_rdd = mapped_rdd_q4.map(lambda x:(x[2],1))

In [44]:
mapped_closed_rdd.take(5)

[('11599', 1), ('256', 1), ('12111', 1), ('8827', 1), ('11318', 1)]

In [45]:
reduced_mapped_closed_rdd = mapped_closed_rdd.reduceByKey(lambda x,y:x+y)

In [48]:
sorted_reduced_mapped_closed_rdd = reduced_mapped_closed_rdd.sortBy(lambda x:x[1], False)

In [49]:
sorted_reduced_mapped_closed_rdd.take(5)

[('5897', 20000),
 ('6316', 20000),
 ('569', 20000),
 ('12431', 20000),
 ('4320', 18750)]