In [1]:
from pyspark.sql import SparkSession
import getpass
# The current OS user name selects the per-user warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that targets the YARN master.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Load every file under the orders directory as an RDD of raw text lines.
# The RDD is cached because later cells run several independent actions and
# pipelines on it (take, count, map, filter, distinct); without persistence
# each of those actions would re-read the input files from storage.
orders_rdd = spark.sparkContext.textFile("/public/trendytech/retail_db/orders/*").cache()

In [3]:
# Peek at the first five raw lines to confirm the comma-separated record layout.
orders_rdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [4]:
# Pair each order line with a count of 1, keyed by its fourth comma-separated
# field (index 3), which holds the order status.
map_rdd = orders_rdd.map(lambda line: (line.split(",")[3], 1))

In [5]:
# Inspect the first ten (status, 1) pairs.
map_rdd.take(10)

[('CLOSED', 1),
 ('PENDING_PAYMENT', 1),
 ('COMPLETE', 1),
 ('CLOSED', 1),
 ('COMPLETE', 1),
 ('COMPLETE', 1),
 ('COMPLETE', 1),
 ('PROCESSING', 1),
 ('PENDING_PAYMENT', 1),
 ('PENDING_PAYMENT', 1)]

In [6]:
# Sum the per-line 1s for each status key, giving the number of orders per status.
reduced_rdd = map_rdd.reduceByKey(lambda running_total, count: running_total + count)

In [7]:
# Bring all (status, count) pairs back to the driver; one pair per distinct status.
reduced_rdd.collect()

[('CLOSED', 7556),
 ('CANCELED', 1428),
 ('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('SUSPECTED_FRAUD', 1558),
 ('PENDING', 7610),
 ('ON_HOLD', 3798),
 ('PROCESSING', 8275),
 ('PAYMENT_REVIEW', 729)]

In [8]:
# Order the (status, count) pairs by their count, smallest first.
reduced_sorted = reduced_rdd.sortBy(lambda status_count: status_count[1], ascending=True)

In [9]:
# Show the status counts in ascending order of count.
reduced_sorted.collect()

[('PAYMENT_REVIEW', 729),
 ('CANCELED', 1428),
 ('SUSPECTED_FRAUD', 1558),
 ('ON_HOLD', 3798),
 ('CLOSED', 7556),
 ('PENDING', 7610),
 ('PROCESSING', 8275),
 ('PENDING_PAYMENT', 15030),
 ('COMPLETE', 22899)]

In [10]:
# Re-sort the (status, count) pairs by count, largest first.
# NOTE: this rebinds `reduced_sorted`, replacing the ascending version.
reduced_sorted = reduced_rdd.sortBy(lambda status_count: status_count[1], ascending=False)

In [11]:
# Show the status counts in descending order of count.
reduced_sorted.collect()

[('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('PROCESSING', 8275),
 ('PENDING', 7610),
 ('CLOSED', 7556),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558),
 ('CANCELED', 1428),
 ('PAYMENT_REVIEW', 729)]

In [12]:
# Pair each order line with a count of 1, keyed by its third comma-separated
# field (index 2), which this notebook treats as the customer id. The key stays
# a string; no numeric conversion is applied.
customers_mapped = orders_rdd.map(lambda line: (line.split(",")[2], 1))

In [13]:
# Inspect the first ten (customer id, 1) pairs.
customers_mapped.take(10)

[('11599', 1),
 ('256', 1),
 ('12111', 1),
 ('8827', 1),
 ('11318', 1),
 ('7130', 1),
 ('4530', 1),
 ('2911', 1),
 ('5657', 1),
 ('5648', 1)]

In [14]:
# Sum the 1s per customer key, giving the number of orders placed by each customer.
# NOTE(review): the variable name is misspelled ("aggregrated"); it is kept as-is
# because other cells reference it by this exact name.
customers_aggregrated = customers_mapped.reduceByKey(
    lambda running_total, count: running_total + count
)

In [15]:
# Inspect ten (customer id, order count) pairs; this RDD is not sorted.
customers_aggregrated.take(10)

[('256', 10),
 ('12111', 6),
 ('11318', 6),
 ('7130', 7),
 ('2911', 6),
 ('5657', 12),
 ('9149', 4),
 ('9842', 7),
 ('7276', 5),
 ('9488', 7)]

In [16]:
# Rank customers by how many orders they placed, most orders first.
customers_sorted = customers_aggregrated.sortBy(
    lambda customer_count: customer_count[1],
    ascending=False,
)

In [17]:
# Top ten customers by order count.
customers_sorted.take(10)

[('6316', 16),
 ('12431', 16),
 ('569', 16),
 ('5897', 16),
 ('5283', 15),
 ('12284', 15),
 ('5654', 15),
 ('221', 15),
 ('4320', 15),
 ('5624', 15)]

In [18]:
# Extract the customer-id field (index 2) from every order line and drop duplicates.
distinct_customers = (
    orders_rdd
    .map(lambda line: line.split(",")[2])
    .distinct()
)

In [19]:
# Number of distinct customer ids that appear in the orders data.
distinct_customers.count()

12405

In [20]:
# Total number of order lines, for comparison with the distinct-customer count.
orders_rdd.count()

68883

In [21]:
# Keep only the order lines whose status field (index 3) is exactly 'CLOSED'.
# Despite the name, the elements are still full raw order lines.
filtered_customers = orders_rdd.filter(lambda line: line.split(",")[3] == 'CLOSED')

In [22]:
# Inspect the first ten CLOSED order lines.
filtered_customers.take(10)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '12,2013-07-25 00:00:00.0,1837,CLOSED',
 '18,2013-07-25 00:00:00.0,1205,CLOSED',
 '24,2013-07-25 00:00:00.0,11441,CLOSED',
 '25,2013-07-25 00:00:00.0,9503,CLOSED',
 '37,2013-07-25 00:00:00.0,5863,CLOSED',
 '51,2013-07-25 00:00:00.0,12271,CLOSED',
 '57,2013-07-25 00:00:00.0,7073,CLOSED',
 '61,2013-07-25 00:00:00.0,4791,CLOSED']

In [23]:
# For the CLOSED orders only, pair each line's customer-id field (index 2) with a count of 1.
mapped_customers = filtered_customers.map(lambda line: (line.split(",")[2], 1))

In [26]:
# Inspect the first ten (customer id, 1) pairs built from CLOSED orders.
mapped_customers.take(10)

[('11599', 1),
 ('8827', 1),
 ('1837', 1),
 ('1205', 1),
 ('11441', 1),
 ('9503', 1),
 ('5863', 1),
 ('12271', 1),
 ('7073', 1),
 ('4791', 1)]

In [27]:
# Sum the 1s per customer key, giving the number of CLOSED orders per customer.
reduced_customers = mapped_customers.reduceByKey(
    lambda running_total, count: running_total + count
)

In [28]:
# Inspect ten (customer id, closed-order count) pairs; this RDD is not sorted.
reduced_customers.take(10)

[('5863', 1),
 ('12271', 2),
 ('7073', 1),
 ('3065', 2),
 ('5116', 2),
 ('8763', 1),
 ('10604', 2),
 ('16', 1),
 ('9055', 3),
 ('10372', 3)]

In [None]:
# Rank customers by their number of CLOSED orders, most first.
# sortBy is lazy: nothing is computed until an action (e.g. take/collect) is
# called on `sorted_customers`.
sorted_customers = reduced_customers.sortBy(
    lambda customer_count: customer_count[1],
    ascending=False,
)