# Sales Data

Siny P Raphel

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, to_timestamp, concat_ws, collect_list, col, count


In [0]:
spark = SparkSession.builder.master('local').appName('sales').getOrCreate()

In [0]:
sales = spark.read.csv('/FileStore/tables/sales_data-1.csv', inferSchema=True, header=True).drop('_c0')
sales.show(2)

+--------+--------------------+----------------+----------+--------------+--------------------+-----+------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|Month|        City|
+--------+--------------------+----------------+----------+--------------+--------------------+-----+------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|    4|Dallas  (TX)|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|    4|Boston  (MA)|
+--------+--------------------+----------------+----------+--------------+--------------------+-----+------------+
only showing top 2 rows



In [0]:
sales.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- City: string (nullable = true)



In [0]:
sales = sales.withColumn('Date', to_timestamp('Order Date', 'MM/dd/yy HH:mm')).drop('Order Date')
sales =sales.withColumn('hour', hour('Date'))
sales = sales.withColumn('sales', sales['Price Each'] * sales['Quantity Ordered'])
sales.show(5)

+--------+--------------------+----------------+----------+--------------------+-----+-----------------+-------------------+----+-----+
|Order ID|             Product|Quantity Ordered|Price Each|    Purchase Address|Month|             City|               Date|hour|sales|
+--------+--------------------+----------------+----------+--------------------+-----+-----------------+-------------------+----+-----+
|  176558|USB-C Charging Cable|               2|     11.95|917 1st St, Dalla...|    4|     Dallas  (TX)|2019-04-19 08:46:00|   8| 23.9|
|  176559|Bose SoundSport H...|               1|     99.99|682 Chestnut St, ...|    4|     Boston  (MA)|2019-04-07 22:30:00|  22|99.99|
|  176560|        Google Phone|               1|     600.0|669 Spruce St, Lo...|    4|Los Angeles  (CA)|2019-04-12 14:38:00|  14|600.0|
|  176560|    Wired Headphones|               1|     11.99|669 Spruce St, Lo...|    4|Los Angeles  (CA)|2019-04-12 14:38:00|  14|11.99|
|  176561|    Wired Headphones|               1|

In [0]:
sales.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- sales: double (nullable = true)



•    What was the best month for sales? How much was earned that month?

In [0]:
sales.groupBy('month').sum('sales').orderBy('sum(sales)', ascending=False).select('month').show(1)

+-----+
|month|
+-----+
|   12|
+-----+
only showing top 1 row



* What city sold the most product?

In [0]:
sales.groupBy('City').sum('Quantity Ordered').orderBy('sum(Quantity Ordered)', ascending=False).select('City').show(1)

+-------------------+
|               City|
+-------------------+
|San Francisco  (CA)|
+-------------------+
only showing top 1 row



•    What time should we display advertisemens to maximize the likelihood of customer’s buying product?

In [0]:
sales.groupBy('hour').sum('Quantity Ordered').orderBy('sum(Quantity Ordered)', ascending=False).show(1)

+----+---------------------+
|hour|sum(Quantity Ordered)|
+----+---------------------+
|  19|                14470|
+----+---------------------+
only showing top 1 row



•    What products are most often sold together?
•    What product sold the most? Why do you think it sold the most

In [0]:
grouped_prd = sales.groupBy('Order ID').agg(count('Order ID').alias('count'), concat_ws(',', collect_list(sales.Product)).alias('prd'))
grouped_prd.show(10)

+--------+-----+--------------------+
|Order ID|count|                 prd|
+--------+-----+--------------------+
|  141234|    1|              iPhone|
|  141239|    1|AAA Batteries (4-...|
|  141241|    1|USB-C Charging Cable|
|  141242|    1|Bose SoundSport H...|
|  141243|    1|Apple Airpods Hea...|
|  141246|    1|AAA Batteries (4-...|
|  141247|    1|    27in FHD Monitor|
|  141250|    1|     Vareebadd Phone|
|  141251|    1|Apple Airpods Hea...|
|  141252|    1|USB-C Charging Cable|
+--------+-----+--------------------+
only showing top 10 rows



In [0]:
grouped_prd.where(grouped_prd['count'] > 1).groupBy('prd').count().orderBy('count', ascending=False).select('prd').show(1, False)

+-------------------------------+
|prd                            |
+-------------------------------+
|iPhone,Lightning Charging Cable|
+-------------------------------+
only showing top 1 row



Alternate way without string conversion

In [0]:
sales.groupBy('Order ID').agg(count('Order ID'), collect_list('Product').alias('coll_prd')).where(col('count(Order ID)') > 1).groupBy('coll_prd').count().orderBy('count', ascending=False).show(1, False)

+----------------------------------+-----+
|coll_prd                          |count|
+----------------------------------+-----+
|[iPhone, Lightning Charging Cable]|882  |
+----------------------------------+-----+
only showing top 1 row



In [0]:
sales.where(sales['Order ID'] == 160873).show(2)

+--------+--------------------+----------------+----------+--------------------+-----+-------------------+-------------------+----+-----+
|Order ID|             Product|Quantity Ordered|Price Each|    Purchase Address|Month|               City|               Date|hour|sales|
+--------+--------------------+----------------+----------+--------------------+-----+-------------------+-------------------+----+-----+
|  160873|              iPhone|               1|     700.0|702 River St, San...|    2|San Francisco  (CA)|2019-02-10 10:24:00|  10|700.0|
|  160873|Lightning Chargin...|               1|     14.95|702 River St, San...|    2|San Francisco  (CA)|2019-02-10 10:24:00|  10|14.95|
+--------+--------------------+----------------+----------+--------------------+-----+-------------------+-------------------+----+-----+
only showing top 2 rows



•    What product sold the most? Why do you think it sold the most

In [0]:
sales.groupBy('Product').sum('Quantity Ordered').orderBy('sum(Quantity Ordered)', ascending=False).select('Product').show(1, False)

+----------------------+
|Product               |
+----------------------+
|AAA Batteries (4-pack)|
+----------------------+
only showing top 1 row



In [0]:
from pyspark.sql.functions import collect_set

In [0]:
sales.select(collect_set('Product')).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|collect_set(Product)                                                                                                                                                                                                                                                                                                                                                            |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------