In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Olist').getOrCreate()

25/05/24 21:49:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
hdfs_path = '/data/olist/'

In [4]:
customers_df = spark.read.csv(hdfs_path + 'olist_customers_dataset.csv', inferSchema='true', header='true')
geolocation_df = spark.read.csv(hdfs_path + 'olist_geolocation_dataset.csv', inferSchema='true', header='true')
order_items_df = spark.read.csv(hdfs_path + 'olist_order_items_dataset.csv', inferSchema='true', header='true')
order_payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv', inferSchema='true', header='true')
order_reviews_df = spark.read.csv(hdfs_path + 'olist_order_reviews_dataset.csv', inferSchema='true', header='true')
orders_df = spark.read.csv(hdfs_path + 'olist_orders_dataset.csv', inferSchema='true', header='true')
products_df = spark.read.csv(hdfs_path + 'olist_products_dataset.csv', inferSchema='true', header='true')
sellers_df = spark.read.csv(hdfs_path + 'olist_sellers_dataset.csv', inferSchema='true', header='true')
product_category_translation_df = spark.read.csv(hdfs_path + 'product_category_name_translation.csv', inferSchema='true', header='true')

                                                                                

In [5]:
# caching frequently used datasets for faster access & better performance

orders_df.cache()
products_df.cache()
customers_df.cache()
sellers_df.cache()
order_items_df.cache()

DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double]

In [6]:
joined_df = orders_df.join(order_items_df, 'order_id', 'inner')

In [7]:
joined_df = joined_df.join(products_df, 'product_id', 'inner')

In [8]:
joined_df = joined_df.join(sellers_df, 'seller_id', 'inner')

In [9]:
joined_df = joined_df.join(customers_df, 'customer_id', 'inner')

In [10]:
# left joining geolocation data
# because dont want to lose data without geolocation
joined_df = joined_df.join(geolocation_df, joined_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix, 'left')

In [12]:
# left joining reviews and payments
joined_df = joined_df.join(order_reviews_df, 'order_id', 'left')
complete_df = joined_df.join(order_payments_df, 'order_id', 'left')

In [13]:
complete_df.cache()

25/05/23 23:16:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [14]:
complete_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [29]:
from pyspark.sql.functions import *

# total revenue per seller
seller_revenue_df = complete_df.groupBy('seller_id').agg(sum('price').alias('total_profit')).orderBy('total_profit', ascending = False)
seller_revenue_df.show(5)



+--------------------+--------------------+
|           seller_id|        total_profit|
+--------------------+--------------------+
|4869f7a5dfa277a7d...|3.6138717319999926E7|
|53243585a1d6dc264...| 3.429159294999998E7|
|4a3ca9315b744ce9f...|  3.37595708400001E7|
|7c67e1448b00f6e96...|3.2282321789999772E7|
|fa1c13f2614d7b5c4...|3.0139386309999976E7|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [34]:
# Total Items Orders Per Custoemr
customer_total_items = complete_df.groupBy('customer_id').agg(count('order_item_id').alias('total_items_ordered')).orderBy('total_items_ordered', ascending = False)

In [35]:
customer_total_items.show(5)



+--------------------+-------------------+
|         customer_id|total_items_ordered|
+--------------------+-------------------+
|351e40989da90e704...|              11427|
|50920f8cd0681fd86...|              10752|
|9b43e2a62de9bab3a...|               8556|
|270c23a11d024a44c...|               8001|
|5c87184371002d49e...|               6876|
+--------------------+-------------------+
only showing top 5 rows



                                                                                

In [40]:
# Total  Orders Per Custoemr
customer_total_items = complete_df.groupBy('customer_id').agg(count('order_id').alias('total_orders')).orderBy('total_orders', ascending = False)

In [41]:
customer_total_items.show(5)



+--------------------+------------+
|         customer_id|total_orders|
+--------------------+------------+
|351e40989da90e704...|       11427|
|50920f8cd0681fd86...|       10752|
|9b43e2a62de9bab3a...|        8556|
|270c23a11d024a44c...|        8001|
|5c87184371002d49e...|        6876|
+--------------------+------------+
only showing top 5 rows



                                                                                

In [42]:
# Average Review Score Per Seller
seller_avg_review_rating = complete_df.groupBy('seller_id').agg(avg('review_score').alias('review_score')).orderBy(desc('review_score'))

In [43]:
seller_avg_review_rating.show(5)



+--------------------+------------+
|           seller_id|review_score|
+--------------------+------------+
|417a1e6c7321084d2...|         5.0|
|48e5ee06fb2dc74df...|         5.0|
|b5dd7151a92ccaaa2...|         5.0|
|a663d9c3797e90eac...|         5.0|
|10264f60a8f0a4d2f...|         5.0|
+--------------------+------------+
only showing top 5 rows



                                                                                

In [44]:
# Most Sold Products ( Top 10 )
most_sold_prod = complete_df.groupBy('product_id').agg(count('order_id').alias('count')).orderBy(desc('count'))

In [45]:
most_sold_prod.show(5)



+--------------------+-----+
|          product_id|count|
+--------------------+-----+
|aca2eb7d00ea1a7b8...|86740|
|422879e10f4668299...|81110|
|99a4788cb24856965...|78775|
|389d119b48cf3043d...|60248|
|d1c427060a0f73f6b...|59274|
+--------------------+-----+
only showing top 5 rows



                                                                                

In [52]:
# Top Customers By Spending
top_spending_cust = complete_df.groupBy('customer_id').agg(sum('payment_value').alias('total_spending')).orderBy(desc('total_spending'))

In [47]:
top_spending_cust.show(5)



+--------------------+--------------+
|         customer_id|total_spending|
+--------------------+--------------+
|d3e82ccec3cb5f956...|     6662844.0|
|df55c14d1476a9a34...|     3565657.0|
|fe5113a38e3575c04...|     3293604.0|
|ec5b2ba62e5743423...|     2556120.0|
|63b964e79dee32a35...|     2501664.0|
+--------------------+--------------+
only showing top 5 rows



                                                                                

In [53]:
top_spending_cust.show(5)



+--------------------+--------------------+
|         customer_id|      total_spending|
+--------------------+--------------------+
|1ff773612ab8934db...| 1.756825199999893E7|
|05455dfa7cd02f13d...|1.3282083359999327E7|
|ec5b2ba62e5743423...|1.0388528640000112E7|
|0c792d32a3251b4f6...|   8254681.600000529|
|78fc46047c4a639e8...|   7488519.999999339|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

## Optimizing joins using broadcast

In [6]:
from pyspark.sql.functions import *

In [7]:
joined_df = orders_df.join(order_items_df, 'order_id', 'inner')

In [8]:
joined_df = joined_df.join(products_df, 'product_id', 'inner')

In [9]:
joined_df = joined_df.join(broadcast(sellers_df), 'seller_id', 'inner')

In [10]:
joined_df = joined_df.join(customers_df, 'customer_id', 'inner')

In [12]:
# join geolocation using broadcast
joined_df = joined_df.join(broadcast(geolocation_df), joined_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix, 'left')

In [13]:
# join reviews using broadcast
joined_df = joined_df.join(broadcast(order_reviews_df), 'order_id', 'left')

In [14]:
# join payments
complete_df = joined_df.join(order_payments_df, 'order_id', 'left')

using broadcast helps in making the joins faster and more efficient
when one of the dataset is small, so instead of shuffling everything just the small dataset is sent across the network

In [15]:
complete_df.cache()

25/05/24 21:57:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

### Utilzing window function

In [16]:
from pyspark.sql.window import Window

In [19]:
# calculating top 5 expensive products per seller
window_spec = Window.partitionBy('seller_id').orderBy(desc('price'))

In [25]:
# top 5 expensive products per seller
top_seller_products_df = complete_df.withColumn('rank', rank().over(window_spec)).filter(col('rank') <=5)
top_seller_products_df.select('seller_id', 'price', 'rank').show(5)



+--------------------+-----+----+
|           seller_id|price|rank|
+--------------------+-----+----+
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
+--------------------+-----+----+
only showing top 5 rows



                                                                                

In [36]:
# most recent order per customer
window_spec = Window.partitionBy('customer_id').orderBy(desc('order_purchase_timestamp'))

recent_orders_df = orders_df.withColumn('row_num', row_number().over(window_spec))\
                            .filter(col('row_num') <= 2)

recent_orders_df.select('customer_id', 'order_id', 'order_purchase_timestamp').show(5)

+--------------------+--------------------+------------------------+
|         customer_id|            order_id|order_purchase_timestamp|
+--------------------+--------------------+------------------------+
|00012a2ce6f8dcda2...|5f79b5b0931d63f1a...|     2017-11-14 16:08:26|
|000161a058600d590...|a44895d095d7e0702...|     2017-07-16 09:40:32|
|000379cdec6255224...|0ab7fb08086d4af91...|     2018-04-02 13:42:17|
|0004164d20a9e969a...|cd3558a10d854487b...|     2017-04-12 08:35:12|
|000419c5494106c30...|07f6c3baf9ac86865...|     2018-03-02 17:47:40|
+--------------------+--------------------+------------------------+
only showing top 5 rows



## Calcuating complex detailed metrics

### **Customers**

In [98]:
# total revenue & Average Order Value per Customer

customer_spending_df = complete_df.groupBy('customer_id').agg(
count('order_id').alias('total_orders'),
sum('price').alias('total_spent'),
round(avg('price'),2).alias('AOV')).orderBy(desc('total_spent'))

customer spending habits are helpful in deciding various promotions for specific customers

In [99]:
customer_spending_df.show(5)



+--------------------+------------+-----------+------+
|         customer_id|total_orders|total_spent|   AOV|
+--------------------+------------+-----------+------+
|d3e82ccec3cb5f956...|        6876|  6662844.0| 969.0|
|df55c14d1476a9a34...|         743|  3565657.0|4799.0|
|fe5113a38e3575c04...|        2292|  3293604.0|1437.0|
|ec5b2ba62e5743423...|        1428|  2556120.0|1790.0|
|63b964e79dee32a35...|        6072|  2501664.0| 412.0|
+--------------------+------------+-----------+------+
only showing top 5 rows



                                                                                

### **Sellers**

In [42]:
# revenue, avg review, order count

seller_performance_df = complete_df.groupBy('seller_id').agg(
count('order_id').alias('total_orders'),
sum('price').alias('total_revenue'),
round(avg('review_score'),2).alias('avg_review'),
round(stddev('price'),2).alias('price_variability')).orderBy(desc('total_revenue'))

In [43]:
seller_performance_df.show(5)



+--------------------+------------+--------------------+----------+-----------------+
|           seller_id|total_orders|       total_revenue|avg_review|price_variability|
+--------------------+------------+--------------------+----------+-----------------+
|4869f7a5dfa277a7d...|      184587| 3.613871731999314E7|      4.09|           111.65|
|53243585a1d6dc264...|       54514|3.4291592950000696E7|      4.12|           499.65|
|4a3ca9315b744ce9f...|      330661| 3.375957084001202E7|      3.77|            59.37|
|7c67e1448b00f6e96...|      233306|3.2282321790021457E7|      3.42|            50.39|
|fa1c13f2614d7b5c4...|       87686|3.0139386310006626E7|      4.38|            307.7|
+--------------------+------------+--------------------+----------+-----------------+
only showing top 5 rows



                                                                                

total orders give the sampele size, while total revenue is used to establish the importance of seller and avg review gives us the idea about seller, with price variability letting us know if the seller offers frequent dicsount 

### **Product**

In [44]:
# purchase count, total purchase value, avg price of the product, variability in price of the product, list of sellers offering the product

product_performance_df = complete_df.groupBy('product_id').agg(
count('order_id').alias('total_sales'),
sum('price').alias('total_revenue'),
round(avg('price'),2).alias('avg_price'),
round(stddev('price'),2).alias('price_variability'),
collect_set('seller_id').alias('unique_sellers')).orderBy(desc('total_sales'))

In [45]:
product_performance_df.show(5)



+--------------------+-----------+-----------------+---------+-----------------+--------------------+
|          product_id|total_sales|    total_revenue|avg_price|price_variability|      unique_sellers|
+--------------------+-----------+-----------------+---------+-----------------+--------------------+
|aca2eb7d00ea1a7b8...|      86740|6164630.299996104|    71.07|             3.17|[955fee9216a65b61...|
|422879e10f4668299...|      81110|4442791.509997975|    54.77|             4.46|[1f50f920176fa81d...|
|99a4788cb24856965...|      78775|6921762.709996365|    87.87|             4.08|[4a3ca9315b744ce9...|
|389d119b48cf3043d...|      60248| 3280533.12999878|    54.45|             4.37|[1f50f920176fa81d...|
|d1c427060a0f73f6b...|      59274|8220103.330003085|   138.68|            16.58|[a1043bafd471dff5...|
+--------------------+-----------+-----------------+---------+-----------------+--------------------+
only showing top 5 rows



                                                                                

### **Monthly**

In [60]:
monthly_performance_df = complete_df.groupBy(month('order_purchase_timestamp').alias('month')).agg(
count('order_id').alias('total_orders'),
sum('price').alias('total_revenue'),
round(avg('price'),2).alias('avg_order_value'),
median('price').alias('median_order_value'),
min('price').alias('min_order_value'),
max('price').alias('max_order_value'),
round(stddev('price'),2).alias('price_variability')).orderBy('month')

In [61]:
monthly_performance_df.show()



+-----+------------+--------------------+---------------+------------------+---------------+---------------+-----------------+
|month|total_orders|       total_revenue|avg_order_value|median_order_value|min_order_value|max_order_value|price_variability|
+-----+------------+--------------------+---------------+------------------+---------------+---------------+-----------------+
|    1|     1495580|1.7153290149996296E8|         114.69|              78.0|            2.9|         3690.0|           157.84|
|    2|     1551163|1.7878178407014424E8|         115.26|              72.0|           2.99|         6735.0|           167.19|
|    3|     1809467|2.1868116843072027E8|         120.85|             73.87|            4.9|        4099.99|           174.76|
|    4|     1693860|2.1715696913056687E8|          128.2|              79.9|           0.85|         4799.0|            226.5|
|    5|     1918571| 2.400611519711523E8|         125.12|              75.0|            3.5|         6499.0|   

                                                                                

### **Customer Retention Analysis**

In [66]:
customer_retention_df = complete_df.groupBy('customer_id').agg(
min('order_purchase_timestamp').alias('first_order_date'),
max('order_purchase_timestamp').alias('last_order_date'),
count('order_id').alias('total_orders'),
round(avg('price'),2).alias('AOV')).orderBy(desc('total_orders'))

In [71]:
customer_retention_df.show(5)



+--------------------+-------------------+-------------------+------------+-----+
|         customer_id|   first_order_date|    last_order_date|total_orders|  AOV|
+--------------------+-------------------+-------------------+------------+-----+
|bc6063f0f2cde26be...|2018-06-10 21:23:13|2018-06-10 21:23:13|           1| 39.0|
|7e9e8e8df9e2f28df...|2018-05-18 22:36:51|2018-05-18 22:36:51|           1| 69.0|
|52f7baf30ea546558...|2017-05-11 13:22:02|2017-05-11 13:22:02|           1|399.0|
|d5fb71112470ab15d...|2018-03-23 10:56:08|2018-03-23 10:56:08|           1| 99.9|
|42da09831872a4ecc...|2018-07-25 21:29:05|2018-07-25 21:29:05|           1| 76.0|
+--------------------+-------------------+-------------------+------------+-----+
only showing top 5 rows



                                                                                

## **Enriching data**

In [73]:
# creating order status flags
complete_df = complete_df.withColumn('is_delivered', when(col('order_status') == 'delivered', lit(1)).otherwise(lit(0)))\
.withColumn('is_canceled', when(col('order_status') == 'canceled', lit(1)).otherwise(lit(0)))

In [76]:
complete_df.select('order_id', 'order_status','is_delivered','is_canceled').show(10)

+--------------------+------------+------------+-----------+
|            order_id|order_status|is_delivered|is_canceled|
+--------------------+------------+------------+-----------+
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
|00010242fe8c5a6d1...|   delivered|           1|          0|
+--------------------+------------+------------+-----------+
only showing top 10 rows



In [75]:
complete_df.where(complete_df['order_status'] == 'canceled').select('order_id','order_status','is_delivered','is_canceled').show(10)

+--------------------+------------+------------+-----------+
|            order_id|order_status|is_delivered|is_canceled|
+--------------------+------------+------------+-----------+
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
|00310b0c75bb13015...|    canceled|           0|          1|
+--------------------+------------+------------+-----------+
only showing top 10 rows



In [78]:
complete_df = complete_df.withColumn('order_revenue', col('price') + col('freight_value'))
complete_df.select('order_id', 'price', 'freight_value', 'order_revenue').show(5)

+--------------------+-----+-------------+-------------+
|            order_id|price|freight_value|order_revenue|
+--------------------+-----+-------------+-------------+
|00010242fe8c5a6d1...| 58.9|        13.29|        72.19|
|00010242fe8c5a6d1...| 58.9|        13.29|        72.19|
|00010242fe8c5a6d1...| 58.9|        13.29|        72.19|
|00010242fe8c5a6d1...| 58.9|        13.29|        72.19|
|00010242fe8c5a6d1...| 58.9|        13.29|        72.19|
+--------------------+-----+-------------+-------------+
only showing top 5 rows



In [100]:
customer_spending_df = customer_spending_df.withColumn('customer_segment', 
                                                       when(col('AOV') >= 1200, 'High-value')
                                                      .when(col('AOV') <= 700, 'Low-value')
                                                      .otherwise('Medium-value'))

In [101]:
customer_spending_df.show(5)



+--------------------+------------+-----------+------+----------------+
|         customer_id|total_orders|total_spent|   AOV|customer_segment|
+--------------------+------------+-----------+------+----------------+
|d3e82ccec3cb5f956...|        6876|  6662844.0| 969.0|    Medium-value|
|df55c14d1476a9a34...|         743|  3565657.0|4799.0|      High-value|
|fe5113a38e3575c04...|        2292|  3293604.0|1437.0|      High-value|
|ec5b2ba62e5743423...|        1428|  2556120.0|1790.0|      High-value|
|63b964e79dee32a35...|        6072|  2501664.0| 412.0|       Low-value|
+--------------------+------------+-----------+------+----------------+
only showing top 5 rows



                                                                                

integrating this into our main dataset

In [82]:
complete_df = complete_df.join(customer_spending_df.select('customer_id', 'customer_segment'), 'customer_id', how = 'left')
complete_df.show(5)

                                                                                

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-------------+------------+--------------------+------------------------+--------------------+--------------+---------------------------+-------------------+------------------+--------------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+------------+-----------+-------------+----------------+
|         customer_id|            order_id|           seller_id|       

In [85]:
# weekend vs weekday orders

complete_df = complete_df.withColumn('order_day_type',
                                   when(dayofweek('order_purchase_timestamp').isin(1,7), lit('Weekend')).otherwise(lit('Weekday')))
complete_df.select('order_purchase_timestamp','order_day_type').show()

+------------------------+--------------+
|order_purchase_timestamp|order_day_type|
+------------------------+--------------+
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
|     2017-09-13 08:59:02|       Weekday|
+------------------------+--------

In [91]:
complete_df.select('freight_value').orderBy(desc('freight_value')).show()

+-------------+
|freight_value|
+-------------+
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
|       409.68|
+-------------+
only showing top 20 rows



In [103]:
# freight category

freight_df = complete_df.select('customer_id', 'freight_value').withColumn('freight_category', 
                                                       when(col('freight_value') >= 350, 'High-value')
                                                      .when(col('freight_value') <= 50, 'Low-value')
                                                      .otherwise('Medium-value'))

In [104]:
freight_df.show(5)

+--------------------+-------------+----------------+
|         customer_id|freight_value|freight_category|
+--------------------+-------------+----------------+
|3ce436f183e68e078...|        13.29|       Low-value|
|3ce436f183e68e078...|        13.29|       Low-value|
|3ce436f183e68e078...|        13.29|       Low-value|
|3ce436f183e68e078...|        13.29|       Low-value|
|3ce436f183e68e078...|        13.29|       Low-value|
+--------------------+-------------+----------------+
only showing top 5 rows



In [105]:
customer_spending_df = customer_spending_df.join(freight_df.select('customer_id','freight_category'), 'customer_id', how = 'left')
customer_spending_df.show(5)

[Stage 167:>                                                        (0 + 1) / 1]

+--------------------+------------+-----------+------+----------------+----------------+
|         customer_id|total_orders|total_spent|   AOV|customer_segment|freight_category|
+--------------------+------------+-----------+------+----------------+----------------+
|136f8c475ac7abd11...|          14|    2099.86|149.99|       Low-value|    Medium-value|
|136f8c475ac7abd11...|          14|    2099.86|149.99|       Low-value|    Medium-value|
|136f8c475ac7abd11...|          14|    2099.86|149.99|       Low-value|    Medium-value|
|136f8c475ac7abd11...|          14|    2099.86|149.99|       Low-value|    Medium-value|
|136f8c475ac7abd11...|          14|    2099.86|149.99|       Low-value|    Medium-value|
+--------------------+------------+-----------+------+----------------+----------------+
only showing top 5 rows



                                                                                

In [107]:
# Order Volume by Customer State

order_volume_by_state = complete_df.groupBy('customer_state').agg(countDistinct('order_id').alias('total_orders')).orderBy('customer_state')
order_volume_by_state.show()



+--------------+------------+
|customer_state|total_orders|
+--------------+------------+
|            AC|          81|
|            AL|         411|
|            AM|         147|
|            AP|          68|
|            BA|        3358|
|            CE|        1327|
|            DF|        2125|
|            ES|        2025|
|            GO|        2007|
|            MA|         740|
|            MG|       11544|
|            MS|         709|
|            MT|         903|
|            PA|         970|
|            PB|         532|
|            PE|        1648|
|            PI|         493|
|            PR|        4998|
|            RJ|       12762|
|            RN|         482|
+--------------+------------+
only showing top 20 rows



                                                                                

In [108]:
!hadoop fs -mkdir /data/olist/olist-merged/

In [110]:
complete_df.write.mode('overwrite').parquet('/data/olist/olist-merged')

                                                                                