**Step 1: Setting up the Spark Environment**  

Ideally we use some cloud technology to load data directly onto the machine

1. Deploying a Spark Cluster
2. Store the data in HDFS
- it requires downloading data onto to the local machine using
> curl -L -o ~/Downloads/brazilian-ecommerce.zip\\ https://www.kaggle.com/api/v1/datasets/download/olistbr/brazilian-ecommerce

- then moving that data to hadoop cluster using 
> hadoop put
3. Lastly we go on with pyspark to run queries and run analysis


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('OlistData').getOrCreate()

25/05/21 23:47:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
spark

In [6]:
# accessing our files on hadoop
!hadoop fs -ls /data/olist

Found 9 items
-rw-r--r--   2 itswaqas14 hadoop    9033957 2025-05-21 23:39 /data/olist/olist_customers_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop   61273883 2025-05-21 23:39 /data/olist/olist_geolocation_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop   15438671 2025-05-21 23:39 /data/olist/olist_order_items_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop    5777138 2025-05-21 23:39 /data/olist/olist_order_payments_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop   14451670 2025-05-21 23:39 /data/olist/olist_order_reviews_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop   17654914 2025-05-21 23:39 /data/olist/olist_orders_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop    2379446 2025-05-21 23:39 /data/olist/olist_products_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop     174703 2025-05-21 23:39 /data/olist/olist_sellers_dataset.csv
-rw-r--r--   2 itswaqas14 hadoop       2613 2025-05-21 23:39 /data/olist/product_category_name_translation.csv


In [8]:
hdfs_path = '/data/olist/'

In [9]:
customers_df = spark.read.csv(hdfs_path + 'olist_customers_dataset.csv', inferSchema='true', header='true')

                                                                                

In [10]:
customers_df.show(10)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
|879864dab9bc30475...|4c93744516667ad3b...|                   89254|      jaragua do sul|            SC|
|fd826e7cf63160e53...|addec96d2e059c80c...|            

In [12]:
geolocation_df = spark.read.csv(hdfs_path + 'olist_geolocation_dataset.csv', inferSchema='true', header='true')
order_items_df = spark.read.csv(hdfs_path + 'olist_order_items_dataset.csv', inferSchema='true', header='true')
order_payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv', inferSchema='true', header='true')
order_reviews_df = spark.read.csv(hdfs_path + 'olist_order_reviews_dataset.csv', inferSchema='true', header='true')
orders_df = spark.read.csv(hdfs_path + 'olist_orders_dataset.csv', inferSchema='true', header='true')
products_df = spark.read.csv(hdfs_path + 'olist_products_dataset.csv', inferSchema='true', header='true')
sellers_df = spark.read.csv(hdfs_path + 'olist_sellers_dataset.csv', inferSchema='true', header='true')
product_category_translation_df = spark.read.csv(hdfs_path + 'product_category_name_translation.csv', inferSchema='true', header='true')

                                                                                

In [13]:
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [14]:
orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [17]:
# checking for data leakage
print(f'Customers : {customers_df.count()} rows')
print(f'Orders : {orders_df.count()} rows')
print(f'Products : {products_df.count()} rows')
print(f'Sellers : {sellers_df.count()} rows')

# cross checked the number of rows on the original data source(kaggle) and verified if both are same

Customers : 99441 rows
Orders : 99441 rows
Products : 32951 rows
Sellers : 3095 rows


## Figuring out NULL or duplicate values for each major dataset

### null or empty values

In [19]:
# checking NULL or duplicate

from pyspark.sql.functions import col, when, count

customers_df.select([
    count(when((col(c).isNull()) | (col(c) == 0), 1)).alias(c)
    for c in customers_df.columns
]).show()


+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+



In [23]:
orders_df.select([
    count(when((col(c).isNull()), 1)).alias(c)
    for c in orders_df.columns
]).show()

+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|       0|          0|           0|                       0|              160|                        1783|                         2965|                            0|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+



In [25]:
sellers_df.select([
    count(when((col(c).isNull()), 1)).alias(c)
    for c in sellers_df.columns
]).show()

+---------+----------------------+-----------+------------+
|seller_id|seller_zip_code_prefix|seller_city|seller_state|
+---------+----------------------+-----------+------------+
|        0|                     0|          0|           0|
+---------+----------------------+-----------+------------+



In [26]:
products_df.select([
    count(when((col(c).isNull()), 1)).alias(c)
    for c in products_df.columns
]).show()

+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|         0|                  610|                610|                       610|               610|               2|                2|                2|               2|
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+



### duplicate values

In [27]:
customers_df.groupBy('customer_id').count().filter('count>1').show()



+-----------+-----+
|customer_id|count|
+-----------+-----+
+-----------+-----+



                                                                                

In [28]:
orders_df.groupBy('order_id').count().filter('count>1').show()

+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



In [29]:
products_df.groupBy('product_id').count().filter('count>1').show()

+----------+-----+
|product_id|count|
+----------+-----+
+----------+-----+



In [30]:
sellers_df.groupBy('seller_id').count().filter('count>1').show()

+---------+-----+
|seller_id|count|
+---------+-----+
+---------+-----+



## Doing EDA

In [35]:
# customer distribution by state
customers_df.groupBy('customer_state').count().orderBy('count', ascending = False).show()

+--------------+-----+
|customer_state|count|
+--------------+-----+
|            SP|41746|
|            RJ|12852|
|            MG|11635|
|            RS| 5466|
|            PR| 5045|
|            SC| 3637|
|            BA| 3380|
|            DF| 2140|
|            ES| 2033|
|            GO| 2020|
|            PE| 1652|
|            CE| 1336|
|            PA|  975|
|            MT|  907|
|            MA|  747|
|            MS|  715|
|            PB|  536|
|            PI|  495|
|            RN|  485|
|            AL|  413|
+--------------+-----+
only showing top 20 rows



In [37]:
# seller distribution by state
sellers_df.groupBy('seller_state').count().orderBy('count', ascending = False).show(5)

+------------+-----+
|seller_state|count|
+------------+-----+
|          SP| 1849|
|          PR|  349|
|          MG|  244|
|          SC|  190|
|          RJ|  171|
+------------+-----+
only showing top 5 rows



In [40]:
# status of order
orders_df.groupBy('order_status').count().orderBy('count', ascending = False).show()

+------------+-----+
|order_status|count|
+------------+-----+
|   delivered|96478|
|     shipped| 1107|
|    canceled|  625|
| unavailable|  609|
|    invoiced|  314|
|  processing|  301|
|     created|    5|
|    approved|    2|
+------------+-----+



In [39]:
# top 5 product categories
products_df.groupBy('product_category_name').count().orderBy('count', ascending = False).show(5)

+---------------------+-----+
|product_category_name|count|
+---------------------+-----+
|      cama_mesa_banho| 3029|
|        esporte_lazer| 2867|
|     moveis_decoracao| 2657|
|         beleza_saude| 2444|
| utilidades_domest...| 2335|
+---------------------+-----+
only showing top 5 rows



In [41]:
# check most popular payment type
order_payments_df.groupBy('payment_type').count().orderBy('count', ascending = False).show(5)

+------------+-----+
|payment_type|count|
+------------+-----+
| credit_card|76795|
|      boleto|19784|
|     voucher| 5775|
|  debit_card| 1529|
| not_defined|    3|
+------------+-----+



In [53]:
# check product with highest order
from pyspark.sql.functions import sum

most_ordered_products = order_items_df.groupBy('product_id').count().orderBy('count', ascending = False).show(10)

+--------------------+-----+
|          product_id|count|
+--------------------+-----+
|aca2eb7d00ea1a7b8...|  527|
|99a4788cb24856965...|  488|
|422879e10f4668299...|  484|
|389d119b48cf3043d...|  392|
|368c6c730842d7801...|  388|
|53759a2ecddad2bb8...|  373|
|d1c427060a0f73f6b...|  343|
|53b36df67ebb7c415...|  323|
|154e7e31ebfa09220...|  281|
|3dd2a17168ec895c7...|  274|
+--------------------+-----+
only showing top 10 rows



In [46]:
# check product with highest sales
from pyspark.sql.functions import sum

top_revenue_products = order_items_df.groupBy('product_id').agg(sum('price').alias('total_sales'))
top_revenue_products.orderBy('total_sales', ascending = False).show(10)

+--------------------+------------------+
|          product_id|       total_sales|
+--------------------+------------------+
|bb50f2e236e5eea01...|           63885.0|
|6cdd53843498f9289...| 54730.20000000005|
|d6160fb7873f18409...|48899.340000000004|
|d1c427060a0f73f6b...| 47214.51000000006|
|99a4788cb24856965...|43025.560000000085|
|3dd2a17168ec895c7...| 41082.60000000005|
|25c38557cf793876c...| 38907.32000000001|
|5f504b3a1c75b73d6...|37733.899999999994|
|53b36df67ebb7c415...| 37683.42000000001|
|aca2eb7d00ea1a7b8...| 37608.90000000007|
+--------------------+------------------+
only showing top 10 rows



In [54]:
# delivery time (from purchase date to actual delivery date)
from pyspark.sql.functions import datediff

orders_df = orders_df.withColumn('delivery_days', datediff(col('order_delivered_customer_date'),col('order_purchase_timestamp')))
orders_df.orderBy('delivery_days', ascending = False).show(10)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|delivery_days|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+
|ca07593549f1816d2...|75683a92331068e2d...|   delivered|     2017-02-21 23:31:27|2017-02-23 02:35:15|         2017-03-08 13:47:46|          2017-09-19 14:36:39|          2017-03-22 00:00:00|          210|
|1b3190b2dfa9d789e...|d306426abe5fca15e...|   delivered|     2018-02-23 14:57:35|2018-02-23 15:16:14|         2018-02-26 18:49:07|          2018-09-19 23:24:07|          2018-03-15

In [55]:
# delay time (between actual and expected delivery date)

orders_df = orders_df.withColumn('delay', datediff(col('order_delivered_customer_date'),col('order_purchase_timestamp')))
orders_df.orderBy('delay', ascending = False).show(10)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-----+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|delivery_days|delay|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-----+
|ca07593549f1816d2...|75683a92331068e2d...|   delivered|     2017-02-21 23:31:27|2017-02-23 02:35:15|         2017-03-08 13:47:46|          2017-09-19 14:36:39|          2017-03-22 00:00:00|          210|  210|
|1b3190b2dfa9d789e...|d306426abe5fca15e...|   delivered|     2018-02-23 14:57:35|2018-02-23 15:16:14|         2018-02-26 18:49:07|          2018-09-19 23:24

In [64]:
# montlhy shopping trend
from pyspark.sql.functions import date_format
orders_df = orders_df.withColumn("month", date_format(col("order_purchase_timestamp"), "MM"))
orders_df.groupBy("month").count().orderBy(col('month')).show(12)

+-----+-----+
|month|count|
+-----+-----+
|   01| 8069|
|   02| 8508|
|   03| 9893|
|   04| 9343|
|   05|10573|
|   06| 9412|
|   07|10318|
|   08|10843|
|   09| 4305|
|   10| 4959|
|   11| 7544|
|   12| 5674|
+-----+-----+



In [67]:
# customer base from each state
from pyspark.sql.functions import countDistinct
customers_df.groupBy('customer_state').agg(countDistinct('customer_id').alias('unique_customers')).orderBy('unique_customers', ascending = False).show()

+--------------+----------------+
|customer_state|unique_customers|
+--------------+----------------+
|            SP|           41746|
|            RJ|           12852|
|            MG|           11635|
|            RS|            5466|
|            PR|            5045|
|            SC|            3637|
|            BA|            3380|
|            DF|            2140|
|            ES|            2033|
|            GO|            2020|
|            PE|            1652|
|            CE|            1336|
|            PA|             975|
|            MT|             907|
|            MA|             747|
|            MS|             715|
|            PB|             536|
|            PI|             495|
|            RN|             485|
|            AL|             413|
+--------------+----------------+
only showing top 20 rows



In [70]:
# avg payment by payment type
from pyspark.sql.functions import avg

order_payments_df.groupBy('payment_type').agg(avg('payment_value')).orderBy('avg(payment_value)', ascending = False).show()

+------------+------------------+
|payment_type|avg(payment_value)|
+------------+------------------+
| credit_card|163.31902063935814|
|      boleto| 145.0344354023453|
|  debit_card|142.57017004578168|
|     voucher|  65.7033541125542|
| not_defined|               0.0|
+------------+------------------+



In [71]:
customer_order_counts = orders_df.groupBy("customer_id") \
                                 .agg(count("*").alias("num_orders"))

# classify repeat vs one-time
customer_order_counts = customer_order_counts.withColumn(
    "customer_type",
    when(col("num_orders") == 1, "one-time").otherwise("repeat")
)

# count in each type
customer_order_counts.groupBy("customer_type") \
                     .count() \
                     .show()

+-------------+-----+
|customer_type|count|
+-------------+-----+
|     one-time|99441|
+-------------+-----+



In [75]:
# join review and items
review_items = order_reviews_df.join(order_items_df, on="order_id", how="inner")

# join that with products
review_items = review_items.join(products_df, on="product_id", how="inner")

# join again with category translation
review_items = review_items.join(product_category_translation_df, on="product_category_name", how="left")

review_items.groupBy("product_category_name_english") \
            .agg(avg("review_score").alias("avg_score")) \
            .orderBy("avg_score", ascending=False) \
            .show(5)


+-----------------------------+-----------------+
|product_category_name_english|        avg_score|
+-----------------------------+-----------------+
|            cds_dvds_musicals|4.642857142857143|
|         fashion_childrens...|              4.5|
|         books_general_int...|4.446265938069216|
|         costruction_tools...|4.444444444444445|
|                      flowers|4.419354838709677|
+-----------------------------+-----------------+
only showing top 5 rows

