In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName('Olist')\
.getOrCreate()

25/08/27 06:25:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
#path for our data
hdfs_path = '/data/olist/'

In [48]:
customer_df = spark.read.csv(hdfs_path+'olist_customers_dataset.csv', header = True, inferSchema =True)
geolocation_df = spark.read.csv(hdfs_path+'olist_geolocation_dataset.csv', header = True, inferSchema =True)
order_items_df = spark.read.csv(hdfs_path+'olist_order_items_dataset.csv', header = True, inferSchema =True)
order_payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv' ,header = True, inferSchema = True)
order_reviews_df = spark.read.csv(hdfs_path+'olist_order_reviews_dataset.csv', header = True, inferSchema =True)
orders_df = spark.read.csv(hdfs_path+'olist_orders_dataset.csv', header = True, inferSchema =True)
products_df = spark.read.csv(hdfs_path+'olist_products_dataset.csv', header = True, inferSchema =True)
sellers_df = spark.read.csv(hdfs_path+'olist_sellers_dataset.csv', header = True, inferSchema =True)
category_name_df = spark.read.csv(hdfs_path+'product_category_name_translation.csv', header = True, inferSchema =True)

                                                                                

In [49]:
from pyspark.sql.functions import *

In [50]:
def missing_values(df, df_name):
    print(f'missing values in {df_name}: ')
    df.select([count(when(col(c).isNull(),1)).alias(c) for c in df.columns]).show()

In [51]:
missing_values(customer_df,'customer')

missing values in customer: 
+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+



## Handeling missing values
1. Drop missing values (for non critical columns)
2. Fill missign values (mostly done for numerical columns)
3. impute missing values (for continuous data)

In [52]:
orders_df_cleaned = orders_df.na.drop(subset=['order_id', 'customer_id', 'order_status'])

In [53]:
orders_df_cleaned.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [54]:
orders_df_cleaned = orders_df.fillna({'order_delivered_customer_date':'9999:12:31'})

In [55]:
orders_df_cleaned.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [56]:
from pyspark.ml.feature import Imputer

In [57]:
imputer = Imputer(inputCols = ['payment_value'], outputCols=['payment_value_imputed']).setStrategy('mean')
payments_df_cleaned = imputer.fit(order_payments_df).transform(order_payments_df)

In [58]:
payments_df_cleaned.show()

+--------------------+------------------+------------+--------------------+-------------+---------------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|payment_value_imputed|
+--------------------+------------------+------------+--------------------+-------------+---------------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|                99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|                24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|                65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|               107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|               128.45|
|298fcdf1f73eb413e...|                 1| credit_card|                   2|        96.12|               

## Standardizing the format

In [59]:
def print_schema(df, df_name):
    print(f'schema of {df_name}: ')
    df.printSchema()

In [60]:
print_schema(orders_df, 'orders')

schema of orders: 
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [61]:
print_schema(customer_df, 'customer')

schema of customer: 
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [62]:
print_schema(order_payments_df,'payments')

schema of payments: 
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [63]:
orders_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [64]:
orders_df_cleaned = orders_df_cleaned.withColumn('order_purchase_timestamp', to_date(col('order_purchase_timestamp')))
orders_df_cleaned.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|              2017-10-02|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|              2018-07-24|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [None]:
order_payments_df.show()

In [36]:
order_payments_df.show()

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|
|298fcdf1f73eb413e...|                 1| credit_card|                   2|        96.12|
|771ee386b001f0620...|                 1| credit_card|                   1|        81.16|
|3d7239c394a212faa...|                 1| credit_card|                   3|        51.84|
|1f78449c8

In [67]:
from pyspark.sql.functions import when, col

order_payments_df_cleaned = order_payments_df.withColumn(
    'payment_type',
    when(col('payment_type') == 'boleto', 'Bank_transfer').otherwise(col('payment_type'))
)

In [68]:
order_payments_df_cleaned.show()

+--------------------+------------------+-------------+--------------------+-------------+
|            order_id|payment_sequential| payment_type|payment_installments|payment_value|
+--------------------+------------------+-------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1|  credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1|  credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1|  credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1|  credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1|  credit_card|                   2|       128.45|
|298fcdf1f73eb413e...|                 1|  credit_card|                   2|        96.12|
|771ee386b001f0620...|                 1|  credit_card|                   1|        81.16|
|3d7239c394a212faa...|                 1|  credit_card|                   3|        51.84|

In [70]:
customer_df.show()

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
|879864dab9bc30475...|4c93744516667ad3b...|                   89254|      jaragua do sul|            SC|
|fd826e7cf63160e53...|addec96d2e059c80c...|            

In [71]:
customer_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



As here the zip code is a int value but we can't leave it as this because if someone bymistakenly performed any operations on this then it'll cause an issue, so we will convert the column data type into string

In [73]:
customer_df_cleaned = customer_df.withColumn('customer_zip_code_prefix', col('customer_zip_code_prefix').cast(StringType())) 

In [74]:
customer_df_cleaned.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [75]:
customer_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [80]:
customer_df.count()

99441

## Remove Duplicate records

In [77]:
customer_df_cleaned = customer_df_cleaned.dropDuplicates(['customer_id'])

In [81]:
customer_df_cleaned.count()

                                                                                

99441

As during exploration part we concluded no duplicate data in the dataset

## Merging tables

In [88]:
order_with_details = orders_df.join(order_items_df,'order_id','left')\
.join(customer_df_cleaned, 'customer_id', 'left')\
.join(order_payments_df_cleaned, 'order_id', 'left')

In [89]:
order_with_details.show(5)

[Stage 114:>                                                        (0 + 1) / 1]

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+--------------------+------------------------+-------------+--------------+------------------+-------------+--------------------+-------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|  customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|payment_sequential| payment_type|payment_installments|payment_value|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+----------

                                                                                

In [90]:
order_with_total_values = order_with_details.groupBy('order_id')\
.agg(sum('payment_value').alias('total_order_value'))

In [91]:
order_with_total_values.show(5)



+--------------------+-----------------+
|            order_id|total_order_value|
+--------------------+-----------------+
|1c4a92d82c1b0dec1...|747.1500000000001|
|28eaf054725f4dd3c...|             45.0|
|78cd965d0bc0388d3...|           189.37|
|126f2d9c30f82426d...|           155.01|
|d6d7c431275f0029d...|           161.71|
+--------------------+-----------------+
only showing top 5 rows



                                                                                

# Advanced transformation

## Removing outliers

In [92]:

order_items_df.show()

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35|  58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13| 239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30| 199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18| 12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51| 199.9|        18.14|
|00048cc3ae777c65d...|            1|ef92

In [93]:
quantiles = order_items_df.approxQuantile('price', [0.01, 0.99], 0.0)
low_cutoff,high_cutoff = quantiles[0], quantiles[1]

                                                                                

In [94]:
low_cutoff,high_cutoff

(9.99, 890.0)

In [95]:
#removing ourliers
order_items_df_cleaned = order_items_df.filter((col('price') >= low_cutoff) & (col('price')<= high_cutoff))

In [97]:
order_items_df.select('price').summary().show()



+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|            112650|
|   mean|120.65373901471354|
| stddev|183.63392805026012|
|    min|              0.85|
|    25%|              39.9|
|    50%|             74.99|
|    75%|             134.9|
|    max|            6735.0|
+-------+------------------+



                                                                                

In [98]:
order_items_df_cleaned.select('price').summary().show()



+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|            110453|
|   mean|108.49213068006871|
| stddev|112.87303173792675|
|    min|              9.99|
|    25%|             39.99|
|    50%|              74.9|
|    75%|             130.0|
|    max|             890.0|
+-------+------------------+



[Stage 127:>                                                        (0 + 1) / 1]                                                                                

Now we can see the values are being removed which are potential outliers

In [99]:
products_df.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|
|96bd76ec8810374ed...|        esporte_lazer|                 46|                       250|    

Here we can specify values according to range of product weight

In [101]:
products_df_cleaned  = products_df.withColumn('product_weight_g', \
                                              when(col('product_weight_g')< 500, 'small')
                                             .when(col('product_weight_g').between(500, 2000), 'Medium')
                                             .otherwise('Large')
                                             )

In [102]:
products_df_cleaned.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|           small|               16|               10|              14|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|          Medium|               30|               18|              20|
|96bd76ec8810374ed...|        esporte_lazer|                 46|                       250|    

In [103]:
!hadoop fs -ls /data/olist/

Found 9 items
-rw-r--r--   2 sibashishsatipali hadoop    9033957 2025-08-21 20:26 /data/olist/olist_customers_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop   61273883 2025-08-21 20:26 /data/olist/olist_geolocation_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop   15438671 2025-08-21 20:26 /data/olist/olist_order_items_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop    5777138 2025-08-21 20:26 /data/olist/olist_order_payments_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop   14451670 2025-08-21 20:26 /data/olist/olist_order_reviews_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop   17654914 2025-08-21 20:26 /data/olist/olist_orders_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop    2379446 2025-08-21 20:26 /data/olist/olist_products_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop     174703 2025-08-21 20:26 /data/olist/olist_sellers_dataset.csv
-rw-r--r--   2 sibashishsatipali hadoop       2613 2025-08-21 20:26 /data/olist/product_category_name_translation.c

In [104]:
!hadoop fs -mkdir /data/olist_processed

In [105]:
order_with_details.write.mode('overwrite').parquet('/data/olist_processed/cleaned_data.parquet')

                                                                                

In [107]:
!hadoop fs -ls /data/olist_processed

Found 1 items
drwxr-xr-x   - root hadoop          0 2025-08-27 19:26 /data/olist_processed/cleaned_data.parquet


In [108]:
!hadoop fs -mv /data/olist_processed/cleaned_data.parquet /data/olist_processed/detailed_order.parquet

In [109]:
products_df_cleaned.write.mode('overwrite').parquet('/data/olist_processed/cleaned_products.parquet')

                                                                                

In [110]:
!hadoop fs -ls /data/olist_processed

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-08-27 19:30 /data/olist_processed/cleaned_products.parquet
drwxr-xr-x   - root hadoop          0 2025-08-27 19:26 /data/olist_processed/detailed_order.parquet


In [1]:
spark.stop()