In [4]:
import pandas as pd


# Directory that holds the Olist CSV files. Change this one value to run the
# notebook against a different data location.
DATA_DIR = "/content"

# Raw tables, loaded as-is (no dtype or date parsing at this stage).
orders = pd.read_csv(f"{DATA_DIR}/olist_orders_dataset.csv")
items = pd.read_csv(f"{DATA_DIR}/olist_order_items_dataset.csv")
customers = pd.read_csv(f"{DATA_DIR}/olist_customers_dataset.csv")
reviews = pd.read_csv(f"{DATA_DIR}/olist_order_reviews_dataset.csv")
products = pd.read_csv(f"{DATA_DIR}/olist_products_dataset.csv")


In [5]:

# Timestamp columns of `orders` that are converted to datetime64 below.
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

# Parse every listed column in one column-wise pass and write the results back.
orders[date_cols] = orders[date_cols].apply(pd.to_datetime)


In [6]:
# Build one wide table by left-joining every lookup onto `orders`.
# Each merge states the cardinality it expects via `validate=`, so a duplicated
# key raises pandas.errors.MergeError instead of silently multiplying rows.

# customers into orders: each order references one customer record
orders_full = pd.merge(
    orders, customers, on='customer_id', how='left', validate='many_to_one'
)

# items into orders: an order may have several item rows, so from this point on
# the table has one row per order item (orders with no items keep a single row
# with missing item columns because of the left join)
orders_full = pd.merge(
    orders_full, items, on='order_id', how='left', validate='one_to_many'
)

# product metadata: each item row references one product record
orders_full = pd.merge(
    orders_full, products, on='product_id', how='left', validate='many_to_one'
)

# Merge reviews: keep the first review row found for each order_id, so that the
# join cannot add rows; only the score is carried over
reviews_clean = reviews.drop_duplicates(subset='order_id')  # one review per order
orders_full = pd.merge(
    orders_full,
    reviews_clean[['order_id', 'review_score']],
    on='order_id',
    how='left',
    validate='many_to_one',
)


In [7]:
# Delivery delay in days: actual customer delivery minus the estimated date.
# Negative values mean the order arrived before the estimate. `.dt.days` keeps
# only the whole-day component of the difference.
delivered_at = orders_full['order_delivered_customer_date']
estimated_for = orders_full['order_estimated_delivery_date']
delivery_gap = delivered_at - estimated_for
orders_full['delivery_delay'] = delivery_gap.dt.days

# Total value (price + freight), computed for each row of the merged table.
row_price = orders_full['price']
row_freight = orders_full['freight_value']
orders_full['total_order_value'] = row_price + row_freight


In [9]:
# Preview the first five rows of the merged table.
orders_full.head(5)



Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,...,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,review_score,delivery_delay,total_order_value
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,7c396fd4830fd04220f754e42b4e5bff,3149,...,40.0,268.0,4.0,500.0,19.0,8.0,13.0,4.0,-8.0,38.71
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,af07308b275d755c9edb36a90c618231,47813,...,29.0,178.0,1.0,400.0,19.0,13.0,19.0,4.0,-6.0,141.46
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,3a653a41f6f9fc3d2a113cf8398680e8,75265,...,46.0,232.0,1.0,420.0,24.0,19.0,21.0,5.0,-18.0,179.12
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,7c142cf63193a1473d2e66489a9ae977,59296,...,59.0,468.0,3.0,450.0,30.0,10.0,20.0,5.0,-13.0,72.2
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,72632f0f9dd73dfee390c9b22eb56dd6,9195,...,38.0,316.0,4.0,250.0,51.0,15.0,15.0,5.0,-10.0,28.62


In [10]:
# Save the merged table in the session's working directory first, so the file
# exists even when the browser download below is not available.
orders_full.to_csv("orders_full.csv", index=False)

# Trigger a download to your system when running on Google Colab. Outside
# Colab the google.colab package cannot be imported; in that case the CSV is
# simply left on disk instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("orders_full.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>