<a href="https://colab.research.google.com/github/ruan-narici/brazilian-ecommerce-analytics/blob/main/brazilian_ecommerce_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Desafio de Projeto: Análise de Vendas e Entregas no E-commerce Brasileiro

### Imports

In [1]:
import pandas as pd

### Load datasets

In [3]:
df_customers = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_customers_dataset.csv") #OK
# df_geolocation = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_geolocation_dataset.csv") #EXCLUDE
df_order_items = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_order_items_dataset.csv") #OK
df_order_payments = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_order_payments_dataset.csv") #OK
df_order_reviews = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_order_reviews_dataset.csv") #OK
df_orders = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_orders_dataset.csv") #OK
df_products = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_products_dataset.csv") #OK
df_sellers = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/olist_sellers_dataset.csv") #OK
df_product_category_name_translation = pd.read_csv("/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist/product_category_name_translation.csv") #OK

### Define Types

In [4]:
# DateTimeType
df_order_items["shipping_limit_date"] = pd.to_datetime(df_order_items["shipping_limit_date"])
df_order_reviews["review_creation_date"] = pd.to_datetime(df_order_reviews["review_creation_date"])
df_order_reviews["review_answer_timestamp"] = pd.to_datetime(df_order_reviews["review_answer_timestamp"])
df_orders["order_purchase_timestamp"] = pd.to_datetime(df_orders["order_purchase_timestamp"])
df_orders["order_approved_at"] = pd.to_datetime(df_orders["order_approved_at"])
df_orders["order_delivered_carrier_date"] = pd.to_datetime(df_orders["order_delivered_carrier_date"])
df_orders["order_delivered_customer_date"] = pd.to_datetime(df_orders["order_delivered_customer_date"])
df_orders["order_estimated_delivery_date"] = pd.to_datetime(df_orders["order_estimated_delivery_date"])

### Merge datasets

In [32]:
df_merged = df_orders.merge(df_order_items, on="order_id")
df_merged = df_merged.merge(df_customers, on="customer_id")
df_merged = df_merged.merge(df_sellers, on="seller_id")
df_merged = df_merged.merge(df_products, on="product_id")
df_merged = df_merged.merge(df_order_payments, on="order_id")
df_merged = df_merged.merge(df_product_category_name_translation, on="product_category_name")
df_merged = df_merged.merge(df_order_reviews, on="order_id")

df_merged.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'product_category_name_english', 'review_id',
       'review_score', 'review_comment_title', 'review_comment_message',
       'review_creation_date', 'review_answer_timestamp'],
      dtype='object')

### Handling null values

In [21]:
# DateTimeType
df_merged["order_approved_at"] = df_merged["order_approved_at"].fillna('1900-01-01')
df_merged["order_delivered_carrier_date"] = df_merged["order_delivered_carrier_date"].fillna('1900-01-01')
df_merged["order_delivered_customer_date"] = df_merged["order_delivered_customer_date"].fillna('1900-01-01')

# FloatType
df_merged["product_weight_g"] = df_merged["product_weight_g"].fillna('-2')
df_merged["product_length_cm"] = df_merged["product_length_cm"].fillna('-2')
df_merged["product_height_cm"] = df_merged["product_height_cm"].fillna('-2')
df_merged["product_width_cm"] = df_merged["product_width_cm"].fillna('-2')
df_merged["review_comment_message"] = df_merged["product_width_cm"].fillna('-2')

# StrType
df_merged["review_comment_title"] = df_merged["review_comment_title"].fillna('-2')

df_merged.isna().sum()

Unnamed: 0,0
order_id,0
customer_id,0
order_status,0
order_purchase_timestamp,0
order_approved_at,0
order_delivered_carrier_date,0
order_delivered_customer_date,0
order_estimated_delivery_date,0
order_item_id,0
product_id,0


### Selecting and Creating some columns

In [161]:
# Selecting some Columns
df_analysis = df_merged[
    [
        "order_id",
        "seller_id",
        "customer_id",
        "order_purchase_timestamp",
        "order_approved_at",
        "shipping_limit_date",
        "order_estimated_delivery_date",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_status",
        "product_id",
        "seller_city",
        "seller_state",
        "customer_city",
        "customer_state",
        "product_category_name_english",
        "payment_type",
        "price",
        "freight_value",
        "payment_value",
        "review_score",
        "review_creation_date",
        "review_answer_timestamp",
        ]
    ].copy()


# Creating some columns
df_analysis["order_delivered_averaga_date"] = (df_analysis["order_delivered_customer_date"] - df_analysis["order_approved_at"]).dt.days

# Preview
df_analysis.sample(5)

Unnamed: 0,order_id,seller_id,customer_id,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_estimated_delivery_date,order_delivered_carrier_date,order_delivered_customer_date,order_status,...,customer_state,product_category_name_english,payment_type,price,freight_value,payment_value,review_score,review_creation_date,review_answer_timestamp,order_delivered_averaga_date
97676,9370e7790e15adbc2492065d34524285,3d871de0142ce09b7081e2b9d1733cb1,21b0f669f2a38c34edc678dd70605a4a,2017-09-10 15:33:31,2017-09-12 04:45:15,2017-09-18 04:45:15,2017-10-04,2017-09-12 20:03:36,2017-09-20 15:26:56,delivered,...,MG,toys,boleto,29.0,16.11,45.11,5,2017-09-21,2017-09-23 22:18:19,8.0
36513,580603672a21252f21fa8a8b4ca85986,f214d28e8d8e3ef068748498ccc2f813,c742ac1fd3f3b453ea59c439d6dc4e39,2018-04-19 22:05:25,2018-04-19 22:15:19,2018-04-25 22:15:19,2018-05-08,2018-04-20 19:37:44,2018-04-26 17:44:27,delivered,...,MG,toys,credit_card,34.99,15.23,100.44,2,2018-04-27,2018-04-27 21:35:33,6.0
84181,0b20c9791c7a502fc70f7f542e9a1405,85d9eb9ddc5d00ca9336a2219c97bb13,2d0386ffa6a583f65981153d4f3ad5a4,2018-07-20 19:05:52,2018-07-20 19:35:14,2018-08-01 19:35:14,2018-08-23,2018-07-26 13:39:00,2018-08-02 17:34:53,delivered,...,PA,computers_accessories,credit_card,22.32,22.88,45.2,5,2018-08-03,2018-08-04 12:13:17,12.0
108147,d07d0cce426d7c1f4043cee01fe073da,cbd996ad3c1b7dc71fd0e5f5df9087e2,278585b98e91dea37fc25cbc9ea52e79,2018-04-08 18:44:10,2018-04-08 18:55:19,2018-04-12 18:55:19,2018-05-04,2018-04-10 00:47:30,2018-04-20 01:16:48,delivered,...,RJ,food,credit_card,56.97,18.02,74.99,5,2018-04-20,2018-04-22 21:55:12,11.0
73496,503c7d5d134281cbc25783a87057f690,fffd5413c0700ac820c7069d66d98c89,ac3be1c7d1ff665931f887f4a1534c41,2018-06-15 08:34:15,2018-06-15 08:56:05,2018-06-21 08:56:05,2018-07-17,2018-06-15 14:09:00,2018-06-19 13:08:34,delivered,...,RJ,housewares,credit_card,126.0,89.55,215.55,5,2018-06-20,2018-06-22 23:57:32,4.0


### Exploratory Analysis

Top 7 of most sellers by seller state

In [162]:
# Top 7 of most sellers by seller state
df_most_sellers_by_seller_state = df_analysis.groupby(["seller_state"]).agg({"payment_value": "sum", "seller_state": "count"}).rename(columns={"seller_state": "total_sales", "payment_value": "total_sale_value"}).sort_values(by="total_sales", ascending=False)
df_most_sellers_by_seller_state.head(7)

Unnamed: 0_level_0,total_sale_value,total_sales
seller_state,Unnamed: 1_level_1,Unnamed: 2_level_1
SP,13102725.34,82417
MG,1524405.49,9014
PR,1826605.49,8964
RJ,1075181.72,4906
SC,880990.16,4221
RS,551309.32,2224
DF,135787.01,937


Top 7 of most purchases by customer state

In [163]:
# Top 7 of most purchases by customer state
df_most_purchases_by_customer_state = df_analysis.groupby(["customer_state"]).agg({"payment_value": "sum", "customer_state": "count"}).rename(columns={"payment_value": "total_payment_value", "customer_state": "total_purchases"}).sort_values(by="total_purchases", ascending=False)
df_most_purchases_by_customer_state.head(7)

Unnamed: 0_level_0,total_payment_value,total_purchases
customer_state,Unnamed: 1_level_1,Unnamed: 2_level_1
SP,7502926.95,48797
RJ,2708839.33,14987
MG,2288949.71,13429
RS,1131899.22,6413
PR,1055747.81,5879
SC,769744.94,4218
BA,780334.54,3942


Top 7 average of most expensive freight by customer state

In [164]:
# Top 7 average of most expensive freight by customer state
df_avg_most_expensive_freight_by_customer = df_analysis.groupby(["customer_state"]).agg({"freight_value": "mean"}).rename(columns={"freight_value": "total_freight_value"}).sort_values(by="total_freight_value", ascending=False)
df_avg_most_expensive_freight_by_customer.head(7)

Unnamed: 0_level_0,total_freight_value
customer_state,Unnamed: 1_level_1
RR,43.587
PB,43.43685
RO,41.077849
AC,40.232473
TO,40.007778
PI,39.205989
MA,38.293858


Top 7 average of most days to delivered by customer state

In [165]:
# Top 7 average of most days to delivered by customer state
df_avg_most_days_to_delivered_by_customer_state = df_analysis.groupby("customer_state").agg({"order_delivered_averaga_date": "mean"}).rename(columns={"order_delivered_averaga_date": "days_to_delivery"}).sort_values(by="days_to_delivery", ascending=False)
df_avg_most_days_to_delivered_by_customer_state.head(7)

Unnamed: 0_level_0,days_to_delivery
customer_state,Unnamed: 1_level_1
RR,27.431818
AP,26.890244
AM,25.538922
AL,23.490868
PA,22.507576
MA,20.469641
SE,20.353403
