<a href="https://colab.research.google.com/github/ruan-narici/brazilian-ecommerce-analytics/blob/main/brazilian_ecommerce_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Desafio de Projeto: Análise de Vendas e Entregas no E-commerce Brasileiro

### Imports

In [1]:
import pandas as pd

### Load datasets

In [3]:
# Folder that holds the Olist CSV files. Kept in a single constant so the
# notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = "/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist"

# One DataFrame per CSV file. olist_geolocation_dataset.csv is deliberately
# not loaded: it is excluded from this analysis.
df_customers = pd.read_csv(f"{DATA_DIR}/olist_customers_dataset.csv")
df_order_items = pd.read_csv(f"{DATA_DIR}/olist_order_items_dataset.csv")
df_order_payments = pd.read_csv(f"{DATA_DIR}/olist_order_payments_dataset.csv")
df_order_reviews = pd.read_csv(f"{DATA_DIR}/olist_order_reviews_dataset.csv")
df_orders = pd.read_csv(f"{DATA_DIR}/olist_orders_dataset.csv")
df_products = pd.read_csv(f"{DATA_DIR}/olist_products_dataset.csv")
df_sellers = pd.read_csv(f"{DATA_DIR}/olist_sellers_dataset.csv")
df_product_category_name_translation = pd.read_csv(f"{DATA_DIR}/product_category_name_translation.csv")

### Define Types

In [4]:
# Convert every timestamp column to a pandas datetime, one table at a time.
# Each entry pairs a DataFrame with the names of the columns to convert.
datetime_columns_by_frame = [
    (df_order_items, ["shipping_limit_date"]),
    (df_order_reviews, ["review_creation_date", "review_answer_timestamp"]),
    (
        df_orders,
        [
            "order_purchase_timestamp",
            "order_approved_at",
            "order_delivered_carrier_date",
            "order_delivered_customer_date",
            "order_estimated_delivery_date",
        ],
    ),
]

for frame, date_columns in datetime_columns_by_frame:
    for column_name in date_columns:
        frame[column_name] = pd.to_datetime(frame[column_name])

### Merge datasets

In [32]:
# Build one wide table by joining every dataset onto the orders, in sequence.
# NOTE(review): all merges use the default inner join, so a row with no match
# in any later table is dropped. Payments and reviews are joined on order_id
# onto item-level rows, so an order with several items, payments or reviews
# presumably appears in several rows -- confirm before summing money columns.
df_merged = (
    df_orders
    .merge(df_order_items, on="order_id")
    .merge(df_customers, on="customer_id")
    .merge(df_sellers, on="seller_id")
    .merge(df_products, on="product_id")
    .merge(df_order_payments, on="order_id")
    .merge(df_product_category_name_translation, on="product_category_name")
    .merge(df_order_reviews, on="order_id")
)

df_merged.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'product_category_name_english', 'review_id',
       'review_score', 'review_comment_title', 'review_comment_message',
       'review_creation_date', 'review_answer_timestamp'],
      dtype='object')

### Handling null values

In [21]:
# Placeholder values that mark data that was missing in the source files.
# Each placeholder has the same type as the column it fills, so datetime
# columns stay datetimes and numeric columns stay numeric.
MISSING_DATE = pd.Timestamp("1900-01-01")
MISSING_NUMBER = -2.0
MISSING_TEXT = "-2"

# DateTimeType
# Rows carrying MISSING_DATE must be excluded from any date arithmetic,
# otherwise the placeholder produces meaningless durations.
datetime_columns_with_nulls = [
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
]

# FloatType
numeric_columns_with_nulls = [
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

# StrType
# review_comment_message is filled from its own values; it was previously
# overwritten with the contents of product_width_cm.
text_columns_with_nulls = [
    "review_comment_title",
    "review_comment_message",
]

fill_value_by_column = {
    **dict.fromkeys(datetime_columns_with_nulls, MISSING_DATE),
    **dict.fromkeys(numeric_columns_with_nulls, MISSING_NUMBER),
    **dict.fromkeys(text_columns_with_nulls, MISSING_TEXT),
}

for column_name, fill_value in fill_value_by_column.items():
    df_merged[column_name] = df_merged[column_name].fillna(fill_value)

# Remaining nulls per column, to confirm the fill worked.
df_merged.isna().sum()

Unnamed: 0,0
order_id,0
customer_id,0
order_status,0
order_purchase_timestamp,0
order_approved_at,0
order_delivered_carrier_date,0
order_delivered_customer_date,0
order_estimated_delivery_date,0
order_item_id,0
product_id,0


### Selecting some columns

In [143]:
# Columns kept for the exploratory analysis, in display order.
analysis_columns = [
    # identifiers
    "order_id",
    "seller_id",
    "customer_id",
    # order timeline
    "order_purchase_timestamp",
    "order_approved_at",
    "shipping_limit_date",
    "order_estimated_delivery_date",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_status",
    # product and location
    "product_id",
    "seller_city",
    "seller_state",
    "customer_city",
    "customer_state",
    "product_category_name_english",
    # payment
    "payment_type",
    "price",
    "freight_value",
    "payment_value",
    # review
    "review_score",
    "review_creation_date",
    "review_answer_timestamp",
]

# .copy() gives df_analysis its own data, so columns added to it later
# do not touch df_merged.
df_analysis = df_merged.loc[:, analysis_columns].copy()

df_analysis.sample(5)

Unnamed: 0,order_id,seller_id,customer_id,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_estimated_delivery_date,order_delivered_carrier_date,order_delivered_customer_date,order_status,...,customer_city,customer_state,product_category_name_english,payment_type,price,freight_value,payment_value,review_score,review_creation_date,review_answer_timestamp
81999,fb699452d6799cc45801c305bc0db749,caa9bc43a9fe8cf9c564ddd8a03cc4a6,68fee6738238719fc44935a11806101e,2018-06-24 13:53:51,2018-06-24 17:18:22,2018-06-28 17:18:22,2018-07-12,2018-06-25 15:08:00,2018-06-26 20:03:58,delivered,...,americana,SP,toys,credit_card,44.99,9.01,54.0,4,2018-06-27,2018-07-02 10:50:25
53596,5ac2db22c3a1a5c7d59208f225e41fef,a1043bafd471dff536d0c462352beb48,ddb4b59eafb402e9e5dcce9204b5aa21,2018-05-11 16:33:12,2018-05-11 16:59:55,2018-05-15 16:59:55,2018-06-14,2018-05-14 12:02:00,2018-06-23 19:58:32,delivered,...,rio de janeiro,RJ,garden_tools,credit_card,179.0,43.95,222.95,5,2018-06-16,2018-06-16 15:26:32
94353,ce46087fc2b4d09a468f6599e33dafd0,1f50f920176fa81dab994f9023523100,3736f7b259155b4d4b0757dcea364300,2018-03-26 11:52:38,2018-03-26 12:10:25,2018-03-30 12:10:25,2018-04-06,2018-03-28 19:57:22,2018-03-29 21:03:55,delivered,...,sao jose do rio preto,SP,garden_tools,credit_card,53.9,11.86,65.76,5,2018-03-30,2018-03-31 10:49:45
3408,762fa054ae976d76eea5688eaa2a92e6,1025f0e2d44d7041d6cf58b6550e0bfa,8205033143e83aeee71e4df23fedc372,2018-08-26 09:22:44,2018-08-26 09:35:15,2018-08-31 09:35:15,2018-09-05,2018-08-29 14:45:00,2018-08-30 20:50:41,delivered,...,guarulhos,SP,furniture_decor,credit_card,79.9,15.86,95.76,4,2018-08-31,2018-09-01 01:05:26
44468,a050701f6d3065c5615ffaccf27ec567,112453736dbe3889cfb74e1aaa0ba0c1,7e2637810f18996f6578e128406a08a8,2018-08-18 20:19:40,2018-08-18 20:35:13,2018-08-24 20:35:13,2018-09-10,2018-08-24 13:24:00,2018-08-30 14:51:09,delivered,...,palhoca,SC,auto,credit_card,135.0,18.75,307.5,5,2018-08-31,2018-09-02 20:04:18


### Exploratory Analysis

Top 7 seller states by number of sales

In [149]:
# Top 7 seller states by number of sales.
# Per seller state: the sum of payment_value and the number of rows,
# ordered from the state with the most rows to the one with the fewest.
df_most_sellers_by_seller_state = (
    df_analysis
    .groupby(["seller_state"])
    .agg(
        total_sale_value=("payment_value", "sum"),
        total_sales=("seller_state", "count"),
    )
    .sort_values(by="total_sales", ascending=False)
)
df_most_sellers_by_seller_state.head(7)

Unnamed: 0_level_0,total_sale_value,total_sales
seller_state,Unnamed: 1_level_1,Unnamed: 2_level_1
SP,13102725.34,82417
MG,1524405.49,9014
PR,1826605.49,8964
RJ,1075181.72,4906
SC,880990.16,4221
RS,551309.32,2224
DF,135787.01,937


Top 7 customer states by number of purchases

In [150]:
# Top 7 customer states by number of purchases.
# Per customer state: the sum of payment_value and the number of rows,
# ordered from the state with the most rows to the one with the fewest.
df_most_purchases_by_customer_state = (
    df_analysis
    .groupby(["customer_state"])
    .agg(
        total_payment_value=("payment_value", "sum"),
        total_purchases=("customer_state", "count"),
    )
    .sort_values(by="total_purchases", ascending=False)
)
df_most_purchases_by_customer_state.head(7)

Unnamed: 0_level_0,total_payment_value,total_purchases
customer_state,Unnamed: 1_level_1,Unnamed: 2_level_1
SP,7502926.95,48797
RJ,2708839.33,14987
MG,2288949.71,13429
RS,1131899.22,6413
PR,1055747.81,5879
SC,769744.94,4218
BA,780334.54,3942


Top 7 customer states by average freight value

In [151]:
# Top 7 customer states by average freight value.
# The result column is called total_freight_value, but it holds the mean
# freight_value of each customer state, highest first.
mean_freight_per_state = df_analysis.groupby(["customer_state"])["freight_value"].mean()
df_avg_most_expensive_freight_by_customer = (
    mean_freight_per_state
    .to_frame("total_freight_value")
    .sort_values(by="total_freight_value", ascending=False)
)
df_avg_most_expensive_freight_by_customer.head(7)

Unnamed: 0_level_0,total_freight_value
customer_state,Unnamed: 1_level_1
RR,43.587
PB,43.43685
RO,41.077849
AC,40.232473
TO,40.007778
PI,39.205989
MA,38.293858


Top 7 customer states by average number of days to delivery

In [156]:
# Top 7 customer states by average number of days from approval to delivery.

# Missing approval/delivery dates may be NaT or the 1900-01-01 placeholder.
# Either one would give a meaningless duration, so those rows are left
# out of the average.
missing_date_placeholder = pd.Timestamp("1900-01-01")
approved_at = pd.to_datetime(df_analysis["order_approved_at"])
delivered_at = pd.to_datetime(df_analysis["order_delivered_customer_date"])

# Comparisons against NaT evaluate to False, so this mask rejects both
# NaT and placeholder dates.
has_both_dates = (approved_at > missing_date_placeholder) & (delivered_at > missing_date_placeholder)

# Whole days between approval and delivery; NaN where a date is unknown.
# The column name (including its spelling) is kept so that code which
# already reads it keeps working.
df_analysis["order_delivered_averaga_date"] = (delivered_at - approved_at).dt.days.where(has_both_dates)

# The mean skips NaN, so only rows with both dates known contribute.
df_avg_most_days_to_delivered_by_customer_state = (
    df_analysis
    .groupby("customer_state")
    .agg(days_to_delivery=("order_delivered_averaga_date", "mean"))
    .sort_values(by="days_to_delivery", ascending=False)
)
df_avg_most_days_to_delivered_by_customer_state.head(7)

Unnamed: 0_level_0,days_to_delivery
customer_state,Unnamed: 1_level_1
RR,27.431818
AP,26.890244
AM,25.538922
AL,23.490868
PA,22.507576
MA,20.469641
SE,20.353403
