<a href="https://colab.research.google.com/github/ruan-narici/brazilian-ecommerce-analytics/blob/main/brazilian_ecommerce_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Desafio de Projeto: Análise de Vendas e Entregas no E-commerce Brasileiro

### Imports

In [1]:
import pandas as pd

### Load datasets

In [3]:
# Folder that holds the Olist "Brazilian E-Commerce Public Dataset" CSV files.
# Kept in one constant so the notebook can be pointed at another location by
# editing a single line.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably
# has to be mounted in Colab before this cell runs — confirm.
DATA_DIR = "/content/drive/MyDrive/Documentos/Dataset/Brazilian E-Commerce Public Dataset by Olist"

# Each CSV is read into its own DataFrame.
# olist_geolocation_dataset.csv is intentionally not loaded (excluded from this analysis).
df_customers = pd.read_csv(f"{DATA_DIR}/olist_customers_dataset.csv")
df_order_items = pd.read_csv(f"{DATA_DIR}/olist_order_items_dataset.csv")
df_order_payments = pd.read_csv(f"{DATA_DIR}/olist_order_payments_dataset.csv")
df_order_reviews = pd.read_csv(f"{DATA_DIR}/olist_order_reviews_dataset.csv")
df_orders = pd.read_csv(f"{DATA_DIR}/olist_orders_dataset.csv")
df_products = pd.read_csv(f"{DATA_DIR}/olist_products_dataset.csv")
df_sellers = pd.read_csv(f"{DATA_DIR}/olist_sellers_dataset.csv")
df_product_category_name_translation = pd.read_csv(f"{DATA_DIR}/product_category_name_translation.csv")

### Define Types

In [4]:
# DateTimeType
# Table-driven conversion: every listed column is parsed with pd.to_datetime and
# written back onto the same DataFrame it came from.
for frame, datetime_columns in (
    (df_order_items, ["shipping_limit_date"]),
    (df_order_reviews, ["review_creation_date", "review_answer_timestamp"]),
    (
        df_orders,
        [
            "order_purchase_timestamp",
            "order_approved_at",
            "order_delivered_carrier_date",
            "order_delivered_customer_date",
            "order_estimated_delivery_date",
        ],
    ),
):
    for column in datetime_columns:
        frame[column] = pd.to_datetime(frame[column])

### Merge datasets

In [32]:
# Build one wide table by joining every dataset onto the orders table.
# Each merge uses pandas' default inner join, so a row survives only if it has a
# match in every joined table.
# NOTE(review): order items, payments and reviews are all joined on order_id;
# if an order can have several rows in more than one of those tables the result
# will contain repeated rows per order — verify before aggregating.
df_merged = (
    df_orders
    .merge(df_order_items, on="order_id")
    .merge(df_customers, on="customer_id")
    .merge(df_sellers, on="seller_id")
    .merge(df_products, on="product_id")
    .merge(df_order_payments, on="order_id")
    .merge(df_product_category_name_translation, on="product_category_name")
    .merge(df_order_reviews, on="order_id")
)

df_merged.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'product_category_name_english', 'review_id',
       'review_score', 'review_comment_title', 'review_comment_message',
       'review_creation_date', 'review_answer_timestamp'],
      dtype='object')

### Handling null values

In [21]:
# Replace missing values with explicit sentinels, using a sentinel of the same
# type as the column so the column keeps its dtype.

# DateTimeType: missing timestamps become the sentinel date 1900-01-01.
# NOTE(review): any duration computed from these columns must exclude the
# sentinel rows, otherwise the 1900 placeholder will distort the result.
datetime_columns = [
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
]
df_merged[datetime_columns] = df_merged[datetime_columns].fillna(pd.Timestamp("1900-01-01"))

# FloatType: missing measurements become the numeric sentinel -2 (a number, not
# the string '-2', so the columns stay numeric and can still be aggregated).
float_columns = [
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]
df_merged[float_columns] = df_merged[float_columns].fillna(-2)

# StrType: missing review text becomes the string sentinel '-2'.
# review_comment_message is filled from its own values; it was previously
# overwritten with the contents of product_width_cm.
text_columns = [
    "review_comment_title",
    "review_comment_message",
]
df_merged[text_columns] = df_merged[text_columns].fillna('-2')

# Remaining null count per column, to confirm the handled columns are clean.
df_merged.isna().sum()

Unnamed: 0,0
order_id,0
customer_id,0
order_status,0
order_purchase_timestamp,0
order_approved_at,0
order_delivered_carrier_date,0
order_delivered_customer_date,0
order_estimated_delivery_date,0
order_item_id,0
product_id,0


### Selecting some columns

In [36]:
# Columns carried into the analysis table, grouped by theme.
analysis_columns = [
    # identifiers
    "order_id", "seller_id", "customer_id",
    # order timeline
    "order_purchase_timestamp", "order_approved_at", "shipping_limit_date",
    "order_estimated_delivery_date", "order_delivered_carrier_date",
    "order_delivered_customer_date",
    # order / product descriptors
    "order_status", "product_id",
    # seller and customer location
    "seller_city", "seller_state", "customer_city", "customer_state",
    # category and payment
    "product_category_name_english", "payment_type",
    # monetary values
    "price", "freight_value", "payment_value",
    # review
    "review_score", "review_creation_date", "review_answer_timestamp",
]

# Independent copy, so later edits to df_analysis do not touch df_merged.
df_analysis = df_merged.loc[:, analysis_columns].copy()

# Preview five randomly chosen rows.
df_analysis.sample(5)

Unnamed: 0,order_id,seller_id,customer_id,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_estimated_delivery_date,order_delivered_carrier_date,order_delivered_customer_date,order_status,...,customer_city,customer_state,product_category_name_english,payment_type,price,freight_value,payment_value,review_score,review_creation_date,review_answer_timestamp
39266,416644f246fec0176f413ba4e49a4292,83e197e95a1bbabc8c75e883ed016c47,c11f08a4824c29c57464c20ac031cfc6,2017-04-22 15:05:33,2017-04-22 15:15:19,2017-04-27 15:15:19,2017-05-19,2017-04-24 10:05:07,2017-05-11 09:33:44,delivered,...,americano do brasil,GO,books_general_interest,credit_card,119.5,21.47,140.97,5,2017-05-12,2017-05-12 21:14:40
66520,7797659fa7b8f16a68562da5976d179e,620c87c171fb2a6dd6e8bb4dec959fc6,07268a96fa4f3812925fa07de7cc3c73,2017-04-06 15:26:20,2017-04-06 15:35:15,2017-04-12 15:35:15,2017-05-12,2017-04-07 10:00:22,2017-04-27 09:56:53,delivered,...,maracanau,CE,perfumery,credit_card,79.9,25.05,50.0,5,2017-04-28,2017-04-29 21:08:17
79522,0f04257628f269da8bcc26b6839d66f5,080102cd0a76b09e0dcf55fcacc60e05,78e0ffa402b3adede5ccf60d0976f89a,2018-08-02 20:29:49,2018-08-02 20:44:14,2018-08-06 20:44:14,2018-08-27,2018-08-03 17:23:00,2018-08-15 22:53:35,delivered,...,jaguaruana,CE,computers_accessories,credit_card,139.0,51.69,190.69,4,2018-08-16,2018-08-19 09:19:20
47869,eeacc2219efa996957aa46ab696afbde,18a349e75d307f4b4cc646a691ed4216,b701465c5960839259de1bc16655ef0a,2018-06-26 17:57:47,2018-06-29 02:35:12,2018-07-06 02:35:12,2018-07-27,2018-06-29 13:58:00,2018-07-10 18:21:40,delivered,...,guaiba,RS,cine_photo,boleto,10.0,15.23,100.92,5,2018-07-11,2018-07-13 19:57:59
60040,d41944802ef44ca7d0cea1284fd0a605,850f4f8af5ea87287ac68de36e29107f,e9f0e4ef15a5dac6d7a66274fb0024e5,2017-12-16 23:46:35,2017-12-16 23:55:21,2017-12-20 23:55:21,2018-01-17,2017-12-18 19:44:17,2017-12-27 23:35:28,delivered,...,firminopolis,GO,toys,credit_card,79.9,16.32,96.22,5,2017-12-28,2017-12-29 02:28:23


### Exploratory Analysis

In [48]:
# The ten customer states with the highest mean freight value, highest first.
# NOTE(review): the mean is taken over df_analysis rows as they are; if the
# merged table repeats an item across several rows, those repeats weigh on the
# average — confirm this is intended.
df_top_10_avg_most_expensive_freight_by_customer = (
    df_analysis
    .groupby(["customer_state"])["freight_value"]
    .agg("mean")
    .sort_values(ascending=False)
    .iloc[:10]
)
df_top_10_avg_most_expensive_freight_by_customer

Unnamed: 0_level_0,freight_value
customer_state,Unnamed: 1_level_1
RR,43.587
PB,43.43685
RO,41.077849
AC,40.232473
TO,40.007778
PI,39.205989
MA,38.293858
SE,36.241527
AL,35.683033
PA,35.428538
