# Exploratory Data Analysis

## Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from rapidfuzz import fuzz, process
import unidecode
import requests
import json

In [141]:
sys.path.append(os.path.abspath("../src"))
from consultacep import consulta_cep

## Raw Data

In [None]:
df_customers = pd.read_csv("../data/raw/olist_customers_dataset.csv")
df_order_items = pd.read_csv("../data/raw/olist_order_items_dataset.csv")
df_order_payments = pd.read_csv("../data/raw/olist_order_payments_dataset.csv")
df_orders = pd.read_csv("../data/raw/olist_orders_dataset.csv")
df_products = pd.read_csv("../data/raw/olist_products_dataset.csv")
df_sellers = pd.read_csv("../data/raw/olist_sellers_dataset.csv")
cdf_cities = pd.read_csv("../data/raw/brazilian_cities.csv")
cdf_ceps = pd.read_excel("../data/raw/Lista_de_CEPs.xlsx")


## Checking the relevant data

### Dataframe: Customers

In [17]:
df_customers.sample(10)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
1645,9af11477b7e25648e3f0b63cacb6046b,d9575b595db7de99b327899d9ed25ff1,35010,governador valadares,MG
24939,332cd7b9d5b2e74bf7553bfac694996d,c4384a7902deb7c5b60b480573876d9a,91550,porto alegre,RS
22554,7e9b88bc5b0aa43c38d8992aa2162006,9c9005459415b7174f5afb922d4a7bf1,98280,panambi,RS
1934,6a3392d49966d00c8547fc17d85c2490,c0441350fc63bde5244872590587f1b6,83702,araucaria,PR
32626,25cc5042606c108675729a1e2967486f,eed42da7ee2f3ce26ab52758e63d855f,13276,valinhos,SP
28771,69e15a390ddbb4802714da5f52b14287,1abff90112d2ef4058b7306691e6b7a2,66085,belem,PA
83327,55cae1e7c9b2dd0420cf1f95699d77a6,9724553bac1f3f92a40de0c99327fb6c,36047,juiz de fora,MG
98523,2aaa2dfcd1c4475ac4c077cbb0b72691,151b483e44efee574bd7201e51183f55,85861,foz do iguacu,PR
92026,e4b139287dd53c041c52b2bc68a2c17f,314579e5855c24c990e92cfeacf71937,96071,pelotas,RS
11676,5ba0313d8deae58f670a73b0f8b21b5e,17c24fb77d25f05ca75a38e0dbff06a2,4937,sao paulo,SP


In [14]:
df_customers.info()
df_customers.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [15]:
df_customers.duplicated().sum()

np.int64(0)

- df_customers doesn't have duplicate entrys, no columns to drop at first and no Nulls

### Dataframe: Order_items

In [18]:
df_order_items.sample(10)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
50546,72dae7e56901083df5f2cb3dd2d4841f,1,53ecc2857666872f67f44e60b13d2ed2,d9442042130b7fe579d1295f9f4a248f,2017-06-22 14:23:10,24.5,8.72
36188,520ea64853a832e3e2fe0ff5d6f44667,1,d88d701f11e14213198868c769856a38,213fafb0ca06fb3d5886579c2565791b,2018-08-15 13:30:22,109.9,9.9
7182,1047b7ef8d56907e4256da73c4734bc3,1,906b11f3bc3c98f883afa0eb67b0fac7,440dd6ab244315c632130ecfb63827b1,2017-12-18 10:50:27,127.0,16.52
3931,08dae4aaedc034ee67385e739d42d0a8,1,9686ff1a707a2d304c547f82535c5ee5,6560211a19b47992c3666cc44a7e94c0,2017-10-05 17:28:20,85.0,7.85
2266,052b00d65560313221c5692cdb22fcb3,1,2a5806f10d0f00e5ad032dd2e3c8806e,7c67e1448b00f6e969d365cea6b010ab,2018-07-31 09:10:11,169.99,26.17
8832,1432d48030ced821a4afaabcfc2a8b0f,1,e1da6ab77f4859eb17950e5df1c0f815,dd7ddc04e1b6c2c614352b383efe2d36,2018-06-18 20:52:12,49.9,14.62
87064,c5b5fbe68f87bd4ee6770536a1b30fc3,1,a02d0123079f4ae96001ba2010d1a2df,1025f0e2d44d7041d6cf58b6550e0bfa,2018-08-09 12:50:13,228.0,34.68
43563,62fffe3cf205e0deb385d92481d3cb40,1,086351823300e0339f6955b27998c186,33a6f4b1e7cdc205511e76ba1b6e0186,2018-02-12 16:00:40,115.0,14.56
81815,b9f65a22c4594715663d2ad060df1b6c,1,de80c37c338574091896b7fdc81dc376,5b85809efd0d0e4dea1a9544e1280ed9,2018-07-23 18:43:10,186.9,19.41
1384,0339ed6316da69c73459e191c10ff6ac,1,9fe172fa8e662ca8572cf12abf8bce23,cab85505710c7cb9b720bceb52b01cee,2018-08-09 04:50:30,49.9,13.01


In [19]:
df_order_items.info()
df_order_items.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB


order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

In [20]:
df_order_items.duplicated().sum()

np.int64(0)

In [21]:
df_order_items.drop(columns=['shipping_limit_date', 'freight_value'], inplace=True)

- df_order_items doesn't have duplicate entrys and Nulls.
- dropping columns: shipping_limit_date, freight_value

### Dataframe: Order_payments

In [23]:
df_order_payments.sample(10)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
8640,50b254ace467cbdb9e5181a487f569dd,1,boleto,1,60.1
77516,6d500e2b59a785bd8fec15acb73d341e,1,credit_card,3,123.82
87771,af5bd9a6ec17e248b6a9ad2c0f528acc,1,credit_card,1,95.3
56167,6616fa4c89b8bf2a7e17271cdc542fca,1,boleto,1,106.78
44990,10f79c7ff9fb9b8ce494bd5f0357aa8f,1,credit_card,3,65.22
17270,43a23a9a05a87ff4d46b5b18829fe9ae,1,boleto,1,60.55
55598,fe68b52db13993f58175fa589125d345,1,boleto,1,225.21
101571,9366d3931a502aa67cc11909c49773af,1,credit_card,2,178.27
40088,b511f786475cb429361d4d269c1bbef9,1,credit_card,1,35.19
96232,ae85b9887d49eb8e4e35cea1435a52d8,1,credit_card,8,381.86


In [24]:
df_order_payments.info()
df_order_payments.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [25]:
df_order_payments.duplicated().sum()

np.int64(0)

In [26]:
df_order_payments.drop(columns=['payment_sequential','payment_installments','payment_type'], inplace=True)

- df_order_payments doesn't have duplicate entrys and Nulls.
- dropping columns: payment_sequential, payment_type, payment_installments

### Dataframe: Orders

In [28]:
df_orders.sample(10)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
73964,e4be5aacc6848e44f045dfeb0e005447,547fdf3f3b5ff52d847e7dc145b31766,delivered,2018-02-19 19:02:56,2018-02-21 07:15:09,2018-02-23 23:09:52,2018-03-05 21:07:52,2018-03-12 00:00:00
79212,b73c36e570877e647fc56872cfbb3fa6,183df1decda4a848844388d117f8605c,delivered,2018-04-19 10:05:04,2018-04-19 10:34:22,2018-04-20 20:26:37,2018-04-23 18:32:59,2018-05-07 00:00:00
22380,f7614e634874a2b92d65dda076f0bab5,2067794c960bc6a1705e94651086144d,delivered,2018-04-09 15:04:30,2018-04-09 15:30:24,2018-04-10 18:20:42,2018-04-19 00:52:42,2018-04-25 00:00:00
96254,7c9454e9734bcf938b5b50af50269688,0c95ec410545d4fde2108fc69a4391cc,delivered,2018-04-22 11:49:37,2018-04-24 19:07:31,2018-04-24 23:41:23,2018-04-25 15:48:36,2018-05-08 00:00:00
74158,8712b255e65bd40b073d43478f7d24d1,81f8204c7b60cd52de8b41af67772d4b,delivered,2018-08-20 19:29:57,2018-08-21 09:35:16,2018-08-21 14:32:00,2018-08-22 22:22:21,2018-08-23 00:00:00
21575,500c038a3212b71500554116cf87552e,4f1bbce28fa7e3c6ba327eb99bea5b62,delivered,2017-11-29 18:09:50,2017-11-29 18:37:20,2017-12-01 17:27:43,2017-12-19 21:16:59,2017-12-27 00:00:00
16497,82f4132e89e21a9294ff6bee8e6a9a3a,3dbac935fb077003b3cd7b6e9ae9ecb7,delivered,2017-04-01 11:23:53,2017-04-04 05:45:13,2017-04-04 13:14:38,2017-05-07 10:27:33,2017-05-12 00:00:00
81744,afc0208aeafed672198f15d508bb0760,9f65cd05b72a07d989c5d1761c5cd9ca,delivered,2018-02-21 13:36:46,2018-02-21 13:51:10,2018-02-22 23:03:40,2018-03-08 14:22:51,2018-03-19 00:00:00
50510,a3ee95a310b51d1715ed8c16b7cd3ffe,67b0294ba64d7074b39bab09cb5a289f,delivered,2018-01-14 09:39:51,2018-01-16 14:11:45,2018-01-17 15:38:43,2018-01-30 22:04:49,2018-02-16 00:00:00
97288,96b313e094ee31cef7c8a4ffd3b36395,6b6dfbbc6ce8a28d6551efb5b39f74b4,delivered,2017-06-09 14:31:39,2017-06-09 14:42:22,2017-06-13 14:28:15,2017-06-22 16:42:45,2017-07-10 00:00:00


In [29]:
df_orders.info()
df_orders.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB


order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [30]:
df_orders.duplicated().sum()

np.int64(0)

In [31]:
df_orders.drop(columns=['order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date'], inplace=True)

- df_order_payments doesn't have duplicate entrys and Nulls on relevant columns.
- dropping columns: order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date

### Dataframe: Products

In [33]:
df_products.sample(10)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
26763,bd336707deb508edc61ee62f22706cc4,papelaria,58.0,179.0,2.0,345.0,41.0,3.0,33.0
13191,f67ad6b3366ae186b8d29087e6d5ebd0,automotivo,55.0,998.0,2.0,100.0,17.0,4.0,17.0
11151,8448ca0891d431bffa57301b7dd2ad04,relogios_presentes,47.0,760.0,2.0,300.0,16.0,13.0,13.0
29796,6fba9843ddfdebe33d2625afc8001eb7,esporte_lazer,36.0,258.0,2.0,800.0,28.0,6.0,26.0
29591,0b0c04563408d8d1187cdbd6292a89fa,fashion_bolsas_e_acessorios,34.0,222.0,2.0,3400.0,20.0,50.0,40.0
16192,28f61ad35fb219e9debd750a73b63985,informatica_acessorios,55.0,191.0,2.0,100.0,40.0,2.0,24.0
2440,1ebd1450a2b8d51ffda696956426024a,cama_mesa_banho,45.0,102.0,1.0,300.0,26.0,3.0,21.0
14151,83f92096793dec6fa40579e63590c90e,papelaria,42.0,106.0,2.0,300.0,24.0,7.0,14.0
29288,97df437aeb3472b9d42b4c2e9a9d88f6,brinquedos,17.0,890.0,1.0,2600.0,35.0,25.0,35.0
21860,aa864f45a26bbc494d37ae12b0065222,automotivo,58.0,955.0,4.0,50.0,16.0,7.0,11.0


In [34]:
df_products.info()
df_products.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [35]:
df_products.duplicated().sum()

np.int64(0)

In [36]:

df_products.drop(columns= ['product_name_lenght','product_description_lenght',
    'product_photos_qty','product_weight_g','product_length_cm','product_height_cm',
    'product_width_cm'], inplace=True)

In [38]:
df_products['product_category_name'] = df_products['product_category_name'].fillna('sem categoria')

- df_products doesn't have duplicate entrys.
- Null cells in product_category_name filled with placeholder 'sem categoria'
- dropping columns: 'product_name_lenght','product_description_lenght','product_photos_qty','product_weight_g','product_length_cm','product_height_cm','product_width_cm'

### Dataframe: Sellers

In [40]:
df_sellers.sample(10)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
2635,080102cd0a76b09e0dcf55fcacc60e05,31140,belo horizonte,MG
487,5bc24d989e71e93c33e50a7782431b0e,37165,campo do meio,MG
2164,51ed72984a04e5dd35c2b27e77179038,15050,sao jose do rio preto,SP
2840,253c025bab43881055aeecde5e2d8b77,80620,curitiba,PR
2489,12c2ed9cded6138cf0c75f525f565fc7,11709,praia grande,SP
2241,edb1ef5e36e0c8cd84eb3c9b003e486d,25957,teresopolis,RJ
2307,c64a2aec32cc408a8a4c6d7c46017f91,95625,imbe,RS
685,c03121937e54a93fcc1825c3098bbb6e,82800,curitiba,PR
1297,7813ea3ebd9411dc600b4dd01f7a97b9,6018,osasco,SP
285,a3a38f4affed601eb87a97788c949667,89204,joinville,SC


In [41]:
df_sellers.info()
df_sellers.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB


seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [42]:
df_sellers.duplicated().sum()

np.int64(0)

- df_sellers doesn't have duplicate entrys, no columns to drop at first and no Nulls

## Data Conversion and normalization

In [187]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32951 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


- Normalizing to a readable entries

In [188]:
df_products['product_category_name'] = (
    df_products['product_category_name']
    .str.replace('_', ' ')      
    .str.title()                
    .str.strip()    
)

In [190]:
sorted(df_products['product_category_name'].dropna().unique())

['Agro Industria E Comercio',
 'Alimentos',
 'Alimentos Bebidas',
 'Artes',
 'Artes E Artesanato',
 'Artigos De Festas',
 'Artigos De Natal',
 'Audio',
 'Automotivo',
 'Bebes',
 'Bebidas',
 'Beleza Saude',
 'Brinquedos',
 'Cama Mesa Banho',
 'Casa Conforto',
 'Casa Conforto 2',
 'Casa Construcao',
 'Cds Dvds Musicais',
 'Cine Foto',
 'Climatizacao',
 'Consoles Games',
 'Construcao Ferramentas Construcao',
 'Construcao Ferramentas Ferramentas',
 'Construcao Ferramentas Iluminacao',
 'Construcao Ferramentas Jardim',
 'Construcao Ferramentas Seguranca',
 'Cool Stuff',
 'Dvds Blu Ray',
 'Eletrodomesticos',
 'Eletrodomesticos 2',
 'Eletronicos',
 'Eletroportateis',
 'Esporte Lazer',
 'Fashion Bolsas E Acessorios',
 'Fashion Calcados',
 'Fashion Esporte',
 'Fashion Roupa Feminina',
 'Fashion Roupa Infanto Juvenil',
 'Fashion Roupa Masculina',
 'Fashion Underwear E Moda Praia',
 'Ferramentas Jardim',
 'Flores',
 'Fraldas Higiene',
 'Industria Comercio E Negocios',
 'Informatica Acessorios',
 

In [193]:
manual_corrections = {
    'Agro Industria E Comercio': 'Agro Indústria e Comércio',
    'Artes E Artesanato': 'Artes e Artesanato',
    'Artigos De Festas': 'Artigos de Festas',
    'Artigos De Natal': 'Artigos de Natal',
    'Beleza Saude':'Beleza e Saude',
    'Cama Mesa Banho': 'Cama Mesa e Banho',
    'Casa Conforto': 'Casa e Conforto',
    'Casa Conforto 2': 'Casa e Conforto',
    'Casa Construcao': 'Casa e Construção',
    'Cds Dvds Musicais': 'Cds Dvds e Musicais',
    'Cine Foto': 'Cine e Foto',
    'Climatizacao': 'Climatização',
    'Consoles Games': 'Consoles e Games',
    'Construcao Ferramentas Construcao': 'Artigos de Construção',
    'Construcao Ferramentas Ferramentas': 'Artigos de Construção',
    'Construcao Ferramentas Iluminacao': 'Artigos de Construção',
    'Construcao Ferramentas Jardim': 'Artigos de Construção',
    'Construcao Ferramentas Seguranca': 'Artigos de Construção',
    'Dvds Blu Ray': 'Dvds/Blu-Ray',
    'Eletrodomesticos': 'Eletrodomésticos',
    'Eletrodomesticos 2': 'Eletrodomésticos',
    'Eletronicos': 'Eletrônicos',
    'Eletroportateis': 'Eletroportáteis',
    'Esporte Lazer': 'Esporte e Lazer',
    'Fashion Bolsas E Acessorios': 'Bolsas e Acessórios',
    'Fashion Calcados':'Calçados',
    'Fashion Esporte': 'Moda Esportiva',
    'Fashion Roupa Feminina': 'Roupa Feminina',
    'Fashion Roupa Infanto Juvenil': 'Infanto Juvenil',
    'Fashion Roupa Masculina': 'Roupa Masculina',
    'Fashion Underwear E Moda Praia': 'Underwear e Moda Praia',
    'Ferramentas Jardim': 'Ferramentas de Jardinagem',
    'Fraldas Higiene': 'Fraldas e Higiene',
    'Industria Comercio E Negocios': 'Indústria Comércio e Negócios',
    'Informatica Acessorios': 'Acessórios de Informática',
    'Livros Importados': 'Livros',
    'Livros Interesse Geral': 'Livros',
    'Livros Tecnicos': 'Livros',
    'Malas Acessorios': 'Malas e Acessórios',
    'Moveis Colchao E Estofado': 'Móveis',
    'Moveis Cozinha Area De Servico Jantar E Jardim': 'Móveis',
    'Moveis Decoracao': 'Móveis',
    'Moveis Escritorio': 'Móveis',
    'Moveis Quarto': 'Móveis',
    'Moveis Sala': 'Móveis',
    'Musica': 'Música',
    'Portateis Casa Forno E Cafe': 'Casa Forno e Café',
    'Portateis Cozinha E Preparadores De Alimentos': 'Cozinha e Preparadores de Alimentos',
    'Relogios Presentes': 'Relógios e Presentes',
    'Seguros E Servicos':'Seguros e Serviços',
    'Sinalizacao E Seguranca': 'Sinalização e Segurança',
    'Tablets Impressao Imagem': 'Tablets Impressão e Imagem',
    'Utilidades Domesticas': 'Utilidades Domésticas'
}
df_products['product_category_name'] = df_products['product_category_name'].replace(manual_corrections)

In [48]:
df_orders['order_purchase_timestamp'] = pd.to_datetime(df_orders['order_purchase_timestamp'])
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   order_id                  99441 non-null  object        
 1   customer_id               99441 non-null  object        
 2   order_status              99441 non-null  object        
 3   order_purchase_timestamp  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 3.0+ MB


In [52]:
df_order_items.info() #ok
df_order_payments.info() #ok


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  int64  
 2   product_id     112650 non-null  object 
 3   seller_id      112650 non-null  object 
 4   price          112650 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       103886 non-null  object 
 1   payment_value  103886 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.6+ MB


In [49]:
df_customers.info()
df_sellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8

- Cities in df_customers and df_sellers has some strange entries
- Creating a brazilian city dictonary for comparision

In [55]:
cdf_cities['city_norm'] = (
    cdf_cities['city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)

city_dict = dict(zip(cdf_cities['city_norm'], cdf_cities['city']))

- Normalizing the cities columns from df_sellers
- Creating a new column with the clean city names
- Filling with exact matches at first

In [64]:
df_sellers['seller_city'] = (
    df_sellers['seller_city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)
df_customers['customer_city'] = (
    df_customers['customer_city']
    .str.strip()
    .str.lower()
    .apply(lambda x: unidecode.unidecode(x) if isinstance(x, str) else x)
)

df_sellers['seller_city_clean'] = df_sellers['seller_city'].map(city_dict)
df_customers['customer_city_clean'] = df_customers['customer_city'].map(city_dict)

- Counting the Nulls.

In [74]:
df_sellers['seller_city_clean'].isna().sum() #74
df_customers['customer_city_clean'].isna().sum() #654


np.int64(74)

### City normalization and validation - df_sellers

In [None]:
corrections = {}

for city in df_sellers[df_sellers['seller_city_clean'].isna()]['seller_city'].unique():
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 94:
        corrections[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]


corrections

{'balenario camboriu': 'Balneário Camboriú',
 'ferraz de  vasconcelos': 'Ferraz de Vasconcelos',
 'sao pauo': 'São Paulo',
 'cascavael': 'Cascavel',
 'santa barbara d oeste': "Santa Bárbara d'Oeste",
 'floranopolis': 'Florianópolis',
 'sao  jose dos pinhais': 'São José dos Pinhais',
 'brasilia df': 'Brasília',
 's jose do rio preto': 'São José do Rio Preto',
 'juzeiro do norte': 'Juazeiro do Norte',
 'sao bernardo do capo': 'São Bernardo do Campo',
 'mogi das cruzes / sp': 'Mogi das Cruzes',
 'sao jose dos pinhas': 'São José dos Pinhais',
 'paincandu': 'Paiçandu',
 'portoferreira': 'Porto Ferreira',
 'belo horizont': 'Belo Horizonte',
 'sao paulo sp': 'São Paulo',
 'angra dos reis rj': 'Angra dos Reis',
 'sao  paulo': 'São Paulo',
 'ao bernardo do campo': 'São Bernardo do Campo',
 'garulhos': 'Guarulhos',
 'sao jose do rio pret': 'São José do Rio Preto',
 'sao paulop': 'São Paulo',
 'scao jose do rio pardo': 'São José do Rio Pardo',
 'tabao da serra': 'Taboão da Serra',
 'riberao preto

In [114]:
df_sellers.loc[
    df_sellers['seller_city_clean'].isna(),'seller_city_clean'] = df_sellers.loc[df_sellers['seller_city_clean'].isna(), 'seller_city'].map(corrections)

In [None]:
df_sellers['seller_city_clean'].isna().sum() #48


np.int64(48)

In [122]:
corrections = {}

for city in df_sellers[df_sellers['seller_city_clean'].isna()]['seller_city'].unique():
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 90 and city != 'picarras':
        corrections[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]


corrections

{"sao miguel d'oeste": 'São Miguel do Oeste',
 'mogi das cruses': 'Mogi das Cruzes',
 'sando andre': 'Santo André',
 'ribeirao pretp': 'Ribeirão Preto',
 'sao sebastiao da grama/sp': 'São Sebastião da Grama',
 'robeirao preto': 'Ribeirão Preto'}

In [None]:
df_sellers.loc[
    df_sellers['seller_city_clean'].isna(),'seller_city_clean'] = df_sellers.loc[df_sellers['seller_city_clean'].isna(), 'seller_city'].map(corrections)
df_sellers['seller_city_clean'].isna().sum() #48

np.int64(42)

In [129]:
missmatches =['picarras','bahia','santa catarina','sp','castro pires',
'ribeirao preto / sao paulo','embu guacu','ji parana','carapicuiba / sao paulo','gama','pirituba','minas gerais','centro','vicente de carvalho']
corrections = {}

for city in df_sellers[df_sellers['seller_city_clean'].isna()]['seller_city'].unique():
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 85 and city not in missmatches:
        corrections[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]


corrections

{'lages - sc': 'Lages',
 'auriflama/sp': 'Auriflama',
 'sao paulo / sao paulo': 'São Paulo',
 'novo hamburgo, rio grande do sul, brasil': 'Novo Hamburgo',
 'cariacica / es': 'Cariacica',
 'sao paulo - sp': 'São Paulo',
 "arraial d'ajuda (porto seguro)": 'Porto Seguro',
 'santo andre/sao paulo': 'Santo André',
 'maua/sao paulo': 'Mauá',
 'rio de janeiro \\rio de janeiro': 'Rio de Janeiro',
 'barbacena/ minas gerais': 'Barbacena',
 'andira-pr': 'Andirá',
 'rio de janeiro / rio de janeiro': 'Rio de Janeiro',
 'sao paluo': 'São Paulo',
 'rio de janeiro, rio de janeiro, brasil': 'Rio de Janeiro',
 'jacarei / sao paulo': 'Jacareí'}

In [None]:
df_sellers.loc[
    df_sellers['seller_city_clean'].isna(),'seller_city_clean'] = df_sellers.loc[df_sellers['seller_city_clean'].isna(), 'seller_city'].map(corrections)
df_sellers['seller_city_clean'].isna().sum() #24

np.int64(24)

In [132]:
df_sellers[df_sellers['seller_city_clean'].isna()]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_city_clean
287,9b013e03b2ab786505a1d3b5c0756754,11450,vicente de carvalho,SP,
405,4221a7df464f1fe2955934e30ff3a5a1,48602,bahia,BA,
517,ceb7b4fb9401cd378de7886317ad1b47,22790,04482255,RJ,
869,cbf09e831b0c11f6f23ffb51004db972,9726,sbc/sp,SP,
1004,1cbd32d00d01bb8087a5eb088612fd9c,3363,sp / sp,SP,
1050,ba8e85df286308fe68a2ce83a8f8b2d6,88135,santa catarina,SC,
1117,16bdc8cefd0e32a6f0824d296c5ad14a,12903,sp,SP,
1246,2156f2671501a81034d7d07f217609d0,4776,sp,SP,
1712,6025c79c035c3d772133b8b8238463b2,83327,pinhais/pr,PR,
1755,2c538755f1ca9540af144f266e70df6c,39801,castro pires,MG,


In [None]:
df_sellers.loc[df_sellers['seller_city'] == 'sp','seller_city_clean'] = 'São Paulo'
df_sellers.loc[df_sellers['seller_city'] == 'sp/sp','seller_city_clean'] = 'São Paulo'
df_sellers.loc[df_sellers['seller_city'] == 'sp / sp','seller_city_clean'] = 'São Paulo'
df_sellers.loc[df_sellers['seller_city'] == 'sbc/sp','seller_city_clean'] = 'São Bernardo do Campo'
df_sellers.loc[df_sellers['seller_city'] == 'sbc','seller_city_clean'] = 'São Bernardo do Campo'

In [139]:
df_sellers[df_sellers['seller_city_clean'].isna()]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_city_clean
287,9b013e03b2ab786505a1d3b5c0756754,11450,vicente de carvalho,SP,
405,4221a7df464f1fe2955934e30ff3a5a1,48602,bahia,BA,
517,ceb7b4fb9401cd378de7886317ad1b47,22790,04482255,RJ,
1050,ba8e85df286308fe68a2ce83a8f8b2d6,88135,santa catarina,SC,
1712,6025c79c035c3d772133b8b8238463b2,83327,pinhais/pr,PR,
1755,2c538755f1ca9540af144f266e70df6c,39801,castro pires,MG,
1920,01fd077212124329bac32490e8ef80d9,14079,ribeirao preto / sao paulo,SP,
2006,eb5b6204dde3a6e03b5f1bb8479f055b,6900,embu guacu,SP,
2122,a5259c149128e82c9d6d46e0c1c812bb,76900,ji parana,RO,
2162,a1bea7061f61f6fdd9a85a6325ba1033,6311,carapicuiba / sao paulo,SP,


In [None]:
df_sellers.loc[
    df_sellers['seller_city_clean'].isna(),'seller_city_clean'] = df_sellers.loc[
        df_sellers['seller_city_clean'].isna(), 'seller_zip_code_prefix'].apply(
            lambda cep: consulta_cep(str(int(cep)))['localidade'] if pd.notna(cep) else None
)
df_sellers[df_sellers['seller_city_clean'].isna()]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_city_clean
405,4221a7df464f1fe2955934e30ff3a5a1,48602,bahia,BA,
1755,2c538755f1ca9540af144f266e70df6c,39801,castro pires,MG,
2006,eb5b6204dde3a6e03b5f1bb8479f055b,6900,embu guacu,SP,
2122,a5259c149128e82c9d6d46e0c1c812bb,76900,ji parana,RO,
2183,d5c530f4884a75ae0dba9c148718d278,35660,centro,MG,
2589,3a52d63a8f9daf5a28f3626d7eb9bd28,71900,aguas claras df,SP,


In [None]:
df_sellers.loc[405,'seller_city_clean'] = '	Paulo Afonso'
df_sellers.loc[1755,'seller_city_clean'] = 'Teófilo Otoni'
df_sellers.loc[2006,'seller_city_clean'] = 'Embu-Guaçu'
df_sellers.loc[2122,'seller_city_clean'] = 'Ji-Paraná'
df_sellers.loc[2183,'seller_city_clean'] = 'Pará de Minas'
df_sellers.loc[2589,'seller_city_clean'] = 'Brasília'
df_sellers.loc[2589,'seller_state'] = 'DF' #The state was wrong

df_sellers[df_sellers['seller_city_clean'].isna()]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_city_clean


In [None]:

df_sellers['seller_city'] = df_sellers['seller_city_clean']
df_sellers.drop(columns=['seller_city_clean'], inplace=True)

### City normalization and validation - df_customers

In [176]:
#df_customers[df_customers['customer_city_clean'].isna()]['customer_city'].unique()

corrections = {}

for city in df_customers[df_customers['customer_city_clean'].isna()]['customer_city'].unique():
    match, score, _ = process.extractOne(city, cdf_cities['city_norm'])
    if score > 94:
        corrections[city] = cdf_cities.loc[cdf_cities['city_norm'] == match, 'city'].iloc[0]

corrections


{'nossa senhora do remedio': 'Nossa Senhora dos Remédios',
 'santa isabel do para': 'Santa Izabel do Pará',
 'sao thome das letras': 'São Tomé das Letras',
 'santana do livramento': "Sant'Ana do Livramento",
 'araguaia': 'Araguaína',
 'palmeirinha': 'Palmeirina',
 'santa rita do ibitipoca': 'Santa Rita de Ibitipoca',
 'couto de magalhaes': 'Couto Magalhães',
 'holambra ii': 'Holambra',
 'monte verde': 'Nova Monte Verde',
 'sao luis do paraitinga': 'São Luiz do Paraitinga',
 'alexandrita': 'Alexandria',
 'santa barbara d oeste': "Santa Bárbara d'Oeste",
 'alexandra': 'Alexandria',
 'graccho cardoso': 'Gracho Cardoso',
 'vitorinos': 'Vitorino',
 "olhos d'agua": "Olho d'Água"}

In [177]:
df_customers.loc[
    df_customers['customer_city_clean'].isna(),'customer_city_clean'] = df_customers.loc[df_customers['customer_city_clean'].isna(), 'customer_city'].map(corrections)

df_customers['customer_city_clean'].isna().sum() 

np.int64(596)

- Using a local dataset of ZIP code to check products, due the high number of entries

In [None]:
df_customers['cep_expandido'] = df_customers['customer_zip_code_prefix'] * 1000
def buscar_cidade_por_cep(cep, cdf):
    row = cdf[(cdf['CEP Inicial'] <= cep) & (cdf['CEP Final'] >= cep)]
    if not row.empty:
        return row.iloc[0]['Localidade']
    return None

df_customers.loc[df_customers['customer_city_clean'].isna(), 'customer_city_clean'] = \
    df_customers.loc[df_customers['customer_city_clean'].isna(), 'cep_expandido'].apply(
        lambda cep: buscar_cidade_por_cep(cep, cdf_ceps)
    )

df_customers[df_customers['customer_city_clean'].isna()]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_city_clean,cep_expandido
706,22a3b0e62211d8e0e28e374ce3cb6041,3f9d987156fddc08ef77f52d1a15c073,13840,mogi-guacu,SP,,13840000
2079,7c029959cc691fbe5dba3ce8eee98d14,7582c4e0a81c88f5ea0ead98b083b876,13840,mogi-guacu,SP,,13840000
22204,d3c7d2b32f278d3699a3e7376f85155b,ad27629d4a95f815f18cd46f1844e1ca,13840,mogi-guacu,SP,,13840000
24338,9f5e69f7ce7e79aed191386f8104163c,0c2af34ba2bedcc4360ecec5c55affe8,13840,mogi-guacu,SP,,13840000
27190,1a6cb0e076784a5e73557aa5defd660a,0d062c3c954a9f461c2be3e19cb99ae2,13840,mogi-guacu,SP,,13840000
48964,20a91cc566729229f1ab38edb706469a,fa6eee120fe649146270cc68a4034d42,13840,mogi-guacu,SP,,13840000
85433,afb2a83dca79aa34aa11d9e0843382f1,695dee4c200e4cabdfdb01287f5fd29d,13840,mogi-guacu,SP,,13840000
85917,a4fdabd3329523ce20c34c680ec39839,178330f0a0bcf3f47c935efa2a1973c7,13840,mogi-guacu,SP,,13840000
89407,92e61855ad557bd19a438abcb81b3c9c,9625bdf299ab055652667ac9429ffffe,13840,mogi-guacu,SP,,13840000
91094,f4db8db2617de3def6b82bc375ecb725,3f10448ee7afd7fe6bf60e78cb53e776,13840,mogi-guacu,SP,,13840000


- Only one city didn't was found.

In [None]:
df_customers['customer_city_clean'] = df_customers['customer_city_clean'].fillna('Mogi Guaçu')
df_customers[df_customers['customer_city_clean'].isna()]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_city_clean,cep_expandido


In [186]:
df_customers['customer_city'] = df_customers['customer_city_clean'] 
df_customers.drop(columns=['customer_city_clean','cep_expandido'], inplace=True)

## Derived Data

- Creating dataframes dfp_order_full by:
> - Grouping all purchased items by its orders id and summing the prices.
> - Merging df_order with its respectives values spent

In [None]:
df_order_items_grouped = df_order_items.groupby('order_id')['price'].sum().reset_index()
df_order_items_grouped.rename(columns={'price': 'total_order_value'}, inplace=True)

dfp_orders_full = df_orders.merge(df_order_items_grouped, on='order_id')
dfp_orders_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98666 entries, 0 to 98665
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   order_id                  98666 non-null  object        
 1   customer_id               98666 non-null  object        
 2   order_status              98666 non-null  object        
 3   order_purchase_timestamp  98666 non-null  datetime64[ns]
 4   total_order_value         98666 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 3.8+ MB


- Creating a column to day, month and year to investigate the seasonality purchase behavior 

In [None]:
dfp_orders_full['order_month'] = dfp_orders_full['order_purchase_timestamp'].dt.month
dfp_orders_full['order_day'] = dfp_orders_full['order_purchase_timestamp'].dt.day
dfp_orders_full['order_year'] = dfp_orders_full['order_purchase_timestamp'].dt.year
dfp_orders_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98666 entries, 0 to 98665
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   order_id                  98666 non-null  object        
 1   customer_id               98666 non-null  object        
 2   order_status              98666 non-null  object        
 3   order_purchase_timestamp  98666 non-null  datetime64[ns]
 4   total_order_value         98666 non-null  float64       
 5   order_month               98666 non-null  int32         
 6   order_day                 98666 non-null  int32         
 7   order_year                98666 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(3), object(3)
memory usage: 4.9+ MB


- Merging the the customer dataframe in order to identify each buy with their unique id

In [None]:
dfp_orders_full = dfp_orders_full.merge(
    df_customers[['customer_id','customer_unique_id']],
    on='customer_id',
    how='left'
)
dfp_orders_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98666 entries, 0 to 98665
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   order_id                  98666 non-null  object        
 1   customer_id               98666 non-null  object        
 2   order_status              98666 non-null  object        
 3   order_purchase_timestamp  98666 non-null  datetime64[ns]
 4   total_order_value         98666 non-null  float64       
 5   order_month               98666 non-null  int32         
 6   order_day                 98666 non-null  int32         
 7   order_year                98666 non-null  int32         
 8   customer_unique_id        98666 non-null  object        
dtypes: datetime64[ns](1), float64(1), int32(3), object(4)
memory usage: 5.6+ MB


- Filtering the orders by just the delivered ones

In [220]:
dfp_orders_full = dfp_orders_full.loc[dfp_orders_full['order_status'] == 'delivered']

- Creating the processed dataframe with the client tickets

In [221]:
dfp_client_ticket = dfp_orders_full.groupby('customer_unique_id')['total_order_value'].agg(['sum','mean','count']).reset_index()
dfp_client_ticket.rename(columns={'sum':'total_spent','mean':'avg_ticket','count':'order_count'}, inplace=True)


- Creating the processed dataframe with category sales

In [222]:
dfp_category_sales = df_order_items.merge(df_products[['product_id','product_category_name']], on='product_id')
dfp_category_sales = dfp_category_sales.groupby('product_category_name')['price'].agg(['sum','count']).reset_index()
dfp_category_sales.rename(columns={'sum':'total_sales','count':'units_sold'}, inplace=True)

In [None]:
dfp_orders_full = dfp_orders_full.merge(df_customers[['customer_id','customer_state','customer_city']], on='customer_id')
dfp_orders_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96478 entries, 0 to 96477
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   order_id                  96478 non-null  object        
 1   customer_id               96478 non-null  object        
 2   order_status              96478 non-null  object        
 3   order_purchase_timestamp  96478 non-null  datetime64[ns]
 4   total_order_value         96478 non-null  float64       
 5   order_month               96478 non-null  int32         
 6   order_day                 96478 non-null  int32         
 7   order_year                96478 non-null  int32         
 8   customer_unique_id        96478 non-null  object        
 9   customer_state            96478 non-null  object        
 10  customer_city             96478 non-null  object        
dtypes: datetime64[ns](1), float64(1), int32(3), object(6)
memory usage: 7.0+ MB


- Generating dataframes with sales per city and per states

In [225]:
dfp_state_sales = dfp_orders_full.groupby('customer_state')['total_order_value'].sum().reset_index()
dfp_city_sales = dfp_orders_full.groupby('customer_city')['total_order_value'].sum().reset_index()

- Generating the dataframe that maps the customer quantity per city and states

In [227]:
dfp_clients_per_state = df_customers.groupby('customer_state')['customer_unique_id'].nunique().reset_index()
dfp_clients_per_state.rename(columns={'customer_unique_id':'num_clients'}, inplace=True)


dfp_clients_per_city = df_customers.groupby('customer_city')['customer_unique_id'].nunique().reset_index()
dfp_clients_per_city.rename(columns={'customer_unique_id':'num_clients'}, inplace=True)

In [230]:
dfp_seller_revenue = df_order_items.groupby('seller_id')['price'].sum().reset_index()
dfp_seller_revenue.rename(columns={'price':'revenue_per_seller'}, inplace=True)

dfp_seller_revenue = dfp_seller_revenue.merge(
    df_sellers[['seller_id','seller_city','seller_state']],
    on='seller_id',
    how='left'
)

dfp_seller_revenue = dfp_seller_revenue.sort_values(by='revenue_per_seller', ascending=False)

## Export

- Checking for nulls on processed data frames

In [242]:
dfp_category_sales.isnull().sum()
dfp_city_sales.isnull().sum()
dfp_state_sales.isnull().sum()
dfp_client_ticket.isnull().sum()
dfp_clients_per_city.isnull().sum()
dfp_clients_per_state.isnull().sum()
dfp_orders_full.isnull().sum()
dfp_seller_revenue.isnull().sum()

seller_id             0
revenue_per_seller    0
seller_city           0
seller_state          0
dtype: int64

In [244]:
dfp_category_sales.to_csv("../data/processed/category_sales.csv", index=False)
dfp_city_sales.to_csv("../data/processed/city_sales.csv", index=False)
dfp_state_sales.to_csv("../data/processed/state_sales.csv", index=False)
dfp_client_ticket.to_csv("../data/processed/client_ticket.csv", index=False)
dfp_clients_per_city.to_csv("../data/processed/customer_per_city.csv", index=False)
dfp_clients_per_state.to_csv("../data/processed/customer_per_state.csv", index=False)
dfp_orders_full.to_csv("../data/processed/orders.csv", index=False)
dfp_seller_revenue.to_csv("../data/processed/seller_revenue.csv", index=False)
