> ### Importando packages

In [1]:
# import sys
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install pandera
# !{sys.executable} -m pip install beeprint

In [2]:
import pandas as pd
import pandera as pa # Statistical Data Validation Toolkit for Pandas
from beeprint import pp as bee # Debug printing more friendly

> ### Lendo orders_dataset.csv

In [3]:
# Load the raw orders CSV and rename its columns to Portuguese.
df_pedidos = pd.read_csv(
    '~/Jupyter/notebook/projects/case_enfase/data/orders_dataset.csv'
).rename(
    columns={
        'order_id': 'id_pedido',
        'customer_id': 'id_cliente',
        'order_status': 'status_pedido',
        'order_purchase_timestamp': 'data_hora_compra',
        'order_approved_at': 'data_hora_aprovacao',
        'order_delivered_carrier_date': 'data_hora_envio',
        'order_delivered_customer_date': 'data_hora_entrega',
        'order_estimated_delivery_date': 'data_hora_previsao_entrega',
    }
)

> ### Buscando inconsistências orders_dataset.csv

In [4]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df_pedidos.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id_pedido                   99441 non-null  object
 1   id_cliente                  99441 non-null  object
 2   status_pedido               99441 non-null  object
 3   data_hora_compra            99441 non-null  object
 4   data_hora_aprovacao         99281 non-null  object
 5   data_hora_envio             97658 non-null  object
 6   data_hora_entrega           96476 non-null  object
 7   data_hora_previsao_entrega  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB


In [5]:
df_pedidos.drop_duplicates(inplace=True) 
# Drops rows that are duplicated across ALL columns (no `subset` is given), so this
# does not by itself deduplicate on id_pedido; id uniqueness is asserted in a later cell.

In [6]:
# Re-inspect the frame after drop_duplicates to see whether any rows were removed.
df_pedidos.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id_pedido                   99441 non-null  object
 1   id_cliente                  99441 non-null  object
 2   status_pedido               99441 non-null  object
 3   data_hora_compra            99441 non-null  object
 4   data_hora_aprovacao         99281 non-null  object
 5   data_hora_envio             97658 non-null  object
 6   data_hora_entrega           96476 non-null  object
 7   data_hora_previsao_entrega  99441 non-null  object
dtypes: object(8)
memory usage: 6.8+ MB


In [7]:
# Every non-null order id must be distinct: distinct count equals non-null count.
assert df_pedidos['id_pedido'].nunique() == df_pedidos['id_pedido'].count()

In [8]:
df_pedidos.isna().any() # per column: True if it contains at least one missing value

id_pedido                     False
id_cliente                    False
status_pedido                 False
data_hora_compra              False
data_hora_aprovacao            True
data_hora_envio                True
data_hora_entrega              True
data_hora_previsao_entrega    False
dtype: bool

> ### Trabalhando a tipagem dos dados orders_dataset.csv

In [9]:
# Show the dtype of each column before any type conversion is applied.
df_pedidos.dtypes

id_pedido                     object
id_cliente                    object
status_pedido                 object
data_hora_compra              object
data_hora_aprovacao           object
data_hora_envio               object
data_hora_entrega             object
data_hora_previsao_entrega    object
dtype: object

In [10]:
df_pedidos.status_pedido.unique() # list the distinct order status values present

array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)

In [11]:
# Keep only delivered orders that have every field filled in.
# The status is filtered explicitly so that an order in any other status
# (e.g. 'shipped', 'invoiced') cannot slip through merely because all of its
# timestamps happen to be present.
df_pedidos = (
    df_pedidos
    .loc[df_pedidos['status_pedido'].eq('delivered')]
    .replace('', pd.NA)  # treat empty strings as missing values
    .dropna()            # drop rows with any missing value
    .copy()              # own the data so later column assignments do not warn
)
df_pedidos.isna().any()  # sanity check: every column should now report False

id_pedido                     False
id_cliente                    False
status_pedido                 False
data_hora_compra              False
data_hora_aprovacao           False
data_hora_envio               False
data_hora_entrega             False
data_hora_previsao_entrega    False
dtype: bool

In [12]:
# Re-inspect row count and non-null counts after the filtering step.
df_pedidos.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96455 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id_pedido                   96455 non-null  object
 1   id_cliente                  96455 non-null  object
 2   status_pedido               96455 non-null  object
 3   data_hora_compra            96455 non-null  object
 4   data_hora_aprovacao         96455 non-null  object
 5   data_hora_envio             96455 non-null  object
 6   data_hora_entrega           96455 non-null  object
 7   data_hora_previsao_entrega  96455 non-null  object
dtypes: object(8)
memory usage: 6.6+ MB


In [13]:
# List the order status values that remain after filtering.
df_pedidos.status_pedido.unique()

array(['delivered'], dtype=object)

In [14]:
# Parse every timestamp column from string to datetime using one shared format.
for coluna_data in (
    'data_hora_compra',
    'data_hora_aprovacao',
    'data_hora_entrega',
    'data_hora_envio',
    'data_hora_previsao_entrega',
):
    df_pedidos[coluna_data] = pd.to_datetime(
        df_pedidos[coluna_data], format='%Y-%m-%d %H:%M:%S'
    )

In [15]:
# Build the pandera validation schema for the cleaned orders frame.
# Every column is mandatory and must not contain nulls.
_obrigatoria = {"nullable": False, "required": True}
_colunas_data = (
    "data_hora_compra",
    "data_hora_aprovacao",
    "data_hora_entrega",
    "data_hora_envio",
    "data_hora_previsao_entrega",
)
schema_pedidos = pa.DataFrameSchema(
    columns={
        "id_pedido": pa.Column(pa.String, **_obrigatoria),
        "id_cliente": pa.Column(pa.String, **_obrigatoria),
        # only the 'delivered' status is accepted
        "status_pedido": pa.Column(
            pa.String, checks=pa.Check.isin(['delivered']), **_obrigatoria
        ),
        # one datetime column per timestamp field, in the listed order
        **{nome: pa.Column(pa.DateTime, **_obrigatoria) for nome in _colunas_data},
    },
)

In [16]:
schema_pedidos.validate(df_pedidos) # validate df_pedidos against schema_pedidos

Unnamed: 0,id_pedido,id_cliente,status_pedido,data_hora_compra,data_hora_aprovacao,data_hora_envio,data_hora_entrega,data_hora_previsao_entrega
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26
...,...,...,...,...,...,...,...,...
99436,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28
99437,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02
99438,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27
99439,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15


> ### Exportando para clean_orders_dataset.csv

In [17]:
# Write the cleaned orders to CSV in the same data directory as the raw file,
# with a header row and without the index column.
df_pedidos.to_csv(
    r'~/Jupyter/notebook/projects/case_enfase/data/clean_orders_dataset.csv',
    header=True,
    index=False,
)