> ### Importando pacotes

In [1]:
# Optional one-time setup: uncomment the lines below to install the notebook's
# dependencies using the same Python interpreter the kernel is running on
# (sys.executable), so the packages land in the kernel's environment.
# import sys
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install pandera
# !{sys.executable} -m pip install beeprint

In [2]:
import pandas as pd
import pandera as pa # Statistical Data Validation Toolkit for Pandas
from beeprint import pp as bee # Debug printing more friendly

> ### Lendo order_items_dataset.csv

In [3]:
# Load the raw order-items CSV and rename its columns to their Portuguese equivalents.
df_itens_pedidos = (
    pd.read_csv("~/Jupyter/notebook/projects/case_enfase/data/order_items_dataset.csv")
    .rename(
        columns={
            "order_id": "id_pedido",
            "order_item_id": "quantidade_itens",
            "product_id": "id_produto",
            "seller_id": "id_vendedor",
            "shipping_limit_date": "data_limite_envio",
            "price": "preco_unitario",
            "freight_value": "valor_frete",
        }
    )
)

> ### Buscando inconsistências em order_items_dataset.csv

In [4]:
# Print a summary of the frame: every column with its non-null count and dtype, plus memory usage.
df_itens_pedidos.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id_pedido          112650 non-null  object 
 1   quantidade_itens   112650 non-null  int64  
 2   id_produto         112650 non-null  object 
 3   id_vendedor        112650 non-null  object 
 4   data_limite_envio  112650 non-null  object 
 5   preco_unitario     112650 non-null  float64
 6   valor_frete        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB


In [5]:
# Collapse the frame to a single row per id_pedido, keeping the row with the highest
# quantidade_itens (the item sequence number) so that it represents the order's item count.
# The stable sort makes the outcome independent of the CSV's row order; among rows that tie
# on quantidade_itens the last one in the original order still wins, and sort_index()
# puts the surviving rows back in their original order.
# NOTE(review): preco_unitario and valor_frete on the kept row describe that single item
# only, not the whole order — confirm this is what downstream consumers expect.
df_itens_pedidos = (
    df_itens_pedidos
    .sort_values('quantidade_itens', kind='stable')
    .drop_duplicates(subset=['id_pedido'], keep='last')
    .sort_index()
)

In [6]:
# Each id_pedido must now appear exactly once: the distinct count has to equal the non-null count.
assert df_itens_pedidos["id_pedido"].nunique() == df_itens_pedidos["id_pedido"].count()

In [7]:
# Per-column null check: True for any column that holds at least one missing value.
df_itens_pedidos.isna().any()

id_pedido            False
quantidade_itens     False
id_produto           False
id_vendedor          False
data_limite_envio    False
preco_unitario       False
valor_frete          False
dtype: bool

> ### Trabalhando a tipagem dos dados de order_items_dataset.csv

In [8]:
# Display the dtype currently assigned to each column.
df_itens_pedidos.dtypes

id_pedido             object
quantidade_itens       int64
id_produto            object
id_vendedor           object
data_limite_envio     object
preco_unitario       float64
valor_frete          float64
dtype: object

In [9]:
# Convert data_limite_envio from text to datetime, parsing with an explicit timestamp format.
df_itens_pedidos["data_limite_envio"] = pd.to_datetime(
    df_itens_pedidos["data_limite_envio"],
    format="%Y-%m-%d %H:%M:%S",
)

In [10]:
# Pandera validation schema for the order-items frame: every column is required and
# non-nullable; the numeric columns must additionally be greater than or equal to zero.
schema_itens_pedidos = pa.DataFrameSchema(
    {
        "id_pedido": pa.Column(pa.String, required=True, nullable=False),
        "quantidade_itens": pa.Column(
            pa.Int,
            checks=pa.Check.greater_than_or_equal_to(0),
            required=True,
            nullable=False,
        ),
        "id_produto": pa.Column(pa.String, required=True, nullable=False),
        "id_vendedor": pa.Column(pa.String, required=True, nullable=False),
        "data_limite_envio": pa.Column(pa.DateTime, required=True, nullable=False),
        "preco_unitario": pa.Column(
            pa.Float,
            checks=pa.Check.greater_than_or_equal_to(0),
            required=True,
            nullable=False,
        ),
        "valor_frete": pa.Column(
            pa.Float,
            checks=pa.Check.greater_than_or_equal_to(0),
            required=True,
            nullable=False,
        ),
    }
)

In [11]:
# Run the frame through schema_itens_pedidos and preview up to the first 50 rows of the result.
schema_itens_pedidos.validate(df_itens_pedidos).head(50)

Unnamed: 0,id_pedido,quantidade_itens,id_produto,id_vendedor,data_limite_envio,preco_unitario,valor_frete
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14
5,00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23 03:55:27,21.9,12.69
6,00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14 12:10:31,19.9,11.85
7,000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10 12:30:45,810.0,70.75
8,0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26 18:31:29,145.95,11.65
9,0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06 14:10:56,53.99,11.4


> ### Exportando para clean_order_items_dataset.csv

In [12]:
# Write the cleaned frame to CSV with a header row and without the index column.
df_itens_pedidos.to_csv(
    r'~/Jupyter/notebook/projects/case_enfase/data/clean_order_items_dataset.csv',
    header=True,
    index=False,
)