In [1]:
%%time
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys
from dataclasses import dataclass
from pathlib import Path

import pandas as pd


@dataclass
class Config:
    """Directory layout of the project, rooted at ``project_dir``.

    Every directory other than ``project_dir`` defaults to ``None`` and is
    derived from its parent in ``__post_init__``. Overriding ``project_dir``
    (or ``data_dir``) therefore moves the directories beneath it as well,
    instead of leaving them tied to the default ``"../../"`` root. A value
    passed explicitly for any field is kept as given.

    With no arguments the resulting paths are:
    ``../../models``, ``../../outputs``, ``../../data``, ``../../data/raw``,
    ``../../data/interim`` and ``../../data/processed``.
    """

    # Annotations are strings so the ``| None`` spelling does not depend on
    # the running Python version; dataclasses do not evaluate them.
    project_dir: Path = Path("../../")
    model_dir: "Path | None" = None
    outputs_dir: "Path | None" = None
    data_dir: "Path | None" = None
    raw_dir: "Path | None" = None
    interim_dir: "Path | None" = None
    processed_dir: "Path | None" = None

    def __post_init__(self) -> None:
        """Fill in every directory that was not passed explicitly."""
        project_dir = Path(self.project_dir)
        if self.model_dir is None:
            self.model_dir = project_dir / "models"
        if self.outputs_dir is None:
            self.outputs_dir = project_dir / "outputs"
        if self.data_dir is None:
            self.data_dir = project_dir / "data"

        # data_dir must be settled before its children are derived from it.
        data_dir = Path(self.data_dir)
        if self.raw_dir is None:
            self.raw_dir = data_dir / "raw"
        if self.interim_dir is None:
            self.interim_dir = data_dir / "interim"
        if self.processed_dir is None:
            self.processed_dir = data_dir / "processed"


config = Config()

# Put the resolved project directory on the import search path. The membership
# check keeps the cell idempotent: re-running it no longer appends the same
# entry to sys.path again.
_project_root = str(config.project_dir.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

CPU times: user 267 ms, sys: 95 ms, total: 362 ms
Wall time: 661 ms


In [42]:
# Read the four raw order-related CSV files from config.raw_dir, in the same
# order as the names on the left-hand side.
df_orders, df_order_payments, df_order_reviews, df_order_items = (
    pd.read_csv(config.raw_dir / csv_name)
    for csv_name in (
        "olist_orders_dataset.csv",
        "olist_order_payments_dataset.csv",
        "olist_order_reviews_dataset.csv",
        "olist_order_items_dataset.csv",
    )
)

## df_order_payments

In [6]:
# Quick profile of the payments table: (rows, columns), missing values per
# column, then the first rows.
print(df_order_payments.shape)
display(
    df_order_payments.isna().sum(),
    df_order_payments.head(),
)

(103886, 5)


order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [38]:
# Count rows per (order_id, payment_sequential) pair, largest first. If the
# top value is 1, the pair uniquely identifies a payment row.
# size() counts rows regardless of nulls, and the result column is named
# "count" (as in the per-order count cell) so it is not mistaken for an
# actual payment_type value.
(
    df_order_payments.groupby(["order_id", "payment_sequential"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,order_id,payment_sequential,payment_type
0,00010242fe8c5a6d1ba2dd792cb16214,1,1
69254,ab1a70d5cd14419a6f2c2aca5ee8a79b,1,1
69264,ab236b4e1403ed5ace5c91a4d05b81a8,1,1
69263,ab23407c84b934f452124bc5b5453123,1,1
69262,ab22c4691e6a8528923de7a0b4236b70,1,1
...,...,...,...
34625,54ee4ac3ad09faa2237f8a844b4d7b92,1,1
34624,54edbcd83d8fbad3487e450926ce78f0,1,1
34623,54ed1bff3096b6570d2b11064cd13e46,1,1
34622,54eca33aede78efbaaadc4c0df4ae8b7,1,1


In [10]:
# Number of payment rows per order, orders with the most rows first.
(
    df_order_payments
    .groupby("order_id")["payment_type"]
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,order_id,count
97261,fa65dad1b0e818e3ccc5cb0e39231352,29
79611,ccf804e764ed5650cd8759557269dc13,26
15577,285c2e15bebd4ac83635ccc563dc71f4,22
53168,895ab968e7bb0d5659d16cd74cd1650c,21
99020,fedcd9f7ccdc8cba3a18defedd1a5547,19
...,...,...
33697,56bd45163229b35ca0ab490c1e3d3233,1
33696,56bc98e6d5b88c2cdb905f2fbec2ca3a,1
33695,56bbc7d92e6e74b8782abbf5ee336a92,1
33694,56bafc014f8ed2f34cfe598592c65fd8,1


In [15]:
# All payment rows of one order (the one with the highest row count in the
# per-order count above), listed in payment_sequential order.
(
    df_order_payments
    .query("order_id  == 'fa65dad1b0e818e3ccc5cb0e39231352'")
    .sort_values(by="payment_sequential")
)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
14321,fa65dad1b0e818e3ccc5cb0e39231352,1,voucher,1,3.71
23074,fa65dad1b0e818e3ccc5cb0e39231352,2,voucher,1,8.51
65641,fa65dad1b0e818e3ccc5cb0e39231352,3,voucher,1,2.95
9985,fa65dad1b0e818e3ccc5cb0e39231352,4,voucher,1,29.16
28330,fa65dad1b0e818e3ccc5cb0e39231352,5,voucher,1,0.66
29648,fa65dad1b0e818e3ccc5cb0e39231352,6,voucher,1,5.02
82593,fa65dad1b0e818e3ccc5cb0e39231352,7,voucher,1,0.32
68853,fa65dad1b0e818e3ccc5cb0e39231352,8,voucher,1,26.02
17274,fa65dad1b0e818e3ccc5cb0e39231352,9,voucher,1,1.08
19565,fa65dad1b0e818e3ccc5cb0e39231352,10,voucher,1,12.86


In [None]:
# Highest payment_sequential seen for each order, largest first.
df_order_payments_groupby_order_id_sequential_max = (
    df_order_payments
    .groupby("order_id")["payment_sequential"]
    .max()
    .reset_index()
    .sort_values(by="payment_sequential", ascending=False)
)

# Number of payment rows for each order, largest first.
df_order_payments_groupby_order_id_count = (
    df_order_payments
    .groupby("order_id")["payment_type"]
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

# One row per order holding both figures; the left join keeps the row order
# of the max-sequential frame.
df_order_payments_groupby_order_id_merged = (
    df_order_payments_groupby_order_id_sequential_max.merge(
        df_order_payments_groupby_order_id_count,
        how="left",
        on="order_id",
    )
)

In [39]:
# Orders whose highest payment_sequential differs from their number of
# payment rows; the max column is renamed for display only.
(
    df_order_payments_groupby_order_id_merged
    .query("payment_sequential != count")
    .rename(columns={"payment_sequential": "max_payment_sequential"})
)

Unnamed: 0,order_id,max_payment_sequential,count
300,a4431cbd79dbddaae7988ce6091cbc3c,3,2
314,a079628ac8002126e75f86b0f87332e4,3,2
584,8c13aa9bfa920854ca73694d94387d40,2,1
602,b6424d3ecaedf5f49836d503d4f0d671,2,1
622,7897a2eebb3941ae97159cb88f65cb1c,2,1
...,...,...,...
2920,bf2f7a16704936db1686ba2711614520,2,1
2943,a6d23aa5f1190c09129345b363de8e37,2,1
2970,056c68d093c100017aab1f00f260705c,2,1
2996,c3a4760791fe90fe58b360dbfc812805,2,1


## df_orders

In [43]:
# Quick profile of the orders table: (rows, columns), missing values per
# column, then the first rows.
print(df_orders.shape)
display(
    df_orders.isna().sum(),
    df_orders.head(),
)

(99441, 8)


order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
