In [1]:
import pandas as pd
import numpy as np
from common.data_model import DataModel

# rankings to create

- ~~by order count, no matter the status~~
- ~~by order delivered~~
- ~~by seller delivered~~
- ~~days from last order~~
- ~~months from last order~~


## Order rankings

In [23]:
def row_number(data, partition_cols, sort_col):
    """
    SQL-like row_number function
    """
    return (
        data
        .sort_values(by= sort_col, ascending=True)
        .groupby(partition_cols)
        .cumcount() + 1
    )

def add_ranking_columns(df_):
    customer_col = "customer_unique_id"
    sort_col = "order_purchase_timestamp"

    ranking_dict = {
        "order_ranking": ([customer_col], [sort_col]), 
        "order_ranking_delivered": ([customer_col, "order_status"], [sort_col]),
        "order_at_seller_ranking_delivered": ([customer_col, "order_status", "seller_id"], [sort_col])
    }
    for col, ranking in ranking_dict.items():
        df_.loc[:, col] = row_number(df_, *ranking)
    return df_

def clean_ranking_at_delivered_columns(df_):
    """
    Set to null if order_status != delivered
    """
    mask = df_.order_status == "delivered"
    mask_columns =  df_.columns.str.contains("ranking_delivered")

    clean_ranking = (
        df_
        .loc[:, mask_columns]
        .where(mask)
        .astype("Int64")
    )

    data = (
        pd.concat(
            [df_.loc[:, ~mask_columns]
            , clean_ranking
            ]
            , axis=1
        )
    )

    return data

## Days/Month from/until order

In [13]:
def lead_window(df_: pd.DataFrame, partition_cols, sort_col, value_col, shifts = 1):
    """
    SQL-like lead function
    """
    return (
        df_
        .sort_values(by= sort_col, ascending=True)
        .groupby(partition_cols)[value_col]
        .shift(shifts)
    )

def add_last_order_timestamp(df_):
    """
    Add last_order_timestamp column to the dataframe
    """
    df_.loc[:, "last_order_timestamp"] = lead_window(df_, ["customer_unique_id"], ["order_purchase_timestamp"], "order_purchase_timestamp", 1)
    return df_

def days_diff(start,end):
    """
    Return the timedelta between two datetime columns in days
    """
    return (end - start).dt.days

def month_diff(start,end):
    """
    Return the timedelta between two datetime columns in months
    """
    start = start.dt.to_period("M").view("float64")
    end = end.dt.to_period("M").view("float64")
    return (end - start).round(0).astype("Int64")

def add_time_diff_columns(df_):
    """
    Add time differences columns to the dataframe
    """
    return (    df_.assign(
                    days_from_last_order = days_diff(df_.last_order_timestamp, df_.order_purchase_timestamp),
                    month_from_last_order = month_diff(df_.last_order_timestamp, df_.order_purchase_timestamp)
        )
        )

def clean_month_diff(df_):
    """
    Set to NA columns that does not have a previous order to calculate the time difference
    """
    df_.loc[:, "month_from_last_order"] = np.where(df_.last_order_timestamp.isnull(), pd.NA, df_.month_from_last_order)
    return df_




# Test pipeline

In [25]:
df = DataModel.read_parquet_to_dataframe("transient", "orders_lean.parquet")
df.head()

Unnamed: 0,order_id,seller_id,customer_unique_id,order_status,order_created_date,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,total_freight,n_items,customer_zip_code_prefix,customer_city,customer_state,seller_zip_code_prefix,seller_city,seller_state,residual,is_payment_madeup
0,e481f51cbdc54678b7cc49136f2d6af7,3504c0cb71d7fa48d967e0e4c94d59d9,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,...,8.72,1,3149,sao paulo,SP,9350,maua,SP,0.0,False
1,53cdb2fc8bc7dce0b6741e2150273451,289cdb325fb7e7f891c38608bf9e0962,af07308b275d755c9edb36a90c618231,delivered,2018-07-24,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,...,22.76,1,47813,barreiras,BA,31570,belo horizonte,SP,0.0,False
2,47770eb9100c2d0c44946d9cf07ec65d,4869f7a5dfa277a7dca6462dcf3b52b2,3a653a41f6f9fc3d2a113cf8398680e8,delivered,2018-08-08,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,...,19.22,1,75265,vianopolis,GO,14840,guariba,SP,0.0,False
3,949d5b44dbf5de918fe9c16f97b45f8a,66922902710d126a0e7d26b0e3805106,7c142cf63193a1473d2e66489a9ae977,delivered,2017-11-18,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,...,27.2,1,59296,sao goncalo do amarante,RN,31842,belo horizonte,MG,0.0,False
4,ad21c59c0840e6cb83a9ceb5573f8159,2c9e548be18521d1c43cde1c582c6de8,72632f0f9dd73dfee390c9b22eb56dd6,delivered,2018-02-13,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,...,8.72,1,9195,santo andre,SP,8752,mogi das cruzes,SP,0.0,False


In [26]:
(
    df
    .pipe(add_ranking_columns)
    .pipe(clean_ranking_at_delivered_columns )
    .pipe(add_last_order_timestamp)
    .pipe(add_time_diff_columns)
    .pipe(clean_month_diff)
)

Unnamed: 0,order_id,seller_id,customer_unique_id,order_status,order_created_date,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,seller_city,seller_state,residual,is_payment_madeup,order_ranking,order_ranking_delivered,order_at_seller_ranking_delivered,last_order_timestamp,days_from_last_order,month_from_last_order
0,e481f51cbdc54678b7cc49136f2d6af7,3504c0cb71d7fa48d967e0e4c94d59d9,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,...,maua,SP,0.0,False,2,2,1,2017-09-04 11:26:38,27.0,0
1,53cdb2fc8bc7dce0b6741e2150273451,289cdb325fb7e7f891c38608bf9e0962,af07308b275d755c9edb36a90c618231,delivered,2018-07-24,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,...,belo horizonte,SP,0.0,False,1,1,1,NaT,,
2,47770eb9100c2d0c44946d9cf07ec65d,4869f7a5dfa277a7dca6462dcf3b52b2,3a653a41f6f9fc3d2a113cf8398680e8,delivered,2018-08-08,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,...,guariba,SP,0.0,False,1,1,1,NaT,,
3,949d5b44dbf5de918fe9c16f97b45f8a,66922902710d126a0e7d26b0e3805106,7c142cf63193a1473d2e66489a9ae977,delivered,2017-11-18,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,...,belo horizonte,MG,0.0,False,1,1,1,NaT,,
4,ad21c59c0840e6cb83a9ceb5573f8159,2c9e548be18521d1c43cde1c582c6de8,72632f0f9dd73dfee390c9b22eb56dd6,delivered,2018-02-13,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,...,mogi das cruzes,SP,0.0,False,1,1,1,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98158,9c5dedf39a927c1b2549525ed64a053c,e24fc9fcd865784fb25705606fe3dfe7,6359f309b166b0196dbf7ad2ac62bb5a,delivered,2017-03-09,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28,...,braganca paulista,SP,0.0,False,1,1,1,NaT,,
98159,63943bddc261676b46f01ca7ac2f7bd8,1f9ab4708f3056ede07124aad39a2554,da62f9e57a76d978d02ab5362c509660,delivered,2018-02-06,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02,...,tupa,SP,0.0,False,1,1,1,NaT,,
98160,83c1379a015df1e13d02aae0204711ab,d50d79cb34e38265a8649c383dcffd48,737520a9aad80b3fbbdad19b66b37b30,delivered,2017-08-27,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27,...,sao paulo,SP,0.0,False,1,1,1,NaT,,
98161,11c177c8e97725db2631073c19f07b62,a1043bafd471dff536d0c462352beb48,5097a5312c8b157bb7be58ae360ef43c,delivered,2018-01-08,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15,...,ilicinea,MG,0.0,False,1,1,1,NaT,,


In [27]:
df.head()

Unnamed: 0,order_id,seller_id,customer_unique_id,order_status,order_created_date,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,customer_city,customer_state,seller_zip_code_prefix,seller_city,seller_state,residual,is_payment_madeup,order_ranking,order_ranking_delivered,order_at_seller_ranking_delivered
0,e481f51cbdc54678b7cc49136f2d6af7,3504c0cb71d7fa48d967e0e4c94d59d9,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,...,sao paulo,SP,9350,maua,SP,0.0,False,2,2,1
1,53cdb2fc8bc7dce0b6741e2150273451,289cdb325fb7e7f891c38608bf9e0962,af07308b275d755c9edb36a90c618231,delivered,2018-07-24,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,...,barreiras,BA,31570,belo horizonte,SP,0.0,False,1,1,1
2,47770eb9100c2d0c44946d9cf07ec65d,4869f7a5dfa277a7dca6462dcf3b52b2,3a653a41f6f9fc3d2a113cf8398680e8,delivered,2018-08-08,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,...,vianopolis,GO,14840,guariba,SP,0.0,False,1,1,1
3,949d5b44dbf5de918fe9c16f97b45f8a,66922902710d126a0e7d26b0e3805106,7c142cf63193a1473d2e66489a9ae977,delivered,2017-11-18,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,...,sao goncalo do amarante,RN,31842,belo horizonte,MG,0.0,False,1,1,1
4,ad21c59c0840e6cb83a9ceb5573f8159,2c9e548be18521d1c43cde1c582c6de8,72632f0f9dd73dfee390c9b22eb56dd6,delivered,2018-02-13,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,...,santo andre,SP,8752,mogi das cruzes,SP,0.0,False,1,1,1
