In [2]:
import pandas as pd
import os
import random
from datetime import timedelta
import numpy as np


In [6]:
DATA_ROOT = "..\\data\\raw"
DATA_TRANSIENT = "..\\data\\transient"

In [10]:
def get_unique_val_col(df):
    """
    Return a dictionary with the names of the columns and the unique values in it
    """
    return {c:df[c].nunique() for c in df.columns}


def add_root(file, root):
    """
    Apend root to path
    """
    return os.path.join(root,file)


# Columns Overview

In [8]:
## Review columns for each file
for f in os.listdir(DATA_ROOT):
    df = pd.read_csv(add_root(f, DATA_ROOT))
    columns_val = get_unique_val_col(df)
    print(f'Overview for file {f}')
    print(f'Shape is {df.shape}')
    print(f'Columns\n{columns_val}')
    print()

Overview for file olist_closed_deals_dataset.csv
Shape is (842, 14)
Columns
{'mql_id': 842, 'seller_id': 842, 'sdr_id': 32, 'sr_id': 22, 'won_date': 824, 'business_segment': 33, 'lead_type': 8, 'lead_behaviour_profile': 9, 'has_company': 2, 'has_gtin': 2, 'average_stock': 6, 'business_type': 3, 'declared_product_catalog_size': 33, 'declared_monthly_revenue': 27}

Overview for file olist_customers_dataset.csv
Shape is (99441, 5)
Columns
{'customer_id': 99441, 'customer_unique_id': 96096, 'customer_zip_code_prefix': 14994, 'customer_city': 4119, 'customer_state': 27}

Overview for file olist_geolocation_dataset.csv
Shape is (1000163, 5)
Columns
{'geolocation_zip_code_prefix': 19015, 'geolocation_lat': 717360, 'geolocation_lng': 717613, 'geolocation_city': 8011, 'geolocation_state': 27}

Overview for file olist_orders_dataset.csv
Shape is (99441, 8)
Columns
{'order_id': 99441, 'customer_id': 99441, 'order_status': 8, 'order_purchase_timestamp': 98875, 'order_approved_at': 90733, 'order_de

# Relationship seller - order_id

Relationship is not 1-1 strictly, but there are very few orders with two sellers, a 0.2%. For sake of simplicity, I'll remove those orders to enforce the 1-1 relationship so that the seller can be added to the order fact tables

In [14]:
items = pd.read_csv(add_root("olist_order_items_dataset.csv", DATA_ROOT))

In [26]:
orders_with_multiple_sellers = (
        items
        .query('order_item_id > 1')
        .groupby("order_id", as_index=False)
        .agg(
            n_seller = ('seller_id', 'nunique')
        )
        .query("n_seller > 1")
    )

print('number of total orders', items.order_id.nunique())
print('number of orders with at least two different sellers', orders_with_multiple_sellers.shape[0])



number of total orders 98666
number of orders with at least two different sellers 253


In [28]:
list_of_files_to_modify = ["olist_orders_dataset.csv", "olist_order_payments_dataset.csv", "olist_order_items_dataset.csv", "olist_order_reviews_dataset.csv"]


for file in list_of_files_to_modify:
    print(f"Modifying {file}")
    # filter out orders with multiple vendors
    (
        pd.read_csv(add_root(file, DATA_ROOT))
        .merge(orders_with_multiple_sellers, on='order_id', how='left', indicator=True)
        .query("_merge == 'left_only'")
        .drop(columns=['n_seller', '_merge'])
        .to_csv(add_root(file, DATA_TRANSIENT), index=False)
    )
    print("Complete\n")

Modifying olist_orders_dataset.csv
Complete

Modifying olist_order_payments_dataset.csv
Complete

Modifying olist_order_items_dataset.csv
Complete

Modifying olist_order_reviews_dataset.csv
Complete



# Payments option exploration

Goal is to review the number of different payment types: 
- What's the % of orders with more than payment type
- Do we need to store them all or just the one most used?
- Is there any other payment type that has more than 1 payment installments than credit card?

In [15]:
payment = pd.read_csv(add_root("olist_order_payments_dataset.csv", DATA_TRANSIENT))
payment.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


### Number of orders with multiple payment types


96.9% of orders only have one payment, this is the most common scenario. It's interesting to see that there are orders who have more than 5 payments even though the max number of payment types are 5, this means that we can make several payments of the same type in an order. 

Regarding the payments types, the names are mostly self-explanatory (boleto = cash). Voucher payment type will be considered as discount that the marketplace or seller offers to this users. The not_defined type seems it's strange given that the payment_value is 0 but a query into the main order table tells us that it's a cancelled one. We will keep the column into the curated table. 

In conclusion, for the curated table we will use the payment_type columns in combination of columns that indicates the number of payments for each type


In [30]:
# Number of orders by number of payments
(
    payment
    .groupby('order_id', as_index=False)
    .agg(
        n_payment = ("payment_sequential", "max")
    )
    .groupby("n_payment", as_index=False)
    .agg(
        n_order = ("order_id", "count")
    )
    .assign(
        perc_order = lambda df: df.n_order.cumsum().div(df.n_order.sum()) 
    )
)

Unnamed: 0,n_payment,n_order,perc_order
0,1,96157,0.969452
1,2,2452,0.994173
2,3,302,0.997217
3,4,107,0.998296
4,5,52,0.99882
5,6,36,0.999183
6,7,28,0.999466
7,8,11,0.999577
8,9,9,0.999667
9,10,5,0.999718


In [31]:
# Number of payments
(
    payment
    .payment_type
    .value_counts(normalize=True)
)

credit_card    0.739351
boleto         0.190448
voucher        0.055442
debit_card     0.014729
not_defined    0.000029
Name: payment_type, dtype: float64

In [38]:
# Orders with not_defined payment type
mask = payment.query("payment_type == 'not_defined'").order_id.iloc[0]

print("Orders with not_defined payment type")

display (
    (
    payment
    .query("order_id == @mask")
    .sort_values(by=["payment_sequential"])
)

)

print("Orders with not_defined payment type in the olist_orders_dataset.csv")

display (

    (
    pd.read_csv(add_root("olist_orders_dataset.csv", DATA_ROOT))
    .query("order_id == @mask")
)

)

Orders with not_defined payment type


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
51153,4637ca194b6387e2d538dc89b124b0ee,1,not_defined,1,0.0


Orders with not_defined payment type in the olist_orders_dataset.csv


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
39919,4637ca194b6387e2d538dc89b124b0ee,a73c1f73f5772cf801434bf984b0b1a7,canceled,2018-09-03 14:14:25,,,,2018-09-10 00:00:00


Number of installments in exclusively of credit card payments. The cell outputs the % of orders with more than installment

In [40]:
# Payment type with more than one instalmments
(
    payment
    .query("payment_installments > 1")
    .payment_type
    .value_counts(normalize=True)
)

credit_card    1.0
Name: payment_type, dtype: float64

# Match between Payment total value and Items value + Freight value

I'd like to understand the relationship between the total payment value and the total basket value (items + delivery) for an order. Ideally, I hope them to be equall, even if the payment type is voucher, which will count as discounts not paid by users per se but the marketplace or sellers

In addition, I'd to understand the match in terms of order_id. Should I expect that both tables contains the same orders?

In [30]:


item_gmv = (
    pd.read_csv(add_root("olist_order_items_dataset.csv", DATA_TRANSIENT))
    .groupby("order_id", as_index=False)
    .agg(
        price_sum = ("price", "sum"),
        freight_sum = ("freight_value", "sum")
    )
    .assign(
        gmv = lambda df: df.price_sum + df.freight_sum
    )
)


payment_total = (
    pd.read_csv(add_root("olist_order_payments_dataset.csv", DATA_TRANSIENT))
    .groupby("order_id", as_index=False)
    .agg(
        payment_sum = ("payment_value", "sum")
    )
)

payment_with_gmv = (
        payment_total
        .merge(item_gmv, on="order_id", how="outer", indicator=True)
)


In [31]:

(
    payment_with_gmv
    .query("_merge != 'both'")
    ._merge
    .value_counts()
)

left_only     775
right_only      1
both            0
Name: _merge, dtype: int64

# Seller creation date

Seller table does not have the date of when a seller signed up to the marketplace. The Olist closed deals does contains info about it but not for all merchants. For the sake of the project, I would like to produce a datamart which will store how sellers performs from the moment they signed up; it's essential to have the signup date. 

To solve this challenge, If the signup date is not available in the closed_deals, I'll use a random date between the date of the first order for a given seller in the orders dataset and the max_delta_to_randomize parameter 

In [11]:
def randomize_date_col(df, date_col, max_delta):
    """
    Randomize a date column by a random number defined by delta
    """
    df[f"{date_col}_random"] = df[date_col].apply(lambda x: x - timedelta(days=random.randint(1,max_delta)))
    return df
    

def coalesce_two_cols(df, cols, new_col=None):
    """
    Perform a SQL-like coalese function for two columns.
    """
    if not new_col:
        new_col = f'{cols[0]}_coalesce'

    col1, col2 = cols
    df[new_col] = np.where(df[col1].isnull(), df[col2], df[col1])
    return df

MAX_DELTA_TO_RANDOMIZE = 60

In [12]:
# get first order date for each merchant
orders = pd.read_csv(add_root("olist_orders_dataset.csv", DATA_TRANSIENT), 
                    parse_dates=["order_purchase_timestamp"],
                    usecols=["order_id", "order_purchase_timestamp"]
                    )\
                    .assign(purchase_date = lambda row: row["order_purchase_timestamp"].dt.date)\
                    .loc[:,["order_id", "purchase_date"]]
        

orders_sellers = pd.read_csv(add_root("olist_order_items_dataset.csv", DATA_TRANSIENT), usecols=["order_id", "seller_id"])\
            .drop_duplicates()


first_seller_order = orders_sellers\
            .merge(orders, how="left", on="order_id")\
            .groupby("seller_id", as_index=False)\
            .agg({"purchase_date":"min"})\
            .rename(columns={"purchase_date":"first_order_date"})\
            .pipe(randomize_date_col, "first_order_date", MAX_DELTA_TO_RANDOMIZE)\
            .drop(columns=["first_order_date"])

seller_closed_deals = pd.read_csv(add_root("olist_closed_deals_dataset.csv", DATA_ROOT), 
                                    parse_dates=["won_date"], 
                                    usecols=["seller_id", "won_date"]
                                    )\
                    .assign(deal_date = lambda row: row["won_date"].dt.date)\
                    .loc[:,["seller_id", "deal_date"]]


sellers_with_signupdate = pd.read_csv(add_root( "olist_sellers_dataset.csv", DATA_ROOT))\
                        .merge(first_seller_order, how="left", on="seller_id")\
                        .merge(seller_closed_deals, how="left", on="seller_id")\
                        .pipe(coalesce_two_cols, ["deal_date", "first_order_date_random"], "signup_date")\
                        .drop(columns=["first_order_date_random", "deal_date"])


sellers_with_signupdate.to_csv(add_root("olist_sellers_dataset.csv", DATA_TRANSIENT), index=False)