In [1]:
import pandas as pd
import os
import random
from datetime import timedelta
import numpy as np


In [2]:
DATA_ROOT = "data"
MAX_DELTA_TO_RANDOMIZE = 60

In [3]:
def get_unique_val_col(df):
    #TODO add docstring
    return {c:df[c].nunique() for c in df.columns}


def add_root(file):
    #TODO add docstring
    return os.path.join(DATA_ROOT,file)


def randomize_date_col(df, date_col, max_delta):
    #TODO add docstring
    df[f"{date_col}_random"] = df[date_col].apply(lambda x: x - timedelta(days=random.randint(1,max_delta)))
    return df


def coalesce_two_cols(df, cols, new_col):
    #TODO add docstring
    col1, col2 = cols
    df[new_col] = np.where(df[col1].isnull(), df[col2], df[col1])
    return df



In [12]:
## Review columns for each file
for f in os.listdir(DATA_ROOT):
    df = pd.read_csv(add_root(f))
    columns_val = get_unique_val_col(df)
    print(f'Overview for file {f}')
    print(f'Shape is {df.shape}')
    print(f'Columns\n{columns_val}')
    print()

Overview for file olist_closed_deals_dataset.csv
Shape is (842, 14)
Columns
{'mql_id': 842, 'seller_id': 842, 'sdr_id': 32, 'sr_id': 22, 'won_date': 824, 'business_segment': 33, 'lead_type': 8, 'lead_behaviour_profile': 9, 'has_company': 2, 'has_gtin': 2, 'average_stock': 6, 'business_type': 3, 'declared_product_catalog_size': 33, 'declared_monthly_revenue': 27}

Overview for file olist_customers_dataset.csv
Shape is (99441, 5)
Columns
{'customer_id': 99441, 'customer_unique_id': 96096, 'customer_zip_code_prefix': 14994, 'customer_city': 4119, 'customer_state': 27}

Overview for file olist_geolocation_dataset.csv
Shape is (1000163, 5)
Columns
{'geolocation_zip_code_prefix': 19015, 'geolocation_lat': 717360, 'geolocation_lng': 717613, 'geolocation_city': 8011, 'geolocation_state': 27}

Overview for file olist_marketing_qualified_leads_dataset.csv
Shape is (8000, 4)
Columns
{'mql_id': 8000, 'first_contact_date': 336, 'landing_page_id': 495, 'origin': 10}

Overview for file olist_orders_d

## Logic for seller creation_date

If available use won date. Otherwise, use a random date between the date of the first order for such merchant in the orders dataset and the max_delta_to_randomize parameter

In [71]:
# get first order date for each merchant
orders = pd.read_csv(add_root("olist_orders_dataset.csv"), 
                    parse_dates=["order_purchase_timestamp"],
                    usecols=["order_id", "order_purchase_timestamp"]
                    )\
                    .assign(purchase_date = lambda row: row["order_purchase_timestamp"].dt.date)\
                    .loc[:,["order_id", "purchase_date"]]
        

orders_sellers = pd.read_csv(add_root("olist_order_items_dataset.csv"), usecols=["order_id", "seller_id"])\
            .drop_duplicates()


first_seller_order = orders_sellers\
            .merge(orders, how="left", on="order_id")\
            .groupby("seller_id", as_index=False)\
            .agg({"purchase_date":"min"})\
            .rename(columns={"purchase_date":"first_order_date"})\
            .pipe(randomize_date_col, "first_order_date", MAX_DELTA_TO_RANDOMIZE)\
            .drop(columns=["first_order_date"])

seller_closed_deals = pd.read_csv(add_root("olist_closed_deals_dataset.csv"), 
                                    parse_dates=["won_date"], 
                                    usecols=["seller_id", "won_date"]
                                    )\
                    .assign(deal_date = lambda row: row["won_date"].dt.date)\
                    .loc[:,["seller_id", "deal_date"]]


sellers_with_signupdate = pd.read_csv(add_root( "olist_sellers_dataset.csv"))\
                        .merge(first_seller_order, how="left", on="seller_id")\
                        .merge(seller_closed_deals, how="left", on="seller_id")\
                        .pipe(coalesce_two_cols, ["deal_date", "first_order_date_random"], "signup_date")\
                        .drop(columns=["first_order_date_random", "deal_date"])


sellers_with_signupdate.to_csv(add_root("olist_sellers_dataset_enriched.csv"), index=False)

### Relationship seller - order_id

Relationship is not 1-1, but there are very few orders with two sellers, a 0.2%. For sake of simplicity, I'll remove those orders to enforce the 1-1 relationship so that the seller can be added to the order fact tables

In [4]:
items = pd.read_csv(add_root("olist_order_items_dataset.csv"))

In [13]:
orders_with_multiple_sellers = (
        items
        .query('order_item_id > 1')
        .groupby("order_id", as_index=False)
        .agg(
            n_seller = ('seller_id', 'nunique')
        )
        .query("n_seller > 1")
    )

print('number of total orders', items.order_id.nunique())
print('number of orders with at least two different sellers', orders_with_multiple_sellers.shape[0])



number of total orders 98666
number of orders with at least two different sellers 253


In [23]:
(
    pd.read_csv(add_root("olist_orders_dataset.csv"))
    .merge(orders_with_multiple_sellers, on='order_id', how='left', indicator=True)
    .query("_merge == 'left_only'")
    .drop(columns=['n_seller', '_merge'])
    .to_csv(add_root("olist_orders_dataset_clean.csv"), index=False)
)

## Move 

## Ingestion to S3