In [1]:
import json
import numpy as np
import pandas as pd
from rdsutils.datasets import DataLoader
from src.utils import get_data_dir

In [2]:
# Load the project configuration; later cells read the base path and the
# feature-file locations from this dict.
# JSON is UTF-8 by specification, so the encoding is given explicitly instead
# of relying on the platform's locale-dependent default.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

The processed dataset currently has 4 chunks (as of May 2021). To change the number of chunks, modify the `chunk_size` field in the `config.json` file. The dataloader groups the data by the id `business_account_number`, processes the groups, and makes sure each group has fewer than `chunk_size` transactions in total.

`chunk_size` is currently set to 10,000,000 on an m5.25xlarge machine.

In [3]:
# Display the feature-file entries recorded in the config
# (one parquet path per `features_N` key).
config["data"]["features"]

{'features_0': 'features/features_1621625938/features_0_1621625938.parquet',
 'features_1': 'features/features_1621625938/features_1_1621625938.parquet',
 'features_2': 'features/features_1621625938/features_2_1621625938.parquet',
 'features_3': 'features/features_1621625938/features_3_1621625938.parquet'}

### Load selected columns
---

In [4]:
# "features" is the final ETL stage for this task. Labeling was also done in
# that stage (for technical-debt reasons), so the target column lives there too.
data_dir = get_data_dir(config, config["base_path"], "features")

# Columns to load -- edit this list to pull a different subset.
cols = [
    "transaction_id",
    "business_account_number",
    "transaction_datetime",
    "is_returned",
    "target_60d",
]

# Only the columns listed above are requested from the loader.
dl = DataLoader(data_dir, columns=cols)
df = dl.get_full()
df.shape

(31774405, 5)

In [5]:
# memory_usage="deep" makes pandas inspect the contents of object columns,
# so the reported footprint reflects real memory use rather than an estimate.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31774405 entries, 0 to 7939833
Data columns (total 5 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   transaction_id           object        
 1   business_account_number  int64         
 2   transaction_datetime     datetime64[ns]
 3   is_returned              object        
 4   target_60d               bool          
dtypes: bool(1), datetime64[ns](1), int64(1), object(2)
memory usage: 3.9 GB


In [6]:
# Peek at the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,transaction_id,business_account_number,transaction_datetime,is_returned,target_60d
0,410000070530-1513784213-1,410000070530,2017-12-20 15:36:53,,False
1,410000070530-1513784715-2,410000070530,2017-12-20 15:45:15,,False
2,410000070530-1514549936-3,410000070530,2017-12-29 12:18:56,False,False
3,410000070530-1514566560-5,410000070530,2017-12-29 16:56:00,,False
4,410000070530-1514566560-4,410000070530,2017-12-29 16:56:00,,False


In [7]:
# Ensure every row has a distinct transaction_id.
# `is_unique` answers this directly without building a per-id count Series,
# and -- unlike value_counts(), which drops NaN by default -- it also reports
# False when more than one id is missing.
df["transaction_id"].is_unique

True

### Load Full Data
---

Due to the size of our data, this is not advised: the full dataframe occupies roughly 100 GB of memory.

In [8]:
# Resolve the directory of the "features" stage from the config.
data_dir = get_data_dir(config, config["base_path"], "features")
# No `columns` argument is passed here, so the loader is not restricted to a subset.
dl = DataLoader(data_dir)
# NOTE: this rebinds `df`; any frame previously held under that name is replaced.
df = dl.get_full()
df.shape

(31774405, 253)

In [9]:
# memory_usage="deep" makes pandas inspect the contents of object columns,
# so the reported footprint reflects real memory use rather than an estimate.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31774405 entries, 0 to 7939833
Columns: 253 entries, transaction_datetime to indeterminate_60d
dtypes: bool(24), datetime64[ns, UTC](1), datetime64[ns](19), float64(148), int64(22), object(39)
memory usage: 102.3 GB
