In [1]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from scml import pandasx as pdx

In [2]:
# Percentile grid passed to DataFrame.describe() later in the notebook.
percentiles = [
    0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99,
]

# pandas options, written with their full keys so the lookup does not rely on
# abbreviated-name matching.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas releases;
# confirm it is still needed for the pandas version this notebook targets.
pandas_options = {
    "mode.use_inf_as_na": True,
    "display.max_info_columns": 9999,
    "display.max_columns": 9999,
    "display.max_rows": 9999,
    "display.max_colwidth": 9999,
}
for option_name, option_value in pandas_options.items():
    pd.set_option(option_name, option_value)

# Enable tqdm's pandas integration.
tqdm.pandas()

In [3]:
# Integer codes stored in the "type" column for each event type string.
txn2id = {
    "clicks": 0,
    "carts": 1,
    "orders": 2,
}
input_files = [Path("input/train.jsonl"), Path("input/test.jsonl")]


def _iter_event_rows(lines):
    """Yield one flat record per event from an iterable of JSONL session lines.

    Each non-blank line is parsed as a JSON object with a "session" id and an
    "events" list; every event contributes a dict with the keys
    "session", "aid", "ts" and "type" (encoded through ``txn2id``).

    Raises:
        ValueError: if an event's "ts" is smaller than the previous event's
            "ts" within the same session.
        KeyError: if an event carries a type that is not in ``txn2id``.
    """
    for line in lines:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of failing inside json.loads.
        if not line.strip():
            continue
        jo = json.loads(line)
        session = int(jo["session"])
        prev = None
        for event in jo["events"]:
            curr = int(event["ts"])
            # Events must arrive in non-decreasing timestamp order per session.
            if prev is not None and curr < prev:
                raise ValueError("event out-of-order")
            yield {
                "session": session,
                "aid": int(event["aid"]),
                "ts": curr,
                "type": txn2id[event["type"]],
            }
            prev = curr


rows = []
for fp in input_files:
    # Read explicitly as UTF-8 rather than the platform default encoding.
    with fp.open(encoding="utf-8") as lines:
        rows.extend(_iter_event_rows(tqdm(lines, desc=str(fp))))

input\train.jsonl: 12899779it [06:34, 32700.81it/s]
input\test.jsonl: 1671803it [00:16, 99062.85it/s] 


In [4]:
# Build one frame from the per-event records collected by the loading cell.
df = pd.DataFrame.from_records(rows)

# Narrower integer storage for the id-like columns; "ts" is left as parsed.
narrow_dtypes = {"session": np.int32, "aid": np.int32, "type": np.int8}
for col, dtype in narrow_dtypes.items():
    # Integer down-casts are not range-checked by astype, so verify the values
    # fit before converting rather than risk silently corrupted ids.
    bounds = np.iinfo(dtype)
    if not df[col].between(bounds.min, bounds.max).all():
        raise ValueError(f"column {col!r} does not fit in {np.dtype(dtype).name}")
    df[col] = df[col].astype(dtype)

# No sort by (session, ts) is applied here: the loading loop already raises if
# timestamps go backwards within a session, and rows keep file order.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223644219 entries, 0 to 223644218
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   aid      int32
 2   ts       int64
 3   type     int8 
dtypes: int32(2), int64(1), int8(1)
memory usage: 3.5 GB


In [5]:
# Summary statistics for every column, using the percentile grid defined in the configuration cell.
df.describe(percentiles=percentiles)

Unnamed: 0,session,aid,ts,type
count,223644200.0,223644200.0,223644200.0,223644200.0
mean,4981796.0,928720.8,1660580000000.0,0.1242825
std,3931718.0,536706.8,733756900.0,0.3937267
min,0.0,0.0,1659305000000.0,0.0
1%,42464.0,17632.0,1659350000000.0,0.0
5%,219172.0,91256.0,1659449000000.0,0.0
10%,494928.0,182206.0,1659558000000.0,0.0
20%,1157152.0,371630.0,1659823000000.0,0.0
30%,1949009.0,558573.0,1660076000000.0,0.0
40%,2934384.0,743977.0,1660336000000.0,0.0


In [6]:
# Frequency table of the encoded event type (0=clicks, 1=carts, 2=orders per txn2id).
# pdx.value_counts is a helper from scml; the saved output shows count and percent columns.
pdx.value_counts(df["type"])

Unnamed: 0,count,percent
0,201013586,0.89881
1,17466202,0.078098
2,5164431,0.023092


In [7]:
%%time
assert df.notna().all(axis=None)
df.to_parquet("output/transactions.parquet", index=False)

Wall time: 17.4 s
