In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict

In [2]:
# Quantiles passed to describe(): both tails in fine steps plus every decile.
percentiles = [
    0.01, 0.05,
    0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90,
    0.95, 0.99,
]

# pandas options, applied in the listed order.
# - use_inf_as_na: +/-inf count as missing in isna()/notna().
#   NOTE(review): pandas 2.1 deprecated this option -- confirm the pinned
#   pandas version still accepts it.
# - the remaining options lift display truncation (columns, rows, cell width).
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(option_name, option_value)

# Registers tqdm's progress_* methods on pandas objects.
tqdm.pandas()

In [3]:
# Raw event log, one row per event (columns: session, aid, ts, type).
# The file is expected to be sorted by session, then timestamp already; the
# per-session grouping done later relies on each session's rows being adjacent.
events_path = "input/transactions.parquet"
df = pd.read_parquet(events_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216716096 entries, 0 to 216716095
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   aid      int32
 2   ts       int64
 3   type     int8 
dtypes: int32(2), int64(1), int8(1)
memory usage: 3.4 GB


In [4]:
# Token -> id mapping. Ids 0-8 are reserved for the special tokens below; each
# distinct aid (as a string) is appended after them in order of first appearance.
vocab: Dict[str, int] = {
    "<s>": 0,
    "<pad>": 1,
    "</s>": 2,
    "<unk>": 3,
    "<mask>": 4,
    "<cls>": 5,
    "<click>": 6,
    "<cart>": 7,
    "<order>": 8,
}

# Event-type code -> action token. Any code other than 1 or 2 falls back to
# "<click>", which is what the previous if/elif chain did as well.
type_tokens = {1: "<cart>", 2: "<order>"}


def _session_row(session, tokens):
    """Pack one finished session into the record layout used for `rows`.

    Args:
        session: session id the tokens belong to.
        tokens: alternating action / aid tokens, in event order.

    Returns:
        dict with the session id, the space-joined token string and the
        number of tokens (two per event).
    """
    return {
        "session": session,
        "seq": " ".join(tokens),
        "length": len(tokens),
    }


_next = len(vocab)  # next free vocabulary id
prev = None         # session id of the previous event (None before the first row)
words = []          # tokens collected for the session currently being read
rows = []           # one record per completed session

# Single pass over the events. This relies on the frame being sorted so that
# all rows of a session are contiguous. `total` gives tqdm a percentage / ETA.
for t in tqdm(df.itertuples(index=False), total=len(df)):
    curr = int(t.session)
    # Session boundary: emit the finished session. `words` cannot be empty
    # here because every processed event appends two tokens.
    if prev is not None and prev != curr:
        rows.append(_session_row(prev, words))
        words = []
    prev = curr

    first = type_tokens.get(int(t.type), "<click>")
    aid = str(t.aid)
    if aid not in vocab:
        vocab[aid] = _next
        _next += 1
    words += [first, aid]

# Emit the last session, which has no following boundary to trigger it.
if words:
    rows.append(_session_row(prev, words))

216716096it [08:42, 414826.93it/s]


In [5]:
# Report the final vocabulary size.
print(f"len(vocab)={len(vocab):,}")

# Token -> id mapping as JSON.
with open("output/vocab.json", "w") as f:
    json.dump(vocab, f)

# Invert the mapping: position i of the list holds the token whose id is i.
words = [""] * len(vocab)
for token, token_id in vocab.items():
    words[token_id] = token

# Plain-text vocabulary, one token per line in id order.
with open("output/vocab.txt", "w") as f:
    f.writelines(f"{token}\n" for token in words)

len(vocab)=1,855,612


In [6]:
# One row per session: session id, space-joined token string, token count.
# NOTE: this rebinds `df`, replacing the raw event table loaded earlier.
df = pd.DataFrame.from_records(rows)

# `length` counts tokens (two per event). An integer downcast does not raise
# on overflow, so verify the values fit before narrowing to int16.
assert df["length"].le(np.iinfo(np.int16).max).all(), "length does not fit in int16"

# Narrow the integer columns in one step; `seq` keeps its object dtype.
df = df.astype({"session": np.int32, "length": np.int16})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12899779 entries, 0 to 12899778
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   session  int32 
 1   seq      object
 2   length   int16 
dtypes: int16(1), int32(1), object(1)
memory usage: 172.2+ MB


In [7]:
# Summary statistics of the numeric columns (session, length) at the
# percentiles configured at the top of the notebook.
df.describe(percentiles=percentiles)

Unnamed: 0,session,length
count,12899780.0,12899780.0
mean,6449889.0,33.59997
std,3723846.0,67.15476
min,0.0,4.0
1%,128997.8,4.0
5%,644988.9,4.0
10%,1289978.0,4.0
20%,2579956.0,6.0
30%,3869933.0,6.0
40%,5159911.0,8.0


In [8]:
# Preview the first sessions. `seq` is printed in full because max_colwidth
# was raised to 9999 in the configuration cell.
df.head()

Unnamed: 0,session,seq,length
0,0,<click> 1517085 <click> 1563459 <click> 1309446 <click> 16246 <click> 1781822 <click> 1152674 <cart> 1649869 <cart> 461689 <order> 305831 <order> 461689 <click> 362233 <click> 1649869 <click> 1649869 <click> 984597 <click> 1649869 <click> 803544 <click> 1110941 <click> 1190046 <click> 1760685 <click> 631008 <click> 461689 <click> 1190046 <click> 1650637 <click> 313546 <click> 1650637 <click> 979517 <click> 351157 <click> 1062149 <click> 1157384 <click> 1841388 <click> 1469630 <click> 305831 <click> 1110548 <click> 1110548 <click> 305831 <click> 1650114 <click> 1604396 <click> 1009750 <click> 1800933 <click> 495779 <click> 394655 <click> 495779 <click> 789245 <cart> 789245 <click> 366890 <click> 361317 <click> 1700164 <click> 1755597 <click> 789245 <click> 784978 <click> 1171505 <click> 784978 <click> 1700164 <click> 784978 <click> 1521766 <click> 1725503 <click> 528847 <click> 1816325 <click> 984597 <click> 1072782 <click> 173702 <click> 1072782 <click> 1407538 <click> 1629651 <click> 1768568 <click> 1318324 <click> 1840418 <click> 1813509 <click> 1813509 <click> 667924 <click> 1226444 <click> 709550 <click> 709417 <click> 1225559 <click> 1048044 <click> 1052813 <click> 1225559 <click> 240346 <click> 1582117 <click> 1707783 <click> 1624436 <click> 1157411 <click> 358305 <click> 1202970 <click> 832192 <click> 1498443 <click> 723931 <click> 1436439 <click> 1693461 <click> 1206554 <click> 1110741 <click> 346352 <click> 1802050 <click> 154930 <click> 964169 <click> 964169 <click> 823637 <click> 964169 <click> 1411683 <click> 964169 <click> 1167722 <click> 964169 <click> 1619737 <click> 964169 <click> 1840615 <click> 512756 <click> 946219 <click> 1090479 <click> 1164387 <click> 1308544 <click> 719622 <click> 1750538 <click> 1443747 <click> 1750538 <click> 337364 <click> 1653945 <click> 1222638 <click> 1622987 <click> 608383 <click> 1460239 <click> 1436439 <click> 321397 <click> 828625 <click> 1624436 <click> 1157411 <click> 1537907 <click> 1070142 <click> 959208 
<click> 275288 <click> 1318324 <click> 1072782 <click> 1072782 <click> 173702 <click> 1428075 <click> 892659 <click> 1127565 <click> 1072782 <click> 97836 <click> 384343 <click> 218130 <click> 294248 <click> 166547 <click> 504365 <click> 102416 <click> 30373 <click> 724999 <click> 1110548 <click> 10268 <click> 219925 <click> 1140855 <click> 915702 <click> 480578 <click> 1145803 <click> 480578 <click> 516456 <click> 536842 <click> 1446430 <click> 581169 <click> 976007 <click> 667322 <click> 1801525 <click> 1767945 <click> 1342014 <click> 1813509 <click> 1785321 <click> 1813509 <click> 1840615 <click> 1689148 <click> 1840615 <click> 72748 <click> 1049280 <click> 1436439 <click> 6851 <click> 1433235 <click> 7563 <click> 1766089 <click> 7563 <click> 570955 <click> 1766089 <click> 570955 <cart> 974651 <click> 974651 <cart> 974651 <cart> 974651 <cart> 280978 <cart> 1521766 <click> 661144 <cart> 1760145 <click> 1639229 <click> 1624436 <click> 738987 <click> 1436439 <click> 102416 <click> 190818 <click> 1157411 <click> 138431 <click> 543308 <click> 1760145 <click> 543308 <cart> 275288 <click> 275288 <click> 1760145 <click> 974651 <cart> 974651 <click> 1760145 <click> 570955 <click> 661144 <click> 362233 <click> 362233 <click> 1760145 <click> 275288 <click> 974651 <click> 543308 <click> 624343 <click> 543308 <click> 723612 <click> 543308 <cart> 543308 <cart> 543308 <click> 442293 <cart> 442293 <click> 442293 <click> 442293 <click> 171073 <click> 851778 <click> 976134 <click> 856506 <cart> 1549618 <click> 419161 <click> 543308 <click> 1549618 <click> 760277 <click> 1549618 <click> 702179 <click> 1549618 <click> 1587393 <click> 76358 <click> 1549618 <click> 1199474 <cart> 1199474 <click> 1386923 <click> 1055124 <click> 859697 <click> 1055124 <click> 1386923 <order> 1199474 <order> 543308 <click> 961113 <click> 883849 <click> 701766 <click> 924751 <click> 168206 <click> 924751 <click> 219033 <click> 171982 <click> 1319939 <click> 1349536 <click> 334392 <click> 1349536 <click> 
1349536 <click> 165096 <click> 315914 <cart> 315914 <click> 1680276 <click> 1818905 <click> 334392 <click> 1048797 <click> 1048797 <click> 543308 <click> 341626 <click> 219925 <click> 843110 <click> 938007 <click> 1228848 <click> 1740927 <click> 161938,552
1,1,<cart> 424964 <click> 1492293 <cart> 1492293 <click> 910862 <cart> 910862 <click> 1491172 <cart> 1491172 <click> 424964 <click> 1515526 <click> 440486 <click> 109488 <click> 1507622 <click> 1734061 <click> 854637 <cart> 854637 <click> 718983 <click> 215311 <cart> 215311 <click> 718983 <click> 711125 <cart> 711125 <click> 50049 <click> 105393 <cart> 105393 <click> 959544 <click> 1734061 <click> 1842593 <click> 1464360 <click> 207905 <click> 1628317 <click> 376932 <click> 497868,64
2,2,<click> 763743 <click> 137492 <click> 504789 <click> 137492 <click> 795863 <click> 378348 <click> 795863 <click> 26638 <click> 817441 <click> 1405904 <click> 545290 <click> 935830 <click> 935830 <click> 1593105 <click> 427698 <click> 414004 <click> 465360 <click> 526287 <click> 567119 <click> 1577398 <cart> 161269 <click> 1577398 <click> 78519 <click> 1605583 <click> 690631 <click> 295985 <click> 553269 <click> 555062 <click> 1605711 <click> 485582 <click> 477910 <click> 808782 <click> 672473,66
3,3,<cart> 1425967 <click> 1425967 <click> 1343406 <click> 1343406 <cart> 1343406 <click> 1425967 <click> 1343406 <click> 1815570 <click> 287008 <click> 1343406 <click> 1425967 <click> 1809571 <click> 1089061 <click> 746072 <click> 1015324 <click> 1215662 <click> 1089061 <cart> 1089061 <click> 1089061 <order> 357461 <order> 1343406 <order> 1425967 <click> 1425967 <click> 1343406 <click> 357461 <click> 357461 <click> 357461 <click> 1343406 <click> 1343406 <click> 1343406 <click> 1105267 <click> 1343406 <click> 392840 <click> 574683 <click> 825711 <click> 1622133 <click> 1065620 <click> 713701 <click> 1343406 <click> 1343406 <click> 1632093 <click> 1090485 <click> 1632093 <click> 1305368 <click> 968885 <click> 2512 <click> 1343406 <click> 357461 <click> 984459 <cart> 984459 <click> 984459 <click> 622368 <click> 578649 <click> 1253857 <click> 1660613 <cart> 1660613 <click> 840023 <click> 774320 <click> 300295 <click> 774320 <cart> 774320 <click> 722275 <click> 979687 <click> 1402624 <click> 308831 <click> 308831 <cart> 308831 <click> 471114 <click> 308831 <click> 127764 <click> 1018433 <cart> 1018433 <click> 1018433 <click> 714724 <cart> 714724 <click> 714724 <click> 1018433 <click> 714724 <click> 1221074 <click> 1171628 <click> 284722 <click> 490305 <click> 1436022 <click> 1198969 <click> 763925 <click> 880139 <click> 727609 <click> 528531 <click> 722275 <click> 1584672 <click> 526790 <click> 1554339 <cart> 1554339 <click> 711446 <click> 128326 <cart> 128326 <click> 128326 <click> 536803 <click> 1348328 <click> 74985 <click> 642154 <click> 705264 <click> 87629 <click> 1808351 <cart> 1808351 <click> 1808351 <click> 749898 <click> 1772909 <click> 905009 <click> 284722 <click> 54857 <cart> 54857 <click> 1528155 <click> 1712999 <click> 692635 <click> 1500975 <click> 771063 <click> 547781 <click> 692635 <click> 54857 <click> 1002270 <click> 254870 <click> 1504981 <cart> 1504981 <click> 254870 <click> 1504981 <click> 1377572 <click> 1009427 <click> 1712999 <cart> 1712999 
<click> 54857 <click> 1712999 <click> 1712999 <click> 54857 <click> 1712999 <click> 54857 <click> 1712999 <click> 54857 <click> 1018433 <click> 54857 <click> 54857 <click> 1018433 <click> 1143262 <click> 836094 <click> 1498272 <click> 3999 <click> 1359109 <click> 1119655 <click> 708908 <click> 588531 <click> 116139 <cart> 116139 <click> 96976 <click> 116139 <click> 1498013 <click> 332817 <cart> 332817 <click> 812246 <cart> 812246 <click> 822461 <click> 636903 <click> 172698 <click> 1533194 <click> 1802976 <click> 360657 <click> 1111431 <click> 172051 <click> 340594 <click> 123362 <click> 1633901 <click> 88711 <click> 1325760 <click> 1202088 <click> 672286 <click> 614085 <click> 1713400 <click> 1748862 <click> 108125 <click> 1332980 <click> 93934 <click> 532565 <click> 1282243 <click> 1708257 <click> 37450 <click> 33731 <click> 546144 <click> 1175830 <click> 1152906 <click> 1754057 <click> 1443710 <click> 1001000 <click> 1124086 <click> 1278281 <click> 1536010 <click> 904809 <click> 659496 <click> 1799121 <click> 693368 <order> 1018433 <order> 54857 <click> 618373 <click> 618373 <click> 618373 <click> 1018433 <click> 54857 <click> 54857 <click> 282175 <click> 1061225 <click> 802541 <click> 802541 <click> 1397226 <click> 1397226 <click> 1638009 <cart> 1638009 <click> 656980 <click> 466658 <click> 1251514 <click> 1261998 <cart> 1261998 <click> 1575102 <click> 1575102 <click> 1261998 <click> 1431783 <click> 925352 <click> 925352 <cart> 925352,452
4,4,<click> 613619 <click> 298827 <order> 298827 <click> 383828 <click> 255379 <click> 1838173 <click> 1453726 <click> 1838173 <click> 255379 <click> 383828 <click> 1554752 <cart> 1554752 <click> 917213 <cart> 917213 <click> 758750 <cart> 758750 <click> 678521 <click> 1081407 <click> 479396,38


In [9]:
%%time
assert df.notna().all(axis=None)
df.to_parquet("output/sequences.parquet", index=False)

Wall time: 12.7 s
