In [1]:
import json
from collections import Counter
import networkx as nx
from networkx.readwrite import json_graph
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List
import scml

In [2]:
# Wall-clock timer for the run; it is stopped and reported later in the notebook.
tim = scml.Timer()
tim.start()
# Percentile grid handed to Series.describe() when summarising count distributions.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Use fully-qualified option keys throughout. pandas resolves bare names by
# pattern matching, which raises as soon as a second option with a similar
# name exists, so the explicit "mode." / "display." prefixes keep this cell
# working across pandas versions.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Registers tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()

In [3]:
# Event log. Rows are expected to be pre-sorted by session, then timestamp;
# the transition counting below relies on that ordering.
# For a quick dry run, truncate right after loading: df = df.iloc[:1000]
df = pd.read_parquet(path="input/transactions.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216716096 entries, 0 to 216716095
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   aid      int32
 2   ts       int64
 3   type     int8 
dtypes: int32(2), int64(1), int8(1)
memory usage: 3.4 GB


In [4]:
# Single pass over the event log that fills two frequency tables:
#   wc    - how often each "<aid>_<type>" token occurs
#   edges - how often token A is immediately followed by a *different* token B
#           inside the same session (self-transitions are not counted)
# Correctness depends on all rows of a session being contiguous and already in
# event order, because only the directly preceding row is remembered.
wc = Counter()
prev_sess = None
prev = None
edges = Counter()
# Position in this list is the integer code stored in the "type" column.
types: List[str] = ["click", "cart", "order"]
# Iterate only the three columns that are used. zip() over the Series avoids
# building a namedtuple (with the unused index and ts fields) for every row,
# and total= lets tqdm show a percentage and ETA instead of a bare counter.
rows = zip(df["session"], df["aid"], df["type"])
for curr_sess, aid, type_id in tqdm(rows, total=len(df)):
    # A new session starts: never link its first event to the previous session.
    if prev_sess != curr_sess:
        prev = None
    prev_sess = curr_sess
    curr = f"{aid}_{types[int(type_id)]}"
    wc[curr] += 1
    if prev is not None and prev != curr:
        edges[(prev, curr)] += 1
    prev = curr

216716096it [13:22, 269970.54it/s]


In [5]:
# Vocabulary size followed by the ten most frequent tokens.
n_words = len(wc)
top_words = wc.most_common(10)
print(f"{n_words} words\n{top_words}")

3748278 words
[('1460571_click', 121287), ('108125_click', 114456), ('29735_click', 101148), ('485256_click', 97154), ('1733943_click', 91395), ('184976_click', 85122), ('832192_click', 81127), ('1502122_click', 73805), ('554660_click', 72161), ('1603001_click', 68519)]


In [6]:
# Distribution of per-token occurrence counts over the configured percentile grid.
s = pd.Series(data=wc.values())
s.describe(percentiles=percentiles)

count    3.748278e+06
mean     5.781751e+01
std      4.693039e+02
min      1.000000e+00
1%       1.000000e+00
5%       1.000000e+00
10%      1.000000e+00
20%      2.000000e+00
30%      3.000000e+00
40%      6.000000e+00
50%      8.000000e+00
60%      1.100000e+01
70%      1.800000e+01
80%      3.200000e+01
90%      7.900000e+01
95%      1.780000e+02
99%      8.640000e+02
max      1.212870e+05
dtype: float64

In [7]:
%%time
# Persist the vocabulary as a JSON array of [token, count] pairs,
# ordered from most to least frequent.
vocab = wc.most_common()
with open("output/vocab2.json", mode="w") as out:
    json.dump(vocab, out)

Wall time: 18.8 s


In [8]:
# Ten most frequent token-to-token transitions.
top_edges = edges.most_common(10)
print(top_edges)

[(('485256_click', '485256_cart'), 25468), (('485256_cart', '485256_click'), 18067), (('152547_click', '152547_cart'), 14688), (('166037_click', '166037_cart'), 12093), (('29735_click', '832192_click'), 11173), (('33343_click', '33343_cart'), 9770), (('1733943_click', '1733943_cart'), 9651), (('231487_click', '231487_cart'), 9333), (('29735_click', '29735_cart'), 9073), (('832192_click', '29735_click'), 8765)]


In [9]:
# Directed transition graph: one node per token, one edge per observed
# (previous, current) pair, with the pair's count stored as the edge weight.
# most_common() feeds the edges in descending count order, which fixes the
# node/edge insertion order of the graph.
g = nx.DiGraph()
weighted = ((src, dst, count) for (src, dst), count in edges.most_common())
g.add_weighted_edges_from(weighted)
print(g)

DiGraph with 3747723 nodes and 99004731 edges


In [10]:
%%time
# Serialise the transition graph in networkx's adjacency JSON format.
# The path is a plain literal: the original f-prefix had no placeholders.
with open("output/graph.json", "w") as f:
    json.dump(json_graph.adjacency_data(g), f)

Wall time: 25min 18s


In [11]:
# Stop the notebook-wide timer and report the total wall-clock time.
tim.stop()
elapsed = tim.elapsed
print(f"Total time taken {elapsed!s}")

Total time taken 0:47:35.285127
