In [1]:
import json
from collections import Counter
import networkx as nx
from networkx.readwrite import json_graph
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List
import scml

In [2]:
# Start timing the whole notebook; the elapsed time is printed by the last cell.
# NOTE(review): scml.Timer is a project helper -- presumably a wall-clock stopwatch.
tim = scml.Timer()
tim.start()

# Percentiles requested from describe() when summarising the word counts.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Option keys are written out in full. pandas resolves abbreviated keys by
# pattern matching, so a short key can stop resolving (or become ambiguous)
# once a release adds another option with a similar name.
# NOTE: "mode.use_inf_as_na" is deprecated since pandas 2.1; if this notebook is
# moved to a newer pandas, convert inf to NaN explicitly and drop this line.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

In [3]:
# The parquet file is expected to be pre-sorted by (session, ts). The counting
# loop in the next code cell resets its "previous item" whenever the session id
# changes, so rows of one session must be contiguous and in time order.
# For a quick smoke test, truncate right after loading: df = df.iloc[:1000]
df = pd.read_parquet(path="input/transactions.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223644219 entries, 0 to 223644218
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   aid      int32
 2   ts       int64
 3   type     int8 
dtypes: int32(2), int64(1), int8(1)
memory usage: 3.5 GB


In [4]:
# Index in this list == value of the integer `type` column.
types: List[str] = ["click", "cart", "order"]


def count_words_and_edges(rows, type_names):
    """Count article occurrences and consecutive in-session transitions.

    Parameters
    ----------
    rows : iterable of (session, aid, type_code)
        Must be grouped by session and ordered chronologically within each
        session; a change of session id is what ends a chain of transitions.
    type_names : sequence of str
        Maps the integer ``type_code`` to its label.

    Returns
    -------
    (Counter, Counter)
        ``word_counts`` maps ``str(aid)`` to its number of rows (all types
        pooled). ``edge_counts`` maps ``(prev_node, curr_node)`` to the number
        of times ``curr_node`` directly followed ``prev_node`` inside one
        session, where a node is ``"<aid>_<type label>"``. Immediate repeats of
        the same node are not counted as edges.
    """
    word_counts = Counter()
    edge_counts = Counter()
    prev_sess = None
    prev = None
    for sess, aid, type_code in rows:
        if sess != prev_sess:
            # New session: never link its first event to the previous session.
            prev = None
            prev_sess = sess
        word = str(aid)
        curr = f"{word}_{type_names[int(type_code)]}"
        word_counts[word] += 1
        if prev is not None and prev != curr:
            edge_counts[(prev, curr)] += 1
        prev = curr
    return word_counts, edge_counts


# Iterate only the three columns that are used (ts is not needed here) and skip
# the per-row namedtuple that itertuples() would build. `total` gives tqdm a
# percentage and ETA instead of a bare iteration counter.
wc, edges = count_words_and_edges(
    tqdm(zip(df["session"], df["aid"], df["type"]), total=len(df)),
    types,
)

223644219it [13:45, 270940.08it/s]


In [5]:
# Vocabulary size, then the ten most frequent article ids with their counts.
vocab_summary = f"{len(wc)} words\n{wc.most_common(10)}"
print(vocab_summary)

1855603 words
[('1460571', 137874), ('485256', 135892), ('108125', 124885), ('29735', 116215), ('1733943', 106512), ('832192', 94766), ('184976', 92890), ('166037', 86333), ('554660', 83865), ('986164', 81557)]


In [6]:
# Distribution of per-article occurrence counts over the whole vocabulary.
counts_per_word = wc.values()
s = pd.Series(data=counts_per_word)
s.describe(percentiles=percentiles)

count    1.855603e+06
mean     1.205237e+02
std      7.512637e+02
min      3.000000e+00
1%       5.000000e+00
5%       6.000000e+00
10%      6.000000e+00
20%      9.000000e+00
30%      1.100000e+01
40%      1.500000e+01
50%      2.000000e+01
60%      2.900000e+01
70%      4.500000e+01
80%      7.800000e+01
90%      1.890000e+02
95%      4.100000e+02
99%      1.732000e+03
max      1.378740e+05
dtype: float64

In [7]:
%%time
with open("output/vocab3.json", "w") as f:
    json.dump(wc.most_common(), f)

Wall time: 8.52 s


In [8]:
# Ten most frequent (previous node, current node) transitions with their counts.
top_edges = edges.most_common(10)
print(top_edges)

[(('485256_click', '485256_cart'), 27336), (('485256_cart', '485256_click'), 19103), (('152547_click', '152547_cart'), 15021), (('166037_click', '166037_cart'), 12301), (('29735_click', '832192_click'), 11408), (('33343_click', '33343_cart'), 10174), (('1733943_click', '1733943_cart'), 9762), (('231487_click', '231487_cart'), 9413), (('29735_click', '29735_cart'), 9300), (('832192_click', '29735_click'), 9013)]


In [9]:
# Directed graph of transitions: one edge per (previous node, current node)
# pair, with the transition count stored in the "weight" edge attribute.
# Edges are inserted from most to least frequent.
g = nx.DiGraph()
g.add_weighted_edges_from(
    ebunch_to_add=(
        (src, dst, count) for (src, dst), count in edges.most_common()
    ),
    weight="weight",
)
print(g)

DiGraph with 3764159 nodes and 100953434 edges


In [10]:
%%time
with open(f"output/graph.json", "w") as f:
    json.dump(json_graph.adjacency_data(g), f)

Wall time: 27min 19s


In [11]:
# Stop the timer started in the configuration cell and report the total runtime.
tim.stop()
elapsed_message = f"Total time taken {str(tim.elapsed)}"
print(elapsed_message)

Total time taken 0:48:47.894346
