In [1]:
!pip install ../input/python-wheels/faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/python-wheels/faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2
[0m

In [2]:
import faiss
import random
from pathlib import Path
import os
import sys
import gc
import shutil
import json
import math
import numpy as np
import pandas as pd
import sklearn
import torch
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any

In [3]:
class Conf(NamedTuple):
    """Immutable bundle of the paths and search settings used by this notebook.

    Derived paths are computed once, while the class body is evaluated, from
    the defaults of the fields above them; passing a different ``input_dir``
    to the constructor therefore does not re-derive the dependent paths.
    """

    debug: bool = False

    # Directory layout.
    input_dir: Path = Path("/kaggle/input")
    comp_dir: Path = input_dir / "otto-recommender-system"
    temp_dir: Path = Path("/kaggle/temp")
    working_dir: Path = Path("/kaggle/working")

    # Pre-built resources: vocabulary, embedding matrix and faiss index.
    resource_dir: Path = input_dir / "lib-otto-2022/otto2022-1.0"
    vocab_file: Path = resource_dir / "data/vocab3.json"
    em_file: Path = resource_dir / "data/m8_w7_i20.npy"
    index_file: Path = resource_dir / "data/m8_w7_i20.index"

    # Transaction-type names by integer id, and the inverse lookup.
    id2txn: List[str] = ["clicks", "carts", "orders"]
    txn2id: Dict = dict(zip(id2txn, range(len(id2txn))))

    # Search settings: neighbours requested per query, the value assigned to
    # the index's `nprobe`, and the scale applied to an event's recency inside
    # the exponential distance penalty.
    search_k: int = 30
    search_n: int = 150
    step_factor: float = 1 / 136


conf = Conf()
print(conf)

Conf(debug=False, input_dir=PosixPath('/kaggle/input'), comp_dir=PosixPath('/kaggle/input/otto-recommender-system'), temp_dir=PosixPath('/kaggle/temp'), working_dir=PosixPath('/kaggle/working'), resource_dir=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0'), vocab_file=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0/data/vocab3.json'), em_file=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0/data/m8_w7_i20.npy'), index_file=PosixPath('/kaggle/input/lib-otto-2022/otto2022-1.0/data/m8_w7_i20.index'), id2txn=['clicks', 'carts', 'orders'], txn2id={'clicks': 0, 'carts': 1, 'orders': 2}, search_k=30, search_n=150, step_factor=0.007352941176470588)


In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Print the name and current memory usage (bytes / 1024**3) of each GPU.
    for gpu in range(torch.cuda.device_count()):
        print(f"device={gpu}, {torch.cuda.get_device_name(gpu)}")
        allocated = torch.cuda.memory_allocated(gpu) / 1024**3
        print('Mem Allocated:', round(allocated, 1), 'GB')
        reserved = torch.cuda.memory_reserved(gpu) / 1024**3
        print('Mem Cached:   ', round(reserved, 1), 'GB')
else:
    device = torch.device("cpu")
    print("cpu")

cpu


In [5]:
# Notebook-wide setup: pandas display options, import paths for the vendored
# packages, and seeding.

# Percentile grid; not referenced by any other cell visible in this notebook.
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# NOTE(review): no tokenizer is used in the visible cells — presumably carried
# over from a shared template; confirm before removing.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Make pandas treat +/-inf as missing values.
# NOTE(review): this option is deprecated in newer pandas releases — confirm
# against the pandas version pinned for this kernel.
pd.set_option("use_inf_as_na", True)
# Raise pandas' display limits so frames and long strings are not truncated.
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
# Register tqdm's progress_* helpers on pandas objects.
tqdm.pandas()
#sys.path.append(str(conf.input_dir / "networkx/networkx-networkx-2.6.3"))
# The project packages are shipped as Kaggle datasets, so their source
# directories must be on sys.path BEFORE the imports below run.
sys.path.append(str(conf.input_dir / "sgcharts-ml/src"))
sys.path.append(str(conf.resource_dir / "src"))
#import networkx as nx
import scml
from scml import nlp as snlp
from scml import pandasx as pdx
#from scml.nlp import clustering as snc 
import mylib
# NOTE(review): presumably seeds the random number generators with scml's
# default seed for reproducibility — confirm in the scml source.
scml.seed_everything()

In [6]:
# Read the vocabulary: a JSON sequence of two-element entries whose first
# element is the article id (the second element is ignored here).
with open(str(conf.vocab_file)) as f:
    vocab = json.load(f)

# Vocabulary position -> article id, and the inverse article id -> position.
id2article: List[int] = [int(entry_aid) for entry_aid, _ in vocab]
article2id: Dict[int, int] = {
    article: position for position, article in enumerate(id2article)
}
print(f"len(id2article)={len(id2article):,}\n{id2article[:10]}")

len(id2article)=1,855,603
[1460571, 485256, 108125, 29735, 1733943, 832192, 184976, 166037, 554660, 986164]


In [7]:
# Load the pre-computed embedding matrix. Its rows are addressed further down
# as `article2id[aid] * len(conf.txn2id) + tid`, i.e. one row per
# (article, transaction type) pair.
em = np.load(str(conf.em_file))
print(f"em.shape={em.shape}")

em.shape=(5566809, 32)


In [8]:
# Load the pre-built faiss index that is searched in the inference section.
index = faiss.read_index(str(conf.index_file))
print(f"is_trained={index.is_trained}, ntotal={index.ntotal:,}")
# Sanity check: the index must hold exactly as many vectors as the embedding
# matrix has rows, since returned ids are decoded with the same row scheme.
assert index.ntotal==em.shape[0]

is_trained=True, ntotal=5,566,809


# JSON to Transactions

In [9]:
# Parse the test sessions. For every event we keep
#   * one metadata record in `rows` (session, article, recency, type), and
#   * the row of `em` holding that event's embedding, in `em_row_ids`.
# The query matrix is then gathered from `em` in a single vectorised step
# instead of accumulating millions of tiny per-row array views in a Python
# list and copying them afterwards.
rows = []
em_row_ids: List[int] = []
n_txn = len(conf.txn2id)
with open(str(conf.comp_dir / "test.jsonl")) as lines:
    for line in tqdm(lines, mininterval=10):
        jo = json.loads(line)
        session = int(jo["session"])
        events = jo["events"]
        last = len(events) - 1
        prev = None
        for i, event in enumerate(events):
            curr = int(event["ts"])
            # Events are expected in chronological order within a session;
            # recency below is derived purely from position.
            if prev is not None and curr < prev:
                raise ValueError("event out-of-order")
            aid = int(event["aid"])
            tid = conf.txn2id[event["type"]]
            # Embedding rows are laid out as vocabulary position * number of
            # transaction types + transaction type id.
            em_row_ids.append(article2id[aid] * n_txn + tid)
            rows.append({
                "session": session,
                "aid": aid,
                #"ts": curr,
                # 0 for the last event of the session, 1 for the one before, ...
                "recency": last - i,
                "type": tid,
            })
            prev = curr
# Integer-array indexing always returns a fresh array, so `copy=False` only
# avoids a second copy when `em` is already float32. An empty input yields a
# (0, dim) matrix rather than a 1-D empty array.
queries = em[np.asarray(em_row_ids, dtype=np.int64)].astype(np.float32, copy=False)
print(f"queries.shape={queries.shape}")

1671803it [00:35, 47334.89it/s]


queries.shape=(6928123, 32)


In [10]:
# One row per test event, in the order the events were read. That order must
# be kept as-is because later cells pair row number i of this frame with row i
# of the search results. Columns are narrowed to small integer dtypes.
df = pd.DataFrame.from_records(rows).astype(
    {
        "session": np.int32,
        "aid": np.int32,
        "recency": np.int32,
        "type": np.int8,
    }
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6928123 entries, 0 to 6928122
Data columns (total 4 columns):
 #   Column   Dtype
---  ------   -----
 0   session  int32
 1   aid      int32
 2   recency  int32
 3   type     int8 
dtypes: int32(3), int8(1)
memory usage: 85.9 MB


In [11]:
# Preview: recency counts down to 0 at the last event of each session.
df.head(10)

Unnamed: 0,session,aid,recency,type
0,12899779,59625,0,0
1,12899780,1142000,4,0
2,12899780,582732,3,0
3,12899780,973453,2,0
4,12899780,736515,1,0
5,12899780,1142000,0,0
6,12899781,141736,10,0
7,12899781,199008,9,0
8,12899781,57315,8,0
9,12899781,194067,7,0


In [12]:
# The embedding matrix, raw records, closed file handle and vocabulary are no
# longer needed once `queries` and `df` exist; drop them before the search.
del em, rows, lines, vocab
gc.collect()

21

# Inference

In [13]:
class Candidate(NamedTuple):
    """A recommended article together with its (recency-weighted) distance.

    Field order matters: instances compare as plain tuples, so sorting a list
    of candidates orders by ``dist`` first and by ``aid`` on ties.
    """
    # Distance of the hit after the recency penalty has been applied.
    dist: float
    # Original article id (not the vocabulary position).
    aid: int


# Create one empty candidate bucket per `session_type` value found in the
# sample submission file.
sub = pd.read_csv(str(conf.comp_dir / "sample_submission.csv"))
res: Dict[str, List[Candidate]] = {
    str(record.session_type): []
    for record in tqdm(sub.itertuples(), mininterval=10)
}

5015409it [00:14, 352217.37it/s]


In [14]:
%%time
# `conf.search_n` is used as the index's `nprobe` setting.
# NOTE(review): this assumes an IVF-style index where `nprobe` trades speed
# for recall — confirm against how the index file was built.
index.nprobe = conf.search_n
# For every event embedding, fetch its `conf.search_k` nearest index entries.
# Both returned arrays have one row per query and `search_k` columns.
# The recorded run of this cell took roughly 3h45m of wall time.
distances, ids = index.search(queries, conf.search_k)
print(f"distances.shape={distances.shape}\nids.shape={ids.shape}")
#distances = distances.astype(np.float16)
#distances = distances.astype(np.float16)

distances.shape=(6928123, 30)
ids.shape=(6928123, 30)
CPU times: user 14h 41min 26s, sys: 49.9 s, total: 14h 42min 16s
Wall time: 3h 45min 25s


In [15]:
# The query matrix and the index are not used after the search; release them.
del queries, index
gc.collect()

42

# Postprocess

In [16]:
# Turn the raw search output into per-"<session>_<type>" candidate lists.
#
# Row r of `ids` / `distances` belongs to row r of `df` (same event order).
# Each returned id encodes (vocabulary position, transaction type) as
# `position * n_txn + type_id`, mirroring how the queries were built.
# The distance of every hit is multiplied by exp(recency * step_factor), so
# hits coming from older events in a session get larger distances.
assert len(df) == ids.shape[0] == distances.shape[0], "df and search results are misaligned"
n_txn = len(conf.id2txn)
for t, hit_ids, hit_dists in tqdm(zip(df.itertuples(), ids, distances), mininterval=10):
    sid = int(t.session)
    # The recency penalty is the same for all hits of one event: compute once.
    decay = float(np.exp(int(t.recency) * conf.step_factor))
    # `.tolist()` converts the row to plain Python numbers in one call, which
    # avoids repeated numpy scalar indexing inside the inner loop.
    for hit, dist in zip(hit_ids.tolist(), hit_dists.tolist()):
        if hit < 0:
            # faiss pads a result row with label -1 when it finds fewer than k
            # neighbours; decoding -1 would fabricate a bogus candidate.
            continue
        k, tid = divmod(hit, n_txn)
        res[f"{sid}_{conf.id2txn[tid]}"].append(
            Candidate(
                dist=dist * decay,
                aid=id2article[k],
            )
        )

6928123it [32:19, 3572.89it/s]


In [17]:
# Everything needed from the events frame and the search output now lives in
# `res`; release the large arrays before building the submission rows.
del df, ids, distances
gc.collect()

21

In [18]:
# Build the submission rows: for every "<session>_<type>" bucket keep the
# `top` distinct articles with the smallest distance.
#
# When a bucket cannot supply `top` DISTINCT articles on its own, candidates
# from the session's other transaction types are merged in, in the order
# orders -> carts -> clicks, until enough distinct articles are available or
# the alternatives are exhausted. The check is made on distinct article ids,
# not on the raw candidate count: duplicates are dropped below, so counting
# them could leave a bucket with fewer than `top` labels even though the
# other buckets of the same session still had usable candidates.
top = 20
fallback_order = ["orders", "carts", "clicks"]
rows = []
for st, candidates in tqdm(res.items(), mininterval=10):
    # Work on a copy so the lists stored in `res` are never mutated; other
    # buckets of the same session may still need to borrow from them.
    pool = list(candidates)
    distinct_aids = {c.aid for c in pool}
    if len(distinct_aids) < top:
        sid, txn = st.split("_")
        for alt in fallback_order:
            if len(distinct_aids) >= top:
                break
            if alt == txn:
                continue
            borrowed = res["_".join([sid, alt])]
            pool += borrowed
            distinct_aids.update(c.aid for c in borrowed)
    # Candidates compare as (dist, aid) tuples: closest first.
    pool.sort()
    # Keep the best-ranked occurrence of each article, up to `top` of them.
    best = []
    seen: Set[int] = set()
    for c in pool:
        if len(best) >= top:
            break
        if c.aid not in seen:
            seen.add(c.aid)
            best.append(c)
    rows.append({
        "session_type": st,
        "labels": " ".join(str(c.aid) for c in best),
    })

100%|██████████| 5015409/5015409 [04:40<00:00, 17884.58it/s]


In [19]:
# Assemble the final frame (one row per session_type, labels as a
# space-separated string of article ids) and write it without the index.
sub = pd.DataFrame.from_records(rows)
sub.to_csv("submission.csv", index=False)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5015409 entries, 0 to 5015408
Data columns (total 2 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   session_type  object
 1   labels        object
dtypes: object(2)
memory usage: 76.5+ MB


In [20]:
# Eyeball the first few submission rows.
sub.head(10)

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 739920 44003 225835 653716 1315541 1738007 106205 91558 362884 33136 300672 1408031 1378973 331360 1314286 1300371 933671 50509 898293
1,12899779_carts,59625 739920 44003 225835 653716 1315541 1738007 106205 91558 362884 33136 300672 1408031 1378973 331360 1314286 1300371 933671 50509 898293
2,12899779_orders,59625 739920 44003 225835 653716 1315541 1738007 106205 91558 362884 33136 300672 1408031 1378973 331360 1314286 1300371 933671 50509 898293
3,12899780_clicks,736515 582732 973453 1419849 1735169 1142000 1712906 1524585 1056217 1537673 1758603 77422 1157882 487136 760500 1388466 837443 1032776 884502 1633746
4,12899780_carts,736515 582732 973453 1419849 1735169 1142000 1712906 1524585 1056217 1537673 1758603 77422 1157882 487136 760500 1388466 837443 1032776 884502 1633746
5,12899780_orders,736515 582732 973453 1419849 1735169 1142000 1712906 1524585 1056217 1537673 1758603 77422 1157882 487136 760500 1388466 837443 1032776 884502 1633746
6,12899781_clicks,141736 194067 199008 918667 57315 399703 1173473 1119163 1628918 1125095 1371398 640599 1496238 551645 959548 1777876 399992 473415 734026 1278664
7,12899781_carts,199008 210917 602249 1378031 833302 688932 1571584 601874 400500 1814567 623052 424835 1608542 1107961 1340329 452500 1504983 852793 1632472 44788
8,12899781_orders,199008 210917 602249 1378031 833302 688932 1571584 601874 400500 1814567 623052 424835 1608542 1107961 1340329 452500 1504983 852793 1632472 44788
9,12899782_clicks,229748 1674681 638410 413962 1494780 779477 542617 300208 404110 1050025 652735 856980 1274530 1844117 1066364 1547228 147414 139799 1082119 259674


# Debug

In [21]:
#!pip list