# OTTO: Co-visitation Matrix

There exist products that are frequently viewed and bought together. Here we leverage this idea by computing a co-visitation matrix of products. It's done in the following way:

1. First we look at all pairs of events within the same session that are close to each other in time (< 1 day); only the last 30 events of each session are considered. We compute the co-visitation matrix $M_{aid1,aid2}$ by counting, across all sessions, how many sessions contain each $(aid1, aid2)$ pair (a pair is counted at most once per session).
2. For each $aid1$ we keep the 40 most frequent $aid2$:  `aid2 = argsort(M[aid1])[-40:]`
3. We produce test results by concatenating `tail(20)` of the test session events (see https://www.kaggle.com/code/simamumu/old-test-data-last-20-aid-get-lb0-947) with the most likely recommendations from the co-visitation matrix. These recommendations are generated by summing, over the AIDs in the session, the `aid2` counts obtained in step 2.


**Please, smash that thumbs up button and subscribe if you like this notebook!**

## Utils, imports

In [None]:
# numpy: mathematical functions, arrays and matrices, random number generators, linear algebra routines, Fourier transforms, and more
import numpy as np

# pandas: data analysis and manipulation tool
import pandas as pd

# tqdm: output a smart progress bar by wrapping around any iterable
from tqdm.notebook import tqdm

# glob: finds all the pathnames matching a specified pattern according to the rules used by the Unix shell
import glob

# multiprocessing: API for dividing work between multiple processes
import multiprocessing

# os: functions for interacting with the operating system
import os

# pickle:  converting a Python object into a byte stream to store it in a file/database, maintain program state across sessions, or transport data over the network
import pickle

# defaultdict: a dict subclass that never raises KeyError on lookup; a missing key is created with a default value produced by the supplied factory
from collections import defaultdict

# Counter: container that will hold the count of each of the elements present in the container
from collections import Counter

# DEBUG: when True, gen_aid_pairs() stops after the first four chunk files so the whole pipeline can be smoke-tested quickly
DEBUG=False


SAMPLING = 1  # Keep only sessions whose id is divisible by this value: 1 keeps every session, larger values subsample (faster, but fewer pairs)

In [None]:
TOP_40_CACHE = 'top_40_pairs.pkl'
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("gcloud")

    with open('/tmp/json', 'w+') as f:
        f.write(secret_value_0)
        
    !gcloud auth login --cred-file /tmp/json    
    !gsutil cp gs://nesp/top_40_pairs.pkl .        
        
except Exception  as ex:
    pass


## Generate AID pairs

In [None]:
# sys: manipulate different parts of the Python runtime environment
import sys

# gc = garbage collector: detects objects with reference cycles
import gc

def gen_pairs(df):
    """Return the distinct co-visited ``(aid_x, aid_y)`` pairs found in ``df``.

    Steps:

    1. keep only sessions whose id is divisible by the module-level
       ``SAMPLING`` constant;
    2. keep the last 30 rows of every remaining session;
    3. self-join on ``session`` and keep pairs of different AIDs whose
       timestamps differ by less than ``24 * 60 * 60 * 1000`` (one day,
       assuming ``ts`` is in milliseconds -- TODO confirm against the data);
    4. drop duplicates so that a pair is reported at most once per session.

    Args:
        df: DataFrame with at least the columns ``session``, ``aid`` and
            ``ts``.

    Returns:
        A 2-D NumPy array with one ``[aid_x, aid_y]`` row per
        (session, pair) combination.
    """
    max_gap = 24 * 60 * 60 * 1000

    sampled = df[df['session'] % SAMPLING == 0]

    # GroupBy.tail is the built-in equivalent of apply(lambda g: g.tail(30)):
    # it avoids one Python call per session and does not rely on
    # groupby.apply operating on the grouping column. Rows keep their input
    # order, which matches the previous result whenever each session's rows
    # are stored contiguously.
    recent = sampled.groupby('session', sort=False).tail(30)

    # Only these three columns are needed; dropping the rest before the
    # self-join keeps the quadratic intermediate as small as possible.
    recent = recent[['session', 'aid', 'ts']]
    paired = recent.merge(recent, on='session')

    close_in_time = (paired['ts_x'] - paired['ts_y']).abs() < max_gap
    different_aids = paired['aid_x'] != paired['aid_y']

    pairs = paired.loc[
        close_in_time & different_aids, ['session', 'aid_x', 'aid_y']
    ].drop_duplicates()
    return pairs[['aid_x', 'aid_y']].values
    

    
    
def gen_aid_pairs():
    """Build the top-40 co-visitation table from every parquet chunk.

    Each file matching ``../input/otto-chunk-data-inparquet-format/*_parquet/*``
    is read, split into 120 row-wise pieces and passed to ``gen_pairs`` on a
    process pool. The resulting pairs are then counted globally.

    When the module-level ``DEBUG`` flag is true, only the first four files
    are processed.

    Returns:
        dict mapping each ``aid1`` to a list of up to 40 ``(aid2, count)``
        tuples, most frequent first.

    Raises:
        FileNotFoundError: if no chunk file matches the glob pattern.
    """
    pattern = '../input/otto-chunk-data-inparquet-format/*_parquet/*'
    chunk_files = glob.glob(pattern)
    if not chunk_files:
        # Fail with a clear message instead of the opaque
        # "need at least one array to concatenate" raised further down.
        raise FileNotFoundError(f'No chunk files match {pattern!r}')

    all_pair_chunks = []
    # A single pool is reused for all files rather than starting and tearing
    # down the worker processes once per chunk.
    with multiprocessing.Pool() as p, tqdm(chunk_files, desc='Chunks') as prog:
        for idx, chunk_file in enumerate(prog):
            chunk = pd.read_parquet(chunk_file).drop(columns=['type'])

            # Split by row position: same partition sizes as
            # np.array_split(chunk, 120), without passing a DataFrame to
            # NumPy (which triggers a deprecation warning in recent pandas).
            row_groups = np.array_split(np.arange(len(chunk)), 120)
            parts = [chunk.iloc[rows] for rows in row_groups]

            pair_chunks = np.concatenate(p.map(gen_pairs, parts), axis=0)
            all_pair_chunks.append(pair_chunks)

            # Release the large intermediates before reading the next file.
            del chunk, parts, pair_chunks
            gc.collect()

            if DEBUG and idx >= 3:
                break

    df = pd.DataFrame(data=np.concatenate(all_pair_chunks), columns=['aid1', 'aid2'])
    # Selecting the 'aid2' column first gives every group as a Series, so the
    # result does not depend on how groupby.apply treats the grouping column.
    top_aids = (
        df.groupby('aid1')['aid2']
        .apply(lambda aid2s: Counter(aid2s).most_common(40))
        .to_dict()
    )
    return top_aids

In [None]:
if os.path.exists(TOP_40_CACHE):
    print('Reading top40 AIDs from cache')
    top_40 = pickle.load(open(TOP_40_CACHE, 'rb'))
else:
    top_40 = gen_aid_pairs()    
    with open('top_40_pairs.pkl', 'wb') as f:
        pickle.dump(top_40, f)
    !gsutil cp top_40_pairs.pkl gs://nesp/
        
len(top_40)

In [None]:
# Print the first twelve entries of the co-visitation table as a sanity check.
for position, (aid, neighbours) in enumerate(top_40.items()):
    if position > 11:
        break
    print(aid, neighbours)

## Test set inference

In [None]:
# Turn every list of (aid2, count) tuples into a Counter so that the counts of
# several source AIDs can be added together during inference.
top_40_cnt = {
    source_aid: Counter({neighbour: hits for neighbour, hits in ranked})
    for source_aid, ranked in top_40.items()
}

In [None]:
# Print the first five Counter entries as a sanity check.
for position, (aid, neighbour_counts) in enumerate(top_40_cnt.items()):
    if position > 4:
        break
    print(aid, neighbour_counts)

In [None]:
def load_test():
    """Read every test parquet chunk and return them as one DataFrame.

    All files under ``../input/otto-chunk-data-inparquet-format/test_parquet/``
    are concatenated with a fresh 0..n-1 index, and the ``ts`` column is cast
    to ``datetime64[ms]``.

    Returns:
        The concatenated test events.

    Raises:
        ValueError: raised by ``pd.concat`` when no chunk file is found.
    """
    # glob returns files in arbitrary order; sorting makes the row order of
    # the result reproducible between runs.
    chunk_files = sorted(glob.glob('../input/otto-chunk-data-inparquet-format/test_parquet/*'))
    dfs = [pd.read_parquet(chunk_file) for chunk_file in tqdm(chunk_files)]

    return pd.concat(dfs, ignore_index=True).astype({"ts": "datetime64[ms]"})

In [None]:
# Load every test parquet chunk into a single DataFrame used for inference below
test_df = load_test()

In [None]:
import itertools

def suggest_aids(df):
    """Return up to 20 distinct AID recommendations for one session.

    The distinct AIDs among the session's last 20 events come first, in
    order of first appearance. If that yields fewer than 20 AIDs, the list is
    topped up with the strongest candidates from the module-level
    ``top_40_cnt`` table: the counts of all session AIDs are summed and the
    most frequent AIDs not already in the session are appended.

    Args:
        df: events of a single session, with an ``aid`` column. Rows are
            expected in the order the caller wants ``tail(20)`` applied to.

    Returns:
        A list of at most 20 distinct AIDs.
    """
    recent = df.tail(20).aid.tolist()

    # De-duplicate while keeping order. Returning the raw tail would repeat
    # AIDs and waste some of the 20 prediction slots.
    session_aids = list(dict.fromkeys(recent))
    if len(session_aids) >= 20:
        # The session alone already fills all the slots.
        return session_aids

    # Sum the co-visitation counts over every AID seen in the session.
    seen = set(session_aids)
    candidates = Counter()
    for aid in session_aids:
        candidates.update(top_40_cnt.get(aid, Counter()))

    # At most len(seen) of the top 20 candidates can be filtered out, so
    # most_common(20) always leaves enough to fill the remaining slots
    # whenever that many candidates exist.
    extra = [aid2 for aid2, cnt in candidates.most_common(20) if aid2 not in seen]
    return session_aids + extra[:20 - len(session_aids)]

        

In [None]:
# Order the events inside every session, then build one recommendation list
# per session.
pred_df = (
    test_df
    .sort_values(["session", "type", "ts"])
    .groupby(["session"])
    .apply(suggest_aids)
)

In [None]:
# The same recommendation list is submitted for all three event types; only
# the suffix appended to the session id differs.
clicks_pred_df, orders_pred_df, carts_pred_df = (
    pd.DataFrame(pred_df.add_suffix(suffix), columns=["labels"]).reset_index()
    for suffix in ("_clicks", "_orders", "_carts")
)

In [None]:
# Show the per-session recommendation lists as the cell output
pred_df

In [None]:
# Stack the three per-type frames, serialise each AID list as a
# space-separated string and write the submission file.
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df["labels"].map(
    lambda aids: " ".join(str(aid) for aid in aids)
)
pred_df.to_csv("submission.csv", index=False)