In [1]:
%load_ext autoreload
%autoreload 2

In [77]:
from pathlib import Path
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import CosineRecommender

In [3]:
DATA_FOLDER = Path("/data/otto/")

TRAIN_PROCESSED = DATA_FOLDER / "train_parquet"
TRAIN_PROCESSED.mkdir(parents=True, exist_ok=True)

TEST_PROCESSED = DATA_FOLDER / "test_parquet"
TEST_PROCESSED.mkdir(parents=True, exist_ok=True)

VALIDATION_PATH = DATA_FOLDER / "validation"
VALIDATION_PATH.mkdir(parents=True, exist_ok=True)

In [5]:
def make_sparse_matrix(df: pl.DataFrame):
    row = df["session"].to_numpy()
    col = df["aid"].to_numpy()
    weight = df["weight"].to_numpy().astype(np.float32)
    
    return csr_matrix((weight, (row, col)))

In [6]:
# load
val_df_train = pl.read_parquet(VALIDATION_PATH / "train.parquet", use_pyarrow=True)
val_df_valid_input = pl.read_parquet(VALIDATION_PATH / "valid.parquet", use_pyarrow=True)
val_df_valid_targets = pl.read_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)

In [52]:
df = pl.concat([val_df_train, val_df_valid_input])

In [51]:
def dataset_action_weights(df):
    df_action_weights = pl.DataFrame({
        "type": [0, 1, 2], "weight": [10, 30, 60]
    })

    return (
        df
        .join(df_action_weights, on="type", how="inner")
        .groupby(["session", "aid"])
        .agg([
            pl.sum("weight")
        ])
    )

def dataset_clicks_only(df):
    return (
        df
        .filter(pl.col("type") == 0)
        .groupby(["session", "aid"])
        .agg([
            pl.lit(1).alias("weight")
        ])
    )

In [53]:
df_act_w = dataset_action_weights(df)
df_clicks = dataset_clicks_only(df)

In [62]:
als_clicks = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    alpha=1.0,
    iterations=30,
    calculate_training_loss=True,
    use_gpu=True
)

In [63]:
train_data = make_sparse_matrix(df_clicks)
als_clicks.fit(train_data)

  0%|          | 0/30 [00:00<?, ?it/s]

In [69]:
bpr_clicks = BayesianPersonalizedRanking(
    factors=100,
    learning_rate=0.01,
    regularization=0.01,
    iterations=50,
    use_gpu=False
)

In [70]:
train_data = make_sparse_matrix(df_clicks)
bpr_clicks.fit(train_data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
als = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    alpha=1.0,
    iterations=30,
    calculate_training_loss=True,
    use_gpu=True
)

In [11]:
train_data = make_sparse_matrix(df)
als.fit(train_data)

  0%|          | 0/30 [00:00<?, ?it/s]

In [28]:
als_cpu = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    alpha=1.0,
    iterations=5,
    calculate_training_loss=True,
    use_gpu=False
)

In [29]:
train_data = make_sparse_matrix(df)
als_cpu.fit(train_data)

  0%|          | 0/5 [00:00<?, ?it/s]

In [49]:
bpr_cpu = BayesianPersonalizedRanking(
    factors=100,
    learning_rate=0.01,
    regularization=0.01,
    iterations=50,
    use_gpu=False
)

In [50]:
train_data = make_sparse_matrix(df)
bpr_cpu.fit(train_data)

  0%|          | 0/50 [00:00<?, ?it/s]

In [78]:
i2i_cosine = CosineRecommender(K=40)

In [80]:
train_data = make_sparse_matrix(df_act_w)
i2i_cosine.fit(train_data)



  0%|          | 0/1855603 [00:00<?, ?it/s]

In [None]:
test_users = val_df_valid_input["session"].unique().to_list()[:100_000]

In [81]:
b_sz = 1000
topk = 20

recs = []
for test_session in tqdm(range(0, len(test_users), b_sz)):
    test_sessions = test_users[test_session : test_session + b_sz]
    rec_items, scores = i2i_cosine.recommend(
        test_sessions, user_items=train_data[test_sessions], N=topk,
        filter_already_liked_items=False, recalculate_user=False
    )
    recs.extend(rec_items.tolist())

  0%|          | 0/100 [00:00<?, ?it/s]

In [82]:
# make submission

submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"]
topk = 20


for test_session, rec_items in tqdm(zip(test_users, recs)):
    session_types = [f"{test_session}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items)
    labels_list = [labels] * 3

    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

0it [00:00, ?it/s]

In [83]:
df_submission = pl.DataFrame(submission_dict)

In [84]:
df_submission

session_type,labels
str,str
"""11107743_click...","""1080743 495748..."
"""11107743_carts...","""1080743 495748..."
"""11107743_order...","""1080743 495748..."
"""11107744_click...","""859590 575644 ..."
"""11107744_carts...","""859590 575644 ..."
"""11107744_order...","""859590 575644 ..."
"""11107745_click...","""1099202 577762..."
"""11107745_carts...","""1099202 577762..."
"""11107745_order...","""1099202 577762..."
"""11107746_click...","""1700227 127442..."


In [44]:
def calc_valid_score(df_submission: pl.DataFrame):
    submission = df_submission.to_pandas()
    submission['session'] = submission.session_type.apply(lambda x: int(x.split('_')[0]))
    submission['type'] = submission.session_type.apply(lambda x: x.split('_')[1])
    submission.labels = submission.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    
    val_df_valid_targets = pl.read_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)
    test_labels = val_df_valid_targets.to_pandas()

    test_labels = submission.merge(test_labels, how='left', on=['session', 'type'])
    labels_null_idx = test_labels["ground_truth"].isnull()
    test_labels["ground_truth"].loc[labels_null_idx] = (
        test_labels["ground_truth"].loc[labels_null_idx]
        .apply(lambda x: [])
    )
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    
    recall_per_type = (
        test_labels.groupby(['type'])['hits'].sum() / 
        test_labels.groupby(['type'])['gt_count'].sum() 
    )

    action_weights = pd.Series({
        'clicks': 0.10, 
        'carts': 0.30, 
        'orders': 0.60
    })

    score = (recall_per_type * action_weights).sum()
    print(f"validation score: {score}")
    print(f"recall per type: {recall_per_type}")


In [25]:
# als - gpu
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.12609674616208058
recall per type: type
carts     0.107940
clicks    0.142314
orders    0.132472
dtype: float64


In [45]:
# als - gpu (sample)
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.12600046012868338
recall per type: type
carts     0.102874
clicks    0.139377
orders    0.135334
dtype: float64


In [39]:
# als - cpu (sample)
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.15625584506079204
recall per type: type
carts     0.125195
clicks    0.170358
orders    0.169436
dtype: float64


In [59]:
# bpr - cpu (sample)
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.03995772725629032
recall per type: type
carts     0.032207
clicks    0.040535
orders    0.043737
dtype: float64


In [68]:
# als - gpu (sample) - clicks only
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.05665466634203342
recall per type: type
carts     0.053701
clicks    0.076644
orders    0.054800
dtype: float64


In [76]:
# bpr - cpu (sample) - clicks only
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.03838407739846748
recall per type: type
carts     0.031678
clicks    0.040422
orders    0.041397
dtype: float64


In [85]:
# i2i_cosine - cpu (sample)
calc_valid_score(df_submission)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


validation score: 0.4979505889911498
recall per type: type
carts     0.333011
clicks    0.423629
orders    0.592807
dtype: float64
