In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
from pathlib import Path
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl

In [15]:
DATA_FOLDER = Path("/home/sirpantene/data/otto/")

TRAIN_PROCESSED = DATA_FOLDER / "train_parquet"
TRAIN_PROCESSED.mkdir(parents=True, exist_ok=True)

TEST_PROCESSED = DATA_FOLDER / "test_parquet"
TEST_PROCESSED.mkdir(parents=True, exist_ok=True)

VALIDATION_PATH = DATA_FOLDER / "validation"
VALIDATION_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
df_train = pl.read_parquet(TRAIN_PROCESSED, use_pyarrow=True)
df_test = pl.read_parquet(TEST_PROCESSED, use_pyarrow=True)

In [26]:
df = pl.concat([df_train, df_test])

In [27]:
df_action_weights = pl.DataFrame({
    "type": [0, 1, 2], "weight": [10, 30, 60]
})
df_action_weights

type,weight
i64,i64
0,10
1,30
2,60


In [28]:
df = (
    df
    .join(df_action_weights, on="type", how="inner")
    .groupby(["session", "aid"])
    .agg([
        pl.sum("weight")
    ])
)

In [29]:
df

session,aid,weight
i64,i64,i64
2236993,1632955,20
8777936,1836454,10
6287846,1630411,60
10088805,427755,10
2352607,950225,10
10502601,544172,10
9827080,402362,10
5205948,1296350,120
919843,1501832,20
6109568,64259,10


In [16]:
import numpy as np
from scipy.sparse import csr_matrix


def make_sparse_matrix(df: pl.DataFrame):
    row = df["session"].to_numpy()
    col = df["aid"].to_numpy()
    weight = df["weight"].to_numpy().astype(np.float32)
    
    return csr_matrix((weight, (row, col)))

In [43]:
train_data = make_sparse_matrix(df)

In [46]:
from implicit.nearest_neighbours import CosineRecommender

i2i = CosineRecommender(K=20)
i2i.fit(train_data)



  0%|          | 0/1855603 [00:00<?, ?it/s]

shape: (1671803,)
Series: 'session' [i64]
[
	12899779
	12899780
	12899781
	12899782
	12899783
	12899784
	12899785
	12899786
	12899787
	12899788
	12899789
	12899790
	...
	14571569
	14571570
	14571571
	14571572
	14571573
	14571574
	14571575
	14571576
	14571577
	14571578
	14571579
	14571580
	14571581
]

In [70]:
from tqdm.notebook import tqdm

# make submission

submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"]
topk = 20

for test_session in tqdm(df_test["session"].unique()):
    rec_items, scores = i2i.recommend(
        test_session, user_items=train_data[test_session], N=topk,
        filter_already_liked_items=False, recalculate_user=False
    )
    session_types = [f"{test_session}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items.tolist())
    labels_list = [labels] * 3
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1671803 [00:00<?, ?it/s]

In [73]:
df_submission = pl.DataFrame(submission_dict)

In [90]:
df_submission.write_csv("i2i_submission_already_likes.csv")

In [89]:
df_submission_old = pl.read_csv("i2i_submission.csv")
df_submission_old[6, 1]

'307090 530815 1583317 1098647 971432 1087247 1580052 660375 887589 243641 1638411 1262831 344362 1375462 499492 686451 1214650 1138514 1818836 466321'

In [66]:
df_submission = pl.DataFrame(submission_dict)
df_submission

session_type,labels
str,str
"""12899779_click...","""59625 1048547 ..."
"""12899779_carts...","""59625 1048547 ..."
"""12899779_order...","""59625 1048547 ..."
"""12899780_click...","""1305555 149304..."
"""12899780_carts...","""1305555 149304..."
"""12899780_order...","""1305555 149304..."
"""12899781_click...","""307090 530815 ..."
"""12899781_carts...","""307090 530815 ..."
"""12899781_order...","""307090 530815 ..."
"""12899782_click...","""1674956 155514..."


In [67]:
df_submission.write_csv("i2i_submission.csv")

In [69]:
!head -n 4 i2i_submission.csv

session_type,labels
12899779_clicks,59625 1048547 631502 1854910 894169 1281476 689970 372701 89335 1566282 1536875 1574761 637538 1257597 533678 1130911 410051 251302 1657590 988526
12899779_carts,59625 1048547 631502 1854910 894169 1281476 689970 372701 89335 1566282 1536875 1574761 637538 1257597 533678 1130911 410051 251302 1657590 988526
12899779_orders,59625 1048547 631502 1854910 894169 1281476 689970 372701 89335 1566282 1536875 1574761 637538 1257597 533678 1130911 410051 251302 1657590 988526


In [61]:
labels = " ".join(str(aid) for aid in rec_items.tolist())
labels_list = [labels] * 3
labels_list

['721740 54546 732172 1786340 443461 1634447 608039 1675043 1646902 675560 764161 1729059 1735041 1475231 966987 195031 1178646 924723 1721916 399565',
 '721740 54546 732172 1786340 443461 1634447 608039 1675043 1646902 675560 764161 1729059 1735041 1475231 966987 195031 1178646 924723 1721916 399565',
 '721740 54546 732172 1786340 443461 1634447 608039 1675043 1646902 675560 764161 1729059 1735041 1475231 966987 195031 1178646 924723 1721916 399565']

In [54]:
pl.read_csv(DATA_FOLDER / "sample_submission.csv")

session_type,labels
str,str
"""12899779_click...","""129004 126836 ..."
"""12899779_carts...","""129004 126836 ..."
"""12899779_order...","""129004 126836 ..."
"""12899780_click...","""129004 126836 ..."
"""12899780_carts...","""129004 126836 ..."
"""12899780_order...","""129004 126836 ..."
"""12899781_click...","""129004 126836 ..."
"""12899781_carts...","""129004 126836 ..."
"""12899781_order...","""129004 126836 ..."
"""12899782_click...","""129004 126836 ..."


In [53]:
scores

array([ 3.18585226,  3.41158093,  3.25154685,  4.93031801,  3.93303318,
        4.93031801,  3.78000176,  6.8475608 ,  5.84842444,  4.34813178,
        5.77252834,  5.33425874,  5.7430388 ,  4.91587677,  7.62893915,
        8.92217095,  7.9705282 ,  8.56730018, 11.69114577,  9.0834273 ])

# Validation

In this notebook we compute validation score for my other notebook here which submits an LB 0.575 solution. To compute validation, we just need to load parquet files from a different Kaggle dataset. Instead of loading the real train and real test data. We load the first 3 week of original train as "new train". And the last 1 week of original train as "new test". Then we train our model with "new train" and predict "new test". Finally we compute competition metric from our predictions. The data and code for validation comes from Radek here.

In [97]:
from datetime import datetime, timedelta

In [94]:
ts_valid_end = datetime(2022, 8, 28, 23, 59, 59, 999)
ts_valid_start = datetime(2022, 8, 22, 0, 0, 0, 0)

In [107]:
df_valid_sessions = (
    df_train
    .groupby("session")
    .agg([
        pl.min("ts").alias("start_ts")
    ])
    .filter(
        pl.col("start_ts")
        .cast(pl.Datetime(time_unit="ms", time_zone="Etc/GMT-2"))
        .is_between(ts_valid_start, ts_valid_end)
    )
)

df_train_sessions = (
    df_train
    .groupby("session")
    .agg([
        pl.min("ts").alias("start_ts")
    ])
    .filter(
        pl.col("start_ts")
        .cast(pl.Datetime(time_unit="ms", time_zone="Etc/GMT-2"))
        < ts_valid_start
    )
)

In [108]:
(
    df_train_sessions["session"].n_unique(),
    df_valid_sessions["session"].n_unique(),
)

(11107743, 1792036)

In [117]:
# new train and valid

val_df_train = (
    df_train
    .join(df_train_sessions.select(["session"]), on="session",how="inner")
    .filter(
        pl.col("ts")
        .cast(pl.Datetime(time_unit="ms", time_zone="Etc/GMT-2"))
        < ts_valid_start
    )
#     add filter sessions only with length >= 2
)

val_train_items = val_df_train.select([pl.col("aid")]).unique()

val_df_valid = (
    df_train
    .join(df_valid_sessions.select(["session"]), on="session",how="inner")
    .join(val_train_items, on="aid",how="inner")
)

In [121]:
# split valid for metrics purpose

val_df_valid = (
    val_df_valid
    .sort("ts")
    .with_columns([pl.lit(1).alias("ones"),])
    .with_columns([pl.col("ones").cumsum()
                   .over("session").alias("event_id"),])
    .with_columns([pl.max("event_id")
                   .over("session").alias("event_total"),])
)

In [124]:
import random


def _get_random_idx(events_num):
    return random.randint(1, events_num)


val_df_valid_session_split = (
    val_df_valid
    .groupby("session")
    .agg([
        pl.max("event_total")
    ])
    .with_columns([
        pl.col("event_total")
        .apply(lambda x: _get_random_idx(x))
        .alias("split_event_id")
    ])
)

In [151]:
val_df_valid_input = (
    val_df_valid
    .join(
        val_df_valid_session_split.select(["session", "split_event_id"]),
        on="session", how="inner"
    )
    .filter(pl.col("event_id") < pl.col("split_event_id"))
    .drop(["ones", "event_id", "event_total", "split_event_id"])
)

In [200]:
val_df_valid_targets = (
    val_df_valid
    .join(
        val_df_valid_session_split.select(["session", "split_event_id"]),
        on="session", how="inner"
    )
    .filter(pl.col("event_id") >= pl.col("split_event_id"))
    .sort(["session", "ts"])
    .groupby(["session", "type"])
    .agg([
        pl.col("ts").list().alias("ts"),
        pl.col("aid").unique().alias("uniq_aids"),
        pl.col("aid").list().alias("list_aids"),
    ])
    .with_columns([
        pl.col("list_aids").arr.head(1).alias("first_aid"),
    ])
    .with_columns([
        pl.when(pl.col("type") == 0)
        .then(pl.col("first_aid"))
        .otherwise(pl.col("uniq_aids"))
        .alias("ground_truth")
    ])
    .rename({"type": "type_id"})
    .drop(["uniq_aids", "list_aids", "first_aid"])
)

df_types_weights = pl.DataFrame({
    "type_id": [0, 1, 2], "type": ["clicks", "carts", "orders"]
})

val_df_valid_targets = (
    val_df_valid_targets
    .join(df_types_weights, on="type_id", how="inner")
    .sort(["session", "type_id"])
    .select(["session", "type", "ground_truth"])
)

In [158]:
val_df_train.head()

session,aid,ts,type
i64,i64,i64,i64
0,1517085,1659304800025,0
0,1563459,1659304904511,0
0,1309446,1659367439426,0
0,16246,1659367719997,0
0,1781822,1659367871344,0


In [155]:
val_df_valid_input.head()

session,aid,ts,type
i64,i64,i64,i64
11107743,1243310,1661126400192,0
11107744,220086,1661126401190,0
11107745,770581,1661126402749,0
11107746,1134602,1661126404790,0
11107747,1006198,1661126408858,0


In [175]:
val_df_valid_targets.head()

session,type,ground_truth
i64,str,list[i64]
11107743,"""clicks""",[224347]
11107744,"""clicks""",[1075085]
11107744,"""carts""","[1075085, 628545, ... 165989]"
11107745,"""clicks""",[549612]
11107746,"""clicks""",[1750662]


In [176]:
# save
val_df_train.write_parquet(VALIDATION_PATH / "train.parquet", use_pyarrow=True)
val_df_valid_input.write_parquet(VALIDATION_PATH / "valid.parquet", use_pyarrow=True)
val_df_valid_targets.write_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)

## check the model 

In [17]:
# load
val_df_train = pl.read_parquet(VALIDATION_PATH / "train.parquet", use_pyarrow=True)
val_df_valid_input = pl.read_parquet(VALIDATION_PATH / "valid.parquet", use_pyarrow=True)
val_df_valid_targets = pl.read_parquet(VALIDATION_PATH / "test_labels.parquet", use_pyarrow=True)

In [18]:
df_action_weights = pl.DataFrame({
    "type": [0, 1, 2], "weight": [10, 30, 60]
})

df = pl.concat([val_df_train, val_df_valid_input])

df = (
    df
    .join(df_action_weights, on="type", how="inner")
    .groupby(["session", "aid"])
    .agg([
        pl.sum("weight")
    ])
)

In [164]:
from implicit.nearest_neighbours import CosineRecommender

i2i = CosineRecommender(K=20)
train_data = make_sparse_matrix(df)
i2i.fit(train_data)



  0%|          | 0/1855603 [00:00<?, ?it/s]

In [192]:
from tqdm.notebook import tqdm

# make submission

submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"]
topk = 20

for test_session in tqdm(val_df_valid_input["session"].unique()):
    rec_items, scores = i2i.recommend(
        test_session, user_items=train_data[test_session], N=topk,
        filter_already_liked_items=True, recalculate_user=False
    )
    session_types = [f"{test_session}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items.tolist())
    labels_list = [labels] * 3
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1303355 [00:00<?, ?it/s]

In [223]:
df_submission = pl.DataFrame(submission_dict)

In [224]:
df_submission

session_type,labels
str,str
"""11107743_click...","""427247 1513353..."
"""11107743_carts...","""427247 1513353..."
"""11107743_order...","""427247 1513353..."
"""11107744_click...","""456690 256726 ..."
"""11107744_carts...","""456690 256726 ..."
"""11107744_order...","""456690 256726 ..."
"""11107745_click...","""561333 823143 ..."
"""11107745_carts...","""561333 823143 ..."
"""11107745_order...","""561333 823143 ..."
"""11107746_click...","""63044 1366325 ..."


In [225]:
submission = df_submission.to_pandas()

In [226]:
submission['session'] = submission.session_type.apply(lambda x: int(x.split('_')[0]))
submission['type'] = submission.session_type.apply(lambda x: x.split('_')[1])
submission.labels = submission.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])

In [227]:
test_labels = val_df_valid_targets.to_pandas()

In [228]:
test_labels

Unnamed: 0,session,type,ground_truth
0,11107743,clicks,[224347]
1,11107744,clicks,[1075085]
2,11107744,carts,"[165989, 178869, 628545, 1075085]"
3,11107745,clicks,[549612]
4,11107746,clicks,[1750662]
...,...,...,...
2215805,12899774,clicks,[33035]
2215806,12899775,clicks,[1760714]
2215807,12899776,clicks,[548599]
2215808,12899777,clicks,[384045]


In [229]:
test_labels = submission.merge(test_labels, how='left', on=['session', 'type'])
labels_null_idx = test_labels["ground_truth"].isnull()
test_labels["ground_truth"].loc[labels_null_idx] = (
    test_labels["ground_truth"].loc[labels_null_idx]
    .apply(lambda x: [])
)
test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["ground_truth"].loc[labels_null_idx] = (


In [230]:
recall_per_type = (
    test_labels.groupby(['type'])['hits'].sum() / 
    test_labels.groupby(['type'])['gt_count'].sum() 
)

action_weights = pd.Series({
    'clicks': 0.10, 
    'carts': 0.30, 
    'orders': 0.60
})

score = (recall_per_type * action_weights).sum()
print(f"validation score: {score}")

validation score: 0.13044809245985098


In [231]:
recall_per_type

type
carts     0.137547
clicks    0.233838
orders    0.109667
dtype: float64

## ALS

In [23]:
from implicit.als import AlternatingLeastSquares


als = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    alpha=1.0,
    iterations=3,
    calculate_training_loss=True,
    use_gpu=False
)

In [24]:
train_data = make_sparse_matrix(df)
als.fit(train_data)

  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
rec_items, scores = als.recommend(
    [1, 2, 3, 4, 5], user_items=train_data[[1, 2, 3, 4, 5]], N=topk,
    filter_already_liked_items=False, recalculate_user=False
)

In [34]:
test_users = val_df_valid_input["session"].unique().to_list()
b_sz = 1000

recs = []
for test_session in tqdm(range(0, len(test_users), b_sz)):
    test_sessions = test_users[test_session : test_session + b_sz]
    rec_items, scores = als.recommend(
        test_sessions, user_items=train_data[test_sessions], N=topk,
        filter_already_liked_items=False, recalculate_user=False
    )
    recs.extend(rec_items.tolist())

  0%|          | 0/1304 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [35]:
len(recs)

8000

In [25]:
from tqdm.notebook import tqdm

# make submission

submission_dict = {
    "session_type": [],
    "labels": [],
}

types = ["clicks", "carts", "orders"]
topk = 20


for test_session in tqdm():
    rec_items, scores = als.recommend(
        test_session, user_items=train_data[test_session], N=topk,
        filter_already_liked_items=False, recalculate_user=False
    )
    session_types = [f"{test_session}_{t}" for t in types]
    labels = " ".join(str(aid) for aid in rec_items.tolist())
    labels_list = [labels] * 3
    
    submission_dict["session_type"].extend(session_types)
    submission_dict["labels"].extend(labels_list)

  0%|          | 0/1303355 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def calc_valid_score(df_submission: pl.DataFrame):
    submission = df_submission.to_pandas()
    submission['session'] = submission.session_type.apply(lambda x: int(x.split('_')[0]))
    submission['type'] = submission.session_type.apply(lambda x: x.split('_')[1])
    submission.labels = submission.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])