In [1]:
from tqdm import tqdm
from copy import deepcopy
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
from sklearn.model_selection import train_test_split

In [2]:
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# !pip install fastparquet
# !pip install pyarrow

In [4]:
# train упорядочен хронологически
df = pd.read_parquet('train.parquet.gzip')
df.dtypes

user_id      int32
item_id      int32
timespent     int8
reaction      int8
dtype: object

In [5]:
# в items_meta для каждого item_id его автор и эмбеддинг содержания
items = pd.read_parquet(f'items_meta.parquet.gzip')
items.head(3)

Unnamed: 0,item_id,source_id,embeddings
0,0,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351..."
1,1,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04..."
2,2,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02..."


In [6]:
items = items.set_index('item_id')

In [7]:
# items['mean_timespent'] = df.groupby('item_id')['timespent'].mean().values

In [8]:
# items['mean_reaction'] = df.groupby('item_id')['reaction'].mean().values

In [9]:
# items['amount'] = df.groupby('item_id').size()

In [10]:
# np.corrcoef(items['amount'], items['mean_timespent'])

In [11]:
# np.corrcoef(items['amount'], items['mean_reaction'])

In [12]:
# np.corrcoef(items['mean_timespent'], items['mean_reaction'])

In [13]:
candidates = pd.read_parquet(f'fresh_candidates.parquet.gzip')
candidates.head(3)

Unnamed: 0,item_id
0,0
1,2
2,5


# ALS

In [14]:
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import precision_at_k, ndcg_at_k

In [15]:
ALL_USERS = df['user_id'].unique().tolist()
ALL_ITEMS = df['item_id'].unique().tolist()

In [16]:
user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

In [17]:
cand_item_ids = candidates['item_id'].map(item_map).values

In [18]:
df['user_id'] = df['user_id'].map(user_map)
df['item_id'] = df['item_id'].map(item_map)

In [20]:
timerank = []
user_time = dict(zip(ALL_USERS, np.ones_like(ALL_USERS)))
for i in tqdm(df.user_id.values):
    timerank.append(user_time[i])
    user_time[i] += 1

100%|███████████████████████████████████████| 144440015/144440015 [01:57<00:00, 1231706.68it/s]


In [34]:
inverse_timerank = []
for i in tqdm(df.user_id.values):
    user_time[i] -= 1
    inverse_timerank.append(user_time[i])

100%|███████████████████████████████████████| 144440015/144440015 [01:43<00:00, 1394111.32it/s]


In [38]:
timerank = np.array(timerank)
inverse_timerank = np.array(inverse_timerank)

In [284]:
def f_timerank(timerank, power=2, bias=0):
    return 1 / ((timerank + bias) ** power)

In [126]:
# timerank = np.ones_like(df['timespent'].values)

In [354]:
alpha = 30
power = 1
bias = 1

In [355]:
df['value'] = 1 + df['timespent'].values * f_timerank(inverse_timerank, power=power, bias=bias) * alpha

In [356]:
def to_user_item_coo(df):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    row = df['user_id'].values
    col = df['item_id'].values
    data = df['value'].values
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo

In [357]:
coo_train = to_user_item_coo(df)

In [358]:
N = 20

In [359]:
def split_data(df):
    df_train, df_val = train_test_split(df, test_size=0.3)
    return df_train, df_val

In [360]:
def get_val_matrices(df):
    """ Split into training and validation and create various matrices
        
        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    
    """
    df_train, df_val = split_data(df)
    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train.tocsr(),
            'csr_train': csr_train.tocsr(),
            'csr_val': csr_val.tocsr(),
          }

In [361]:
def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with ndcg@20
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    metric = ndcg_at_k(model, csr_train, csr_val, K=20, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> NDGC@20: {metric:6.5f}")
    return metric

In [349]:
matrices = get_val_matrices(df)

In [350]:
# %%time
# best_ndcg20 = 0
# '''
# Limited by the memory usage of this notebook, 
# the maximum factors we can have is 100
# should run for factors of 200, 500, 1000 if possible
# '''
# for factors in [384, 512, 784, 1024]:
#     for iterations in [5,]:
#         for regularization in [0.01, 0.1]:
#             ndcg20 = validate(matrices, factors, iterations, regularization, show_progress=False)
#             if ndcg20 > best_ndcg20:
#                 best_ndcg20 = ndcg20
#                 best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
#                 print(f"Best ndcg@20 found. Updating: {best_params}")

In [351]:
%%time

validate(matrices, 512, 5, 0.1, show_progress=False)

Factors: 512 - Iterations:  5 - Regularization: 0.100 ==> NDGC@20: 0.25262
CPU times: user 47.3 s, sys: 1.5 s, total: 48.8 s
Wall time: 49.5 s


0.2526196623169779

In [352]:
%%time

validate(matrices, 1024, 5, 0.1, show_progress=False)

KeyboardInterrupt: 

In [228]:
# 0.2875 alpha 20 1/x

In [327]:
%%time

validate(matrices, 1024, 5, 0.01, show_progress=False)

Factors: 1024 - Iterations:  5 - Regularization: 0.010 ==> NDGC@20: 0.28026
CPU times: user 1min 40s, sys: 1.74 s, total: 1min 42s
Wall time: 1min 42s


0.280257948900665

In [328]:
%%time

validate(matrices, 1024, 5, 1, show_progress=False)

Factors: 1024 - Iterations:  5 - Regularization: 1.000 ==> NDGC@20: 0.28582
CPU times: user 1min 40s, sys: 1.73 s, total: 1min 42s
Wall time: 1min 43s


0.28582427187522075

In [329]:
%%time

validate(matrices, 1024, 5, 2, show_progress=False)

Factors: 1024 - Iterations:  5 - Regularization: 2.000 ==> NDGC@20: 0.28585
CPU times: user 1min 40s, sys: 1.8 s, total: 1min 42s
Wall time: 1min 42s


0.2858545435552131

In [330]:
%%time

validate(matrices, 1024, 10, 2, show_progress=False)

Factors: 1024 - Iterations: 10 - Regularization: 2.000 ==> NDGC@20: 0.28767
CPU times: user 2min 45s, sys: 2.22 s, total: 2min 47s
Wall time: 2min 48s


0.2876717607339115

In [331]:
%%time

validate(matrices, 1024, 20, 2, show_progress=False)

Factors: 1024 - Iterations: 20 - Regularization: 2.000 ==> NDGC@20: 0.28661
CPU times: user 4min 52s, sys: 2.37 s, total: 4min 54s
Wall time: 4min 55s


0.2866098821318276

In [None]:
# 0.2899

In [108]:
%%time

validate(matrices, 1024, 7, 0.1, show_progress=False)

Factors: 1024 - Iterations:  7 - Regularization: 0.100 ==> NDGC@20: 0.28479
CPU times: user 2min 5s, sys: 1.71 s, total: 2min 6s
Wall time: 2min 7s


0.2847927109311539

In [None]:
# 0.288

In [363]:
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()

In [364]:
def train(coo_train, factors=100, iterations=15, regularization=0.01, show_progress=True):
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    return model

In [365]:
best_params = {'factors': 1024, 'iterations': 7, 'regularization': 0.5}

In [366]:
%%time

model = train(coo_train, **best_params)



  0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 2min 6s, sys: 1.79 s, total: 2min 8s
Wall time: 2min 8s


In [367]:
cand_item_ids = candidates['item_id'].map(item_map).values

In [368]:
test = pd.read_parquet('test.parquet.gzip')
test.head(3)

Unnamed: 0,user_id
0,7
1,8
2,9


In [369]:
test_users = test.user_id.map(user_map)

In [370]:
raw_recs, scores = model.recommend(
        test_users, csr_train[test_users], N=N,
        filter_already_liked_items=True, recalculate_user=False,
        items=cand_item_ids,
    )

In [371]:
recs = []
for r in raw_recs:
    recs.append([item_ids[i] for i in r])

In [372]:
test['predictions'] = recs

In [373]:
test.to_parquet('sample_submission_LAST.parquet.gzip', compression='gzip', engine='pyarrow')