In [1]:
from tqdm import tqdm
from copy import deepcopy
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

import implicit as imp

from scipy import sparse

In [2]:
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# !pip install fastparquet
# !pip install pyarrow

In [4]:
# train упорядочен хронологически
df = pd.read_parquet('train.parquet.gzip')
df.dtypes

user_id      int32
item_id      int32
timespent     int8
reaction      int8
dtype: object

In [5]:
# в items_meta для каждого item_id его автор и эмбеддинг содержания
items = pd.read_parquet(f'items_meta.parquet.gzip')
items.head(3)

Unnamed: 0,item_id,source_id,embeddings
0,0,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351..."
1,1,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04..."
2,2,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02..."


In [6]:
items = items.set_index('item_id')

In [7]:
items['mean_timespent'] = df.groupby('item_id')['timespent'].mean().values

In [8]:
items['mean_reaction'] = df.groupby('item_id')['reaction'].mean().values

In [9]:
items['amount'] = df.groupby('item_id').size()

In [10]:
items['amount'].describe().astype(int)

count    227606
mean        634
std        1959
min           3
25%          68
50%         172
75%         490
max      122986
Name: amount, dtype: int64

In [11]:
candidates = pd.read_parquet(f'fresh_candidates.parquet.gzip')
candidates.head(3)

Unnamed: 0,item_id
0,0
1,2
2,5


In [12]:
test = pd.read_parquet('test.parquet.gzip')
test.head(3)

Unnamed: 0,user_id
0,7
1,8
2,9


In [13]:
def get_embeddings(items_id):
    X = []
    for emb in items.loc[items_id].embeddings:
        v = np.array(emb)
        X.append(v)
    X = np.array(X)
    return X

In [14]:
from lightgbm import LGBMRanker

In [15]:
testset = set(test.user_id)

In [16]:
BASE = 1.4

In [17]:
POW = 1.4

In [18]:
import faiss

In [19]:
class Recommender:
    def __init__(self, top_n=20, nearest_n=1000, session_group=10, last_n=500, pop_thresh=21):
        self.top_n = top_n
        self.last_n = last_n
        self.nearest_n = nearest_n
        self.pop_thresh = pop_thresh
        self.session_group = session_group
        
    def fit(self, candidates):
        self.candidates = candidates
        cand_embds = get_embeddings(candidates.item_id.values)
        self.index = faiss.IndexFlatL2(cand_embds.shape[1])
        
        self.cand_idx_to_item = dict(zip(candidates.index.values, candidates.item_id.values))
        self.cand_item_to_idx = dict(zip(candidates.item_id.values, candidates.index.values))
        
        self.index.add(cand_embds)
        
        
    def fit_load(self, candidates, path='neighbors.csv'):
        self.NEIGHBORS = pd.read_csv(path)
        self.candidates = candidates
        self.cand_idx_to_item = dict(zip(candidates.index.values, candidates.item_id.values))
        self.cand_item_to_idx = dict(zip(candidates.item_id.values, candidates.index.values))
        
        
    def predict(self, user_id, user_df, min_ts=1000, min_size=1000):
        seen_item_ids = set(user_df.item_id.values)
        dislike_item_ids = set(user_df[user_df.reaction == -1].item_id.values)

        negative_items = list(dislike_item_ids)

        # do smth if too low
        positive_items = user_df[(user_df.timespent != 0) & (user_df.reaction != -1)].item_id.values
            
        if len(user_df) < 10:
            nearest_n = self.nearest_n + 10
        else:
            nearest_n = self.nearest_n
        
        p_nearest_n = nearest_n
        n_nearest_n = 10
        
        pos_embds = get_embeddings(positive_items)
        positive_neighbors = set(self.candidates.iloc[self.index.search(pos_embds, k=p_nearest_n)[1].flatten()].item_id.values)
        if len(negative_items):
            neg_embds = get_embeddings(negative_items)
            negative_neighbors = set(self.candidates.iloc[self.index.search(neg_embds, k=p_nearest_n)[1].flatten()].item_id.values)

#         negative_neighbors = set(self.candidates.iloc[self.NEIGHBORS.loc[negative_items].values[:, :n_nearest_n].flatten()].item_id.values)
#         positive_neighbors = set(self.candidates.iloc[self.NEIGHBORS.loc[positive_items].values[:, :p_nearest_n].flatten()].item_id.values)
        
        candidates_item_ids = positive_neighbors - seen_item_ids
        
        if len(negative_items):
            candidates_item_ids -= negative_neighbors
        
        # pop recomend
        if (user_df.timespent.mean() < min_ts) or (len(user_df) < min_size):
            pop = items[['mean_timespent']].loc[candidates_item_ids]
            return pop.sort_values(by='mean_timespent', ascending=False)[:self.top_n].index.values
            
        # rerank personal
        X = get_embeddings(user_df.item_id.values)
        y = user_df.timespent.rank().astype(int)        
        query_train = [len(y) % self.session_group] + [self.session_group for _ in range(len(y) // self.session_group)]
        
        ranker = LGBMRanker(
            metric="ndcg",
            objective="lambdarank",
            label_gain=BASE**np.arange(1, len(y)+2, dtype=float),
            random_state=42,
            n_jobs=4,
            reg_lambda=10,
        )
        
        ranker.fit(
            X, y, group=query_train,
            sample_weight=BASE**np.arange(1, len(y)+1),
        )
        
        candidates_df = items[[]].loc[candidates_item_ids]
        
        candidates_embeddings = get_embeddings(candidates_df.index)
        candidates_df['prediction'] = ranker.predict(candidates_embeddings)
              
        return candidates_df.sort_values(by='prediction', ascending=False)[:self.top_n].index.values
    

In [20]:
model = Recommender(nearest_n=1000, session_group=10)
model.fit(candidates)

In [21]:
predictions = dict()
for user_id, user_df in tqdm(df[['user_id', 'item_id', 'timespent', 'reaction']].groupby('user_id'), total=len(df.user_id.unique())):
    if user_id not in testset:
        continue
    pred = model.predict(user_id, user_df, min_ts=0.3, min_size=60)
    predictions[user_id] = pred

100%|██████████| 1000183/1000183 [5:06:08<00:00, 54.45it/s]  


In [22]:
test_predictions = []
for user_id in test.user_id:
    test_predictions.append(predictions[user_id])
test['predictions'] = test_predictions

test.head(3)

Unnamed: 0,user_id,predictions
0,7,"[44269, 58977, 227420, 130122, 24553, 142350, ..."
1,8,"[37449, 12504, 31620, 108607, 211427, 149924, ..."
2,9,"[125635, 198837, 167167, 99502, 151791, 62582,..."


In [23]:
test.to_parquet('sample_submission_last_chance.parquet.gzip', compression='gzip', engine='pyarrow')
test.to_parquet('sample_submission.parquet.gzip', compression='gzip', engine='pyarrow')

# Debug

In [1049]:
X = get_embeddings(user_df.item_id.values)
# y = user_df.timespent.values
y = user_df.timespent.rank().astype(int)

In [1146]:
ranker = LGBMRanker(
    metric="ndcg",
    n_estimators=30,
    objective="lambdarank",
    label_gain=(np.arange(1, len(y)+2))
)

In [1147]:
query_train = [len(y)]

In [1149]:
N = 20

In [1150]:
query_train = [len(y) % N] + [N for _ in range(len(y) // N)]

In [1151]:
user_df['y'] = user_df.timespent.rank().astype(int) 

In [1152]:
ranker.fit(X, y, group=query_train,)

LGBMRanker(label_gain=array([2.71828183e+000, 7.38905610e+000, 2.00855369e+001, 5.45981500e+001,
       1.48413159e+002, 4.03428793e+002, 1.09663316e+003, 2.98095799e+003,
       8.10308393e+003, 2.20264658e+004, 5.98741417e+004, 1.62754791e+005,
       4.42413392e+005, 1.20260428e+006, 3.26901737e+006, 8.88611052e+006,
       2.41549528e+007, 6.56599691e+007, 1.78482301e+008, 4.85165195e+008,
       1...
       3.29219761e+158, 8.94912093e+158, 2.43262328e+159, 6.61255566e+159,
       1.79747899e+160, 4.88605447e+160, 1.32816731e+161, 3.61033306e+161,
       9.81390275e+161, 2.66769535e+162, 7.25154779e+162, 1.97117506e+163,
       5.35820935e+163, 1.45651231e+164, 3.95921094e+164, 1.07622512e+165,
       2.92548318e+165, 7.95228776e+165, 2.16165593e+166, 5.87599004e+166,
       1.59725969e+167, 4.34180200e+167]),
           metric='ndcg', n_estimators=30, objective='lambdarank')

In [1153]:
user_df['predict'] = ranker.predict(X)

In [1154]:
user_df.sort_values(by='predict', ascending=False).head(20)

Unnamed: 0,user_id,item_id,timespent,y,predict
32608332,687,220136,22,384,3.148048
113000825,687,208951,17,381,3.066361
41428178,687,69719,19,383,2.891415
129934101,687,55078,5,363,2.887321
51511521,687,107249,9,374,2.88485
96601340,687,148589,8,372,2.874132
53068956,687,38823,58,385,2.770502
13999550,687,212337,8,372,2.710167
60642717,687,166776,10,376,2.690265
68674000,687,221911,8,372,2.648515


In [1143]:
user_df.sort_values(by='timespent', ascending=False)

Unnamed: 0,user_id,item_id,timespent,y,predict
53068956,687,38823,58,385,0.890033
32608332,687,220136,22,384,-0.304008
41428178,687,69719,19,383,-0.378559
40596528,687,153924,18,382,3.014997
113000825,687,208951,17,381,-0.948236
...,...,...,...,...,...
57983424,687,194910,0,158,-2.205393
54204498,687,7090,0,158,-2.340311
52962923,687,91352,0,158,-2.521262
52962921,687,179317,0,158,-2.357891


In [1115]:
user_df

Unnamed: 0,user_id,item_id,timespent,y
440592,687,4477,0,158
440593,687,25558,0,158
441575,687,159003,0,158
513592,687,8426,0,158
562894,687,18832,0,158
...,...,...,...,...
142180160,687,119702,0,158
142180161,687,186901,0,158
142180162,687,104811,0,158
142180163,687,26993,0,158


# ==============================================