In [1]:
from tqdm import tqdm
from copy import deepcopy
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

import implicit as imp

from scipy import sparse

In [2]:
# !pip install fastparquet
# !pip install pyarrow

In [3]:
TIMESPENT_COEF = 1
REACTION_COEF = 10

In [4]:
# train упорядочен хронологически
df = pd.read_parquet('train.parquet.gzip')
df.dtypes

user_id      int32
item_id      int32
timespent     int8
reaction      int8
dtype: object

In [188]:
df

Unnamed: 0,user_id,item_id,timespent,reaction
0,707536,67950,0,0
1,707536,151002,0,0
2,707536,134736,0,0
3,707536,196151,0,0
4,707536,94182,0,0
...,...,...,...,...
144440010,849764,80910,0,0
144440011,993316,132328,0,0
144440012,993316,186701,0,0
144440013,666981,81857,0,0


In [5]:
# в items_meta для каждого item_id его автор и эмбеддинг содержания
items = pd.read_parquet(f'items_meta.parquet.gzip')
items.head(3)

Unnamed: 0,item_id,source_id,embeddings
0,0,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351..."
1,1,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04..."
2,2,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02..."


In [6]:
items = items.set_index('item_id')

In [7]:
items['mean_timespent'] = df.groupby('item_id')['timespent'].mean().values

In [8]:
items['mean_reaction'] = df.groupby('item_id')['reaction'].mean().values

In [9]:
items['amount'] = df.groupby('item_id').size()

In [10]:
items['amount'].describe().astype(int)

count    227606
mean        634
std        1959
min           3
25%          68
50%         172
75%         490
max      122986
Name: amount, dtype: int64

In [11]:
np.corrcoef(items['amount'], items['mean_timespent'])

array([[1.       , 0.0991944],
       [0.0991944, 1.       ]])

In [12]:
np.corrcoef(items['amount'], items['mean_reaction'])

array([[ 1.        , -0.06108216],
       [-0.06108216,  1.        ]])

In [13]:
np.corrcoef(items['mean_timespent'], items['mean_reaction'])

array([[1.        , 0.28293847],
       [0.28293847, 1.        ]])

In [14]:
candidates = pd.read_parquet(f'fresh_candidates.parquet.gzip')
candidates.head(3)

Unnamed: 0,item_id
0,0
1,2
2,5


In [15]:
def get_embeddings(items_id):
    X = []
    for emb in items.loc[items_id].embeddings:
        v = np.array(emb)
        X.append(v)
    X = np.array(X)
    return X

# Candidates selection

In [16]:
from sklearn.neighbors import NearestNeighbors

In [18]:
n_neighbors = 100

In [19]:
knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)

In [20]:
fresh_embeddings = get_embeddings(candidates.item_id.values)

In [21]:
knn.fit(fresh_embeddings)

NearestNeighbors(metric='cosine', n_neighbors=100)

In [22]:
all_embeddings = get_embeddings(items.index)

### closest fresh candidates to old embds

# Fit

In [36]:
# %%time

# DIST, NEIGHBORS = knn.kneighbors(all_embeddings)

In [34]:
NEIGHBORS = pd.read_csv('neighbors.csv')

# Check

In [37]:
user_id = np.random.choice(df.user_id)

In [38]:
user_df = df[df.user_id == user_id]

In [39]:
len(user_df)

219

In [40]:
user_df.timespent.mean()

0.1872146118721461

In [47]:
NEIGHBORS.loc[user_df.item_id].values.flatten()

array([26618, 71399,  4498, ..., 99094,  3059, 51336])

In [63]:
recs = items.loc[candidates.loc[NEIGHBORS.loc[user_df.item_id].values.flatten()].item_id.values].index.values

In [55]:
recs = np.random.choice(candidates.item_id.values, len(recs))

In [56]:
len(set(recs))

19686

In [57]:
len(user_df.item_id.values)

219

In [58]:
len(set(recs) & set(user_df.item_id.values))

13

In [59]:
len(set(recs) & set(user_df.item_id.values)) / len(user_df.item_id.values)

0.0593607305936073

# Predict

In [68]:
test = pd.read_parquet('test.parquet.gzip')
test.head(3)

Unnamed: 0,user_id
0,7
1,8
2,9


In [69]:
testset = set(test.user_id)

In [83]:
candidates.loc[0]

item_id    0
Name: 0, dtype: int32

In [85]:
candidates.head(4)

Unnamed: 0,item_id
0,0
1,2
2,5
3,6


In [95]:
candidates_item_ids = candidates.iloc[NEIGHBORS.loc[user_df.item_id].values.flatten()].item_id.values

In [96]:
candidates_item_ids

array([190412, 189861, 179615, ...,   8663, 133371, 139276], dtype=int32)

In [104]:
candidates_item_ids

array([190412, 189861, 179615, ...,   8663, 133371, 139276], dtype=int32)

In [107]:
recs

Unnamed: 0_level_0,mean_timespent
item_id,Unnamed: 1_level_1
221189,0.518135
81928,0.084142
16393,0.952381
131081,3.116319
81930,0.287879
...,...
16373,0.837209
32759,1.761628
40952,0.019868
196605,0.947368


In [111]:
candidates_item_ids = set(candidates.iloc[NEIGHBORS.loc[user_df.item_id].values.flatten()].item_id.values)
seen_item_ids = set(user_df.item_id.values)
candidates_item_ids -= seen_item_ids

In [116]:
recs = items['mean_timespent'].loc[candidates_item_ids]

In [117]:
recs

item_id
221189    0.518135
81928     0.084142
16393     0.952381
131081    3.116319
81930     0.287879
            ...   
16373     0.837209
32759     1.761628
40952     0.019868
196605    0.947368
65535     0.547862
Name: mean_timespent, Length: 2454, dtype: float64

In [178]:
user_id_size = df.groupby('user_id').size()

In [185]:
user_id_nonzero_size = df[df.timespent != 0].groupby('user_id').size()

In [179]:
user_id_size.mean()

144.41358731352162

In [183]:
user_id_size.describe().round(3)

count    1000183.000
mean         144.414
std          124.067
min            6.000
25%           53.000
50%          104.000
75%          198.000
max         1018.000
dtype: float64

In [187]:
len(df[df.timespent != 0]) / len(df)

0.16324749066247327

In [186]:
user_id_nonzero_size.describe().round(3)

count    1000183.000
mean          23.575
std           22.668
min            5.000
25%            9.000
50%           15.000
75%           30.000
max          220.000
dtype: float64

In [120]:
recs.sort_values().index[-10:]

Int64Index([144302, 117235, 179708, 177232, 89254, 175236, 131376, 50439,
            36149, 50389],
           dtype='int64', name='item_id')

In [189]:
items

Unnamed: 0_level_0,source_id,embeddings,mean_timespent,mean_reaction,amount
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351...",0.666667,0.133333,15
1,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04...",0.398168,0.002819,1419
2,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02...",1.578616,0.081761,159
3,14734,"[0.049901545, 0.039079394, -0.03890682, -0.053...",0.300000,0.000000,230
4,22557,"[0.09303163, 0.023448057, 0.0029488814, -0.017...",0.214350,0.000897,1115
...,...,...,...,...,...
227601,19043,"[0.06742832, -0.08209568, -0.04407321, 0.00838...",0.183784,0.010811,185
227602,9384,"[0.07055114, -0.007334651, -0.0032477665, 0.00...",0.125000,0.010417,96
227603,24152,"[0.13771634, 0.023559753, 0.012204557, -0.0361...",0.160000,0.080000,25
227604,20249,"[0.04954276, -0.00674311, -0.040121585, -0.024...",0.379310,0.000000,29


In [222]:
top_n = 20

In [230]:
last_n = 20

In [200]:
nearest_n = 10
# 3 < 10 < 20

In [214]:
AMOUNT_W = 5
TIMESPENT_W = 10
REACTION_W = 30

items['weight'] =  items['amount'] * AMOUNT_W + items['mean_timespent'] * TIMESPENT_W + items['mean_reaction'] * REACTION_W

In [224]:
drop_zero_timespent = True

In [225]:
drop_dislike = False

In [227]:
predictions = []
for user_id, user_df in tqdm(df[['user_id', 'item_id', 'timespent']].groupby('user_id'), total=len(df.user_id.unique())):
    if user_id in testset:
        if drop_zero_timespent:
            user_df = user_df[user_df.timespent != 0]
        if drop_dislike:
            user_df = user_df[user_df.reaction != -1]
        
        candidates_item_ids = set(candidates.iloc[NEIGHBORS.loc[user_df.item_id.values[-last_n:]].values[:, :nearest_n].flatten()].item_id.values)
        seen_item_ids = set(user_df.item_id.values)
        candidates_item_ids -= seen_item_ids
        
        recs = items['mean_timespent'].loc[candidates_item_ids]
        
        pred = recs.sort_values().index[-top_n:]
        predictions.append((user_id, pred))

100%|██████████| 1000183/1000183 [03:28<00:00, 4792.93it/s]


In [228]:
map_test = dict(predictions)

test_predictions = []
for user_id in test.user_id:
    test_predictions.append(map_test[user_id].values)
test['predictions'] = test_predictions

test.head(3)

Unnamed: 0,user_id,predictions
0,7,"[9378, 113971, 6163, 57410, 9577, 215840, 4570..."
1,8,"[29877, 21253, 98378, 142183, 61323, 209588, 3..."
2,9,"[57065, 9334, 115786, 40158, 99613, 215720, 21..."


In [198]:
test.to_parquet('sample_submission_knn_last_20_top_10_account_nonzero_time_weighted.parquet.gzip', compression='gzip', engine='pyarrow')

In [229]:
test.to_parquet('sample_submission.parquet.gzip', compression='gzip', engine='pyarrow')

# ==============================================

# Pipeline

In [20]:
VAL_SIZE = 0.2

In [15]:
VAL_N = int(len(df) * VAL_SIZE)

In [16]:
val = df[-VAL_N:].copy()

In [17]:
train = df[:-VAL_N]

In [14]:
user_id = 707536

In [15]:
def get_embeddings(items_id):
    X = []
    for emb in items.loc[items_id].embeddings:
        v = np.array(emb)
        X.append(v)
    X = np.array(X)
    return X

In [16]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

In [18]:
n_neighbors = 100

In [19]:
knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)

In [22]:
fresh_embeddings = get_embeddings(candidates.item_id.values)

In [23]:
fresh_embeddings.shape

(100000, 312)

In [33]:
knn.fit(fresh_embeddings)

NearestNeighbors(metric='cosine', n_neighbors=100)

In [26]:
all_embeddings = get_embeddings(items.index)

# predict indecies in fresh, not item_id!!!!

In [39]:
candidates.loc[neighbors.flatten()].head(3)

Unnamed: 0,item_id
25529,58267
58754,134359
55917,127779


# Now predict on all history

In [40]:
all_embeddings = get_embeddings(items.index)

In [41]:
all_embeddings.shape

(227606, 312)

In [43]:
# %%time

# dist, neighbors = knn.kneighbors(all_embeddings)

CPU times: user 12min 50s, sys: 4min 20s, total: 17min 10s
Wall time: 6min 8s


In [44]:
neighbors.shape

(227606, 100)

In [333]:
user_id = np.random.choice(df.user_id)

In [334]:
user_df = df[df.user_id == user_id]

In [335]:
len(user_df)

592

In [336]:
user_df.timespent.mean()

0.3277027027027027

In [337]:
user_embs = get_embeddings(user_df.item_id.values)

In [338]:
user_embs.shape

(592, 312)

In [353]:
dist, neighbors = knn.kneighbors(user_embs, n_neighbors=1000)

In [354]:
recs = candidates.loc[neighbors.flatten()].item_id.values

In [355]:
# recs = np.random.choice(candidates.item_id.values, len(recs))

In [356]:
len(set(recs))

64799

In [357]:
len(user_df.item_id.values)

592

In [360]:
len(set(recs) & set(user_df.item_id.values))

185

In [359]:
len(set(recs) & set(user_df.item_id.values)) / len(user_df.item_id.values)

0.3125

In [278]:
from lightgbm import LGBMRanker, LGBMRegressor
from xgboost import XGBRanker

In [279]:
regressor = LGBMRegressor()

In [276]:
ranker = XGBRanker()

In [392]:
ranker = LGBMRanker(silent=False)

In [414]:
group_n = 30
group = [group_n for _ in range(len(user_df) // group_n)] + [len(user_df)%group_n]

In [415]:
ranker.fit(user_embs, user_df.timespent, group=group)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61711
[LightGBM] [Info] Number of data points in the train set: 592, number of used features: 312


LGBMRanker(silent=False)

In [416]:
pred = ranker.predict(user_embs)

In [417]:
pred.shape

(592,)

In [418]:
user_df.shape

(592, 4)

In [423]:
sorted(pred)

[-9.18338826587186,
 -9.16716032257506,
 -8.997360510543409,
 -8.90529202482047,
 -8.80591539553615,
 -8.673408618002442,
 -8.654167586853792,
 -8.63299198096862,
 -8.592220843783986,
 -8.533238692907695,
 -8.460059935904695,
 -8.433673419380462,
 -8.367221043164465,
 -8.32693165214435,
 -8.320741169116793,
 -8.304365745461256,
 -8.30112539158349,
 -8.286103417295344,
 -8.275394635629349,
 -8.267864179371584,
 -8.258271203107725,
 -8.208658037759395,
 -8.171377876215386,
 -8.158136343076466,
 -8.111469057110464,
 -8.085309121437975,
 -8.078384543185358,
 -8.069617458044089,
 -7.974209980682069,
 -7.964499549220641,
 -7.934997607128736,
 -7.913068731755915,
 -7.912947768241971,
 -7.906001420133941,
 -7.896460655277106,
 -7.881983764588462,
 -7.848102328849781,
 -7.8466668474177785,
 -7.8464830615765635,
 -7.823569633794056,
 -7.822765228683978,
 -7.816367164305387,
 -7.762757846700294,
 -7.758760849883678,
 -7.7460488371912355,
 -7.744260682410484,
 -7.723186514250608,
 -7.7217399866584

In [419]:
print(user_df.iloc[sorted(pred)].timespent.values)

[6 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [422]:
user_df.timespent.sort_values()

1835874     0
38409318    0
38409317    0
38259171    0
38259170    0
           ..
93859465    6
50500873    6
83455852    7
23454258    8
2520703     8
Name: timespent, Length: 592, dtype: int8

In [420]:
sorted(user_df.timespent.values, reverse=1)

[8,
 8,
 7,
 6,
 6,
 6,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [377]:
user_df.timespent.sort_values(ascending=False)

2520703     8
23454258    8
83455852    7
50500873    6
93859465    6
           ..
38259170    0
38259171    0
38409317    0
38409318    0
1835874     0
Name: timespent, Length: 592, dtype: int8

In [281]:
user_embeddings

array([[ 0.02768344, -0.02168952, -0.03133417, ..., -0.03015773,
        -0.01166234, -0.01695309],
       [ 0.00384737, -0.02367289, -0.09630792, ...,  0.05162415,
         0.04562232, -0.04742779],
       [ 0.0565061 ,  0.07324602, -0.01643917, ...,  0.03429513,
         0.03288537, -0.06232993],
       ...,
       [ 0.00449147,  0.04834878, -0.03990331, ..., -0.00596084,
        -0.04587894, -0.07931127],
       [ 0.03671985, -0.02965335,  0.00051705, ..., -0.08089783,
        -0.04609144, -0.03568521],
       [ 0.10018815, -0.00553575, -0.01395857, ...,  0.03861998,
         0.01662912, -0.07119339]], dtype=float32)

In [79]:
test = pd.read_parquet('test.parquet.gzip')
test.head(3)

Unnamed: 0,user_id
0,7
1,8
2,9


In [80]:
for user_id in tqdm(test.user_id):
    recs = candidates.loc[neighbors[user_df.item_id.values].flatten()].item_id.values

  6%|▌         | 11447/200000 [00:08<02:25, 1291.57it/s]


KeyboardInterrupt: 

In [26]:
user_id = np.random.choice(df.user_id.values)

In [27]:
user_vecs = get_embeddings(df[df.user_id == user_id].item_id.values)

In [28]:
target_items = df[df.user_id == user_id].item_id.values

In [30]:
len(target_items)

412

In [31]:
# vec = X[0]

In [32]:
dist, neighbors = knn.kneighbors(user_vecs)

In [33]:
neighbors.shape

(412, 3)

In [34]:
len(set(neighbors.flatten())) / len(list(neighbors.flatten()))

0.8932038834951457

In [35]:
len(set(neighbors.flatten()) & set(target_items)) / len(target_items)

1.0

In [38]:
len(set(neighbors.flatten()) & set(target_items)) / len(items)

0.0018101456024885109

In [672]:
class KNNRecommender:
    def __init__(self, top_n=20, n_neighbors=100, last_n=300, metric='cosine', n_jobs=-1):
        self.top_n = top_n
        self.last_n = last_n
        self.knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, n_jobs=n_jobs)
        
    def fit(self, candidates):
        self.embeddings = get_embeddings(candidates.item_id)
        self.knn.fit(self.embeddings)
        self.candidates = candidates
        self.cand_idx_to_item = dict(zip(candidates.index.values, candidates.item_id.values))
        self.cand_item_to_idx = dict(zip(candidates.item_id.values, candidates.index.values))
    
    def predict(self, user_id, user_df):
        # get user history account last_n
        user_embeddings = get_embeddings(user_df.item_id.values[:self.last_n])
        
        # get neighbors from fresh for user history
        dist, neighbors = self.knn.kneighbors(user_embeddings)
        
        return dist, neighbors
#         # naive recs extraction, item based filtering
#         recs = set([self.cand_idx_to_item[i] for i in neighbors.flatten()])
        
#         seen_item_ids = set(user_df.item_id.values)
        
#         # remove from recs seen topics
#         recs -= seen_item_ids
                
#         candidates = items['mean_timespent'].loc[recs]

#         # naive rerank
#         return candidates.sort_values().index.values[-self.top_n:]

In [673]:
test = pd.read_parquet('test.parquet.gzip')
test.head(3)

Unnamed: 0,user_id
0,7
1,8
2,9


In [675]:
model = KNNRecommender(top_n=20, n_neighbors=100, last_n=50)
model.fit(candidates)

In [676]:
dist, neighbors = model.predict(user_id, user_df)

In [678]:
dist.shape

(50, 100)

In [None]:
# last 50 embds calc 1.30 min

In [644]:
self = KNNRecommender(top_n=20, n_neighbors=100, last_n=50, n_jobs=1)
self.fit(candidates)

In [652]:
df2 = df.copy()

In [653]:
df2 = df2.set_index('user_id')

In [654]:
df2

Unnamed: 0_level_0,item_id,timespent,reaction
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
707536,67950,0,0
707536,151002,0,0
707536,134736,0,0
707536,196151,0,0
707536,94182,0,0
...,...,...,...
849764,80910,0,0
993316,132328,0,0
993316,186701,0,0
666981,81857,0,0


In [663]:
df.groupby(level=0)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x201ee4510>

In [667]:
predictions = []
for user_id, subdf in tqdm(df[['user_id', 'item_id']].groupby('user_id'), total=len(df.user_id.unique())):
    if user_id in testset:
        pred = model.predict(user_id, subdf)
        predictions.append((user_id, pred))

  3%|▎         | 25384/1000183 [14:03<8:59:58, 30.09it/s] 


KeyboardInterrupt: 

In [None]:
map_test = dict(predictions)

test_predictions = []
for user_id in test.user_id:
    test_predictions.append(map_test[user_id].values)
test['predictions'] = test_predictions

test.head(3)

In [None]:
test.to_parquet('sample_submission_knn.parquet.gzip', compression='gzip', engine='pyarrow')

In [618]:
user_df = df[df.user_id == user_id]

In [619]:
pred = model.predict(user_id, user_df)

In [447]:
cand_idx_to_item = dict(zip(candidates.index.values, candidates.item_id.values))

In [450]:
cand_item_to_idx = dict(zip(candidates.item_id.values, candidates.index.values))

In [409]:
candidates['item_idx'] = candidates['item_id']

In [410]:
candidates = candidates.set_index('item_idx')

In [257]:
model = KNNRecommender(top_n=20, n_items_count=300)
model.fit(candidates)

In [279]:
self = KNNRecommender(top_n=20, n_items_count=300)
self.fit(candidates)

In [522]:
knn = NearestNeighbors(metric='cosine')

In [523]:
X = get_embeddings(candidates.index)

In [524]:
all_embeddings = get_embeddings(items.index)

In [525]:
fresh_embeddings = get_embeddings(candidates.index)

In [528]:
knn.fit(fresh_embeddings)

NearestNeighbors(metric='cosine')

In [474]:
# iloc -> numeric
# loc -> item_id

In [530]:
user_df = df[df.user_id == user_id]

In [531]:
user_df = user_df.set_index('item_id')

In [532]:
user_embeddings = get_embeddings(user_df.index)

In [544]:
neig_dist, neig_ind = knn.kneighbors(user_embeddings, n_neighbors=10)

In [545]:
neig_ind.shape

(151, 10)

In [546]:
recs = []
for i in neig_ind.flatten():
    recs.append(cand_idx_to_item[i])

In [547]:
len(recs)

1510

In [548]:
len(set(recs))

1157

In [549]:
history = user_df.index.values

In [550]:
len(history)

151

In [551]:
match = set(history) & set(recs)

In [552]:
possible_match = set(history) & set(candidates.item_id.values)

In [553]:
len(possible_match)

86

In [580]:
last_n = 2

In [581]:
for n in [25, 30, 50, 70, 100, 200, 300, 500, 1000, 2000, 3000]:
    print(f'n: {n}')
    
    neig_dist, neig_ind = knn.kneighbors(user_embeddings[-last_n:], n_neighbors=n)
    recs = []
    for i in neig_ind.flatten():
        recs.append(cand_idx_to_item[i])
    
    match = set(history[-last_n:]) & set(recs)
    possible_match = set(history[-last_n:]) & set(candidates.item_id.values)
    
    print(f'unique/all: {len(set(recs)) / len(recs)}')
    print(f'match: {len(match)}')
    print(f'possible match: {len(possible_match)}')
    print(f'-'*10)

n: 25
unique/all: 1.0
match: 0
possible match: 1
----------
n: 30
unique/all: 1.0
match: 0
possible match: 1
----------
n: 50
unique/all: 1.0
match: 0
possible match: 1
----------
n: 70
unique/all: 0.9857142857142858
match: 0
possible match: 1
----------
n: 100
unique/all: 0.99
match: 0
possible match: 1
----------
n: 200
unique/all: 0.985
match: 0
possible match: 1
----------
n: 300
unique/all: 0.9866666666666667
match: 0
possible match: 1
----------
n: 500
unique/all: 0.979
match: 0
possible match: 1
----------
n: 1000
unique/all: 0.9635
match: 0
possible match: 1
----------
n: 2000
unique/all: 0.93925
match: 0
possible match: 1
----------
n: 3000
unique/all: 0.9165
match: 0
possible match: 1
----------


In [554]:
len(match)

2

In [482]:
history = user_df.index.values

In [None]:
set()

In [443]:
candidates[candidates.item_id == c].index.values[0]

87508

In [444]:
candidates_idx_to_item[87508]

199259

In [445]:
candidates_item_to_idx[199259]

87508

In [470]:
fresh_item_id = candidates.item_id.sample(1).values[0]

In [459]:
fresh_idx = []
for c in history:
    i = cand_item_to_idx.get(i, None)
    fresh_idx.append(i)

In [463]:
candidates[candidates.item_id == 139320]

Unnamed: 0,item_id


In [461]:
for c in history:
    print(c)

139320
16254
78910
166978
38305
2827
138487
14773
183432
128613
137317
47983
220340
20023
67863
151453
79828
17345
1968
117212
3122
119238
73431
39165
152525
208491
30818
27048
7184
185678
1305
142546
88917
221061
60658
169280
11481
47845
97271
130339
117035
158392
163059
101415
109309
41463
119358
58983
195965
185001
40266
165419
83778
106246
171991
132488
221725
116135
120006
88265
66485
102124
80075
168373
209243
194017
216530
136359
212851
216251
211188
535
19180
216503
70705
216113
192419
206949
177655
185224
45810
172328
55004
149949
3417
169615
141152
106711
218816
50266
123372
42481
187535
111648
40918
110907
203359
63
176931
199978
133269
193264
25701
99637
126051
182179
122434
179325
151999
49107
174934
213798
151406
138925
62075
112623
201758
96645
210224
133599
108672
216071
176644
71004
195403
118423
226389
51819
79790
325
5932
44036
102257
184910
67151
171360
157933
124418
41143
20380
71450
51007
8488
46049
49279
68117
145833
66080
74176
153192
199259


In [None]:
neig_ind[user_df]

In [363]:
item_id = candidates.loc[i].values[0]

In [364]:
item_id

51

In [365]:
items.loc[item_id]

source_id                                                     11485
embeddings        [0.12811263, 0.036777616, 0.022303063, 0.00316...
mean_timespent                                             0.111732
mean_reaction                                             0.0111732
amount                                                          179
Name: 51, dtype: object

In [366]:
items.loc[neig_ind[i]]

Unnamed: 0_level_0,source_id,embeddings,mean_timespent,mean_reaction,amount
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,12210,"[0.03040367, 0.025111973, -0.069858946, -0.009...",0.148936,0.010638,94
50121,12659,"[0.037069675, -0.041974667, -0.04673124, 0.002...",0.492958,0.014085,71
212126,20996,"[0.044716157, -0.0057055512, -0.008015964, -0....",0.611111,0.055556,18
14197,18471,"[0.05832005, 0.0027167068, -0.025937278, 0.008...",0.170213,0.021277,94
140111,9540,"[0.0700784, -0.018452901, -0.04222671, 0.00734...",1.109244,0.030812,357


In [329]:
candidates[candidates.item_id == i]

Unnamed: 0,item_id
1,2


# check carefully what a fuck is going on with labels

In [280]:
user_df = df[df.user_id == user_id]

In [281]:
user_embeddings = get_embeddings(user_df.item_id.values[:self.n_items_count])

In [282]:
user_embeddings.shape

(151, 312)

In [292]:
self.knn.kneighbors(user_embeddings)

(array([[6.8249696e-01, 7.3914295e-01, 7.4066943e-01, 7.5404269e-01,
         7.6515061e-01],
        [5.2277470e-01, 5.7075566e-01, 5.7368457e-01, 5.7821119e-01,
         5.8842313e-01],
        [5.7249063e-01, 5.7613218e-01, 5.8045805e-01, 5.8817160e-01,
         5.8943647e-01],
        [6.2465531e-01, 6.5118319e-01, 6.6899240e-01, 6.7962229e-01,
         6.9155681e-01],
        [5.1917541e-01, 5.2394187e-01, 5.6831175e-01, 5.7497370e-01,
         5.8280152e-01],
        [5.3060043e-01, 5.5699486e-01, 5.8415443e-01, 6.1538774e-01,
         6.2320060e-01],
        [5.8200574e-01, 5.8470976e-01, 5.8758330e-01, 5.8915806e-01,
         6.0476720e-01],
        [5.4224759e-01, 5.6902158e-01, 5.7022178e-01, 5.9481055e-01,
         5.9512228e-01],
        [4.7959250e-01, 5.4697788e-01, 5.4958856e-01, 5.5396068e-01,
         5.6169659e-01],
        [5.4307508e-01, 5.5502892e-01, 5.6615019e-01, 5.7259989e-01,
         5.7333714e-01],
        [4.7004932e-01, 5.1780087e-01, 5.2244169e-01, 5.3665

In [283]:
dist, neighbors = self.knn.kneighbors(user_embeddings)

In [289]:
neighbors

array([[132,  14,  52, 148,  37],
       [ 32, 240, 135, 111, 156],
       [134, 198, 201, 112,  74],
       [207,  57, 275, 277,  54],
       [ 28, 111, 135, 139,  45],
       [ 45, 260, 296, 239, 134],
       [260, 166, 232,  45, 148],
       [ 28, 139, 111,  96, 195],
       [ 28,  65,  18, 139, 111],
       [111, 139,  28, 195,  96],
       [111,  96, 135,  18, 171],
       [ 56,  80, 110, 178, 134],
       [ 96, 135,  45, 265, 171],
       [ 89, 135,  96, 239, 111],
       [111,  43, 171, 239, 208],
       [239,  28, 135,  45,  89],
       [265,  79, 101, 146, 152],
       [112, 250, 283, 227, 113],
       [135, 134, 139,  32,  28],
       [235, 279, 190, 164, 134],
       [158,  74, 134, 148, 229],
       [ 28,  89,  96, 156, 260],
       [ 89, 262, 135, 139, 156],
       [248, 247, 252, 117, 246],
       [113, 156,  40,  70, 261],
       [135, 265,  96,  89, 111],
       [ 45,  28, 260, 134, 139],
       [134, 208,  65, 110, 201],
       [156,  89,  96,  32,  45],
       [ 33, 2

In [288]:
dist.shape

(151, 5)

In [285]:
neighbors.shape

(151, 5)

In [None]:
user_embeddings = get_embeddings(user_df.item_id.values[:self.n_items_count])
dist, neighbors = self.knn.kneighbors(user_embeddings)

candidate_item_ids = self.candidates.loc[neighbors.flatten()].item_id.values
candidates = items.loc[candidate_item_ids]
candidates['dist'] = dist.flatten()

seen_item_ids = list(set(user_df.item_id.values) & set(candidate_item_ids))

candidates = candidates.drop(seen_item_ids)
candidates = candidates[~candidates.index.duplicated(keep='first')]

candidates = candidates.sort_values(by='mean_timespent')

In [254]:
pred = model.predict(user_id, user_df)

In [255]:
pred

array([4105,  197, 9021, 1863, 2111, 7032, 1531, 4579, 3827, 7176, 2216,
       3842,  298,  348, 8565,  579, 8360, 3470, 7442, 4748])

In [222]:
predictions = []
for user_id, subdf in tqdm(df.groupby('user_id'), total=len(df.user_id.unique())):
    if user_id in testset:
        pred = model.predict(user_id, subdf)
        predictions.append((user_id, pred))

100%|██████████| 1000183/1000183 [08:21<00:00, 1992.79it/s]
