In [1]:
from tqdm import tqdm
from copy import deepcopy
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt

import implicit as imp

from scipy import sparse

In [2]:
# !pip install fastparquet
# !pip install pyarrow

In [3]:
TIMESPENT_COEF = 1
REACTION_COEF = 10

In [4]:
# train упорядочен хронологически
df = pd.read_parquet('train.parquet.gzip')
df.dtypes

user_id      int32
item_id      int32
timespent     int8
reaction      int8
dtype: object

In [5]:
# в items_meta для каждого item_id его автор и эмбеддинг содержания
items = pd.read_parquet(f'items_meta.parquet.gzip')
items.head(3)

Unnamed: 0,item_id,source_id,embeddings
0,0,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351..."
1,1,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04..."
2,2,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02..."


In [6]:
items = items.set_index('item_id')

In [7]:
items['mean_timespent'] = df.groupby('item_id')['timespent'].mean().values

In [8]:
items['mean_reaction'] = df.groupby('item_id')['reaction'].mean().values

In [9]:
items['amount'] = df.groupby('item_id').size()

In [10]:
items['amount'].describe().astype(int)

count    227606
mean        634
std        1959
min           3
25%          68
50%         172
75%         490
max      122986
Name: amount, dtype: int64

In [11]:
np.corrcoef(items['amount'], items['mean_timespent'])

array([[1.       , 0.0991944],
       [0.0991944, 1.       ]])

In [12]:
np.corrcoef(items['amount'], items['mean_reaction'])

array([[ 1.        , -0.06108216],
       [-0.06108216,  1.        ]])

In [13]:
np.corrcoef(items['mean_timespent'], items['mean_reaction'])

array([[1.        , 0.28293847],
       [0.28293847, 1.        ]])

# Pipeline

In [14]:
VAL_SIZE = 0.2

In [15]:
VAL_N = int(len(df) * VAL_SIZE)

In [16]:
val = df[-VAL_N:].copy()

In [17]:
train = df[:-VAL_N]

In [18]:
user_id = 707536

In [19]:
def get_embeddings(items_id):
    X = []
    for emb in items.loc[items_id].embeddings:
        v = np.array(emb)
        X.append(v)
    X = np.array(X)
    return X

In [20]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

In [21]:
knn = NearestNeighbors(metric='cosine', n_neighbors=3)

In [24]:
X = get_embeddings(items.index)

In [25]:
knn.fit(X)

NearestNeighbors(metric='cosine', n_neighbors=3)

In [26]:
user_id = np.random.choice(df.user_id.values)

In [27]:
user_vecs = get_embeddings(df[df.user_id == user_id].item_id.values)

In [28]:
target_items = df[df.user_id == user_id].item_id.values

In [30]:
len(target_items)

412

In [31]:
# vec = X[0]

In [32]:
dist, neighbors = knn.kneighbors(user_vecs)

In [33]:
neighbors.shape

(412, 3)

In [34]:
len(set(neighbors.flatten())) / len(list(neighbors.flatten()))

0.8932038834951457

In [35]:
len(set(neighbors.flatten()) & set(target_items)) / len(target_items)

1.0

In [38]:
len(set(neighbors.flatten()) & set(target_items)) / len(items)

0.0018101456024885109

In [582]:
class KNNRecommender:
    def __init__(self, top_n=20, n_neighbors=100, n_items_count=300):
        self.top_n = top_n
        self.n_items_count = n_items_count
        self.knn = NearestNeighbors(n_neighbors=n_neighbors)
        
    def fit(self, candidates):
        self.embeddings = get_embeddings(candidates.item_id.values)
        self.knn.fit(self.embeddings)
        self.candidates = candidates
    
    def predict(self, user_id, user_df):
        # get user history
        user_embeddings = get_embeddings(user_df.item_id.values[:self.n_items_count])
        dist, neighbors = self.knn.kneighbors(user_embeddings)

        candidate_item_ids = self.candidates.loc[neighbors.flatten()].item_id.values
        candidates = items.loc[candidate_item_ids]
        candidates['dist'] = dist.flatten()
        
        seen_item_ids = list(set(user_df.item_id.values) & set(candidate_item_ids))
        
        candidates = candidates.drop(seen_item_ids)
        candidates = candidates[~candidates.index.duplicated(keep='first')]
        
        candidates = candidates.sort_values(by='mean_timespent')
        
        return candidates.index.values[-self.top_n:]

In [583]:
test = pd.read_parquet('test.parquet.gzip')
# test = test.set_index('user_id')
test.head(3)

Unnamed: 0,user_id
0,7
1,8
2,9


In [584]:
candidates = pd.read_parquet(f'fresh_candidates.parquet.gzip')
candidates.head(3)

Unnamed: 0,item_id
0,0
1,2
2,5


In [447]:
cand_idx_to_item = dict(zip(candidates.index.values, candidates.item_id.values))

In [450]:
cand_item_to_idx = dict(zip(candidates.item_id.values, candidates.index.values))

In [257]:
model = KNNRecommender(top_n=20, n_items_count=300)
model.fit(candidates)

In [279]:
self = KNNRecommender(top_n=20, n_items_count=300)
self.fit(candidates)

In [522]:
knn = NearestNeighbors(metric='cosine')

In [523]:
X = get_embeddings(candidates.index)

In [524]:
all_embeddings = get_embeddings(items.index)

In [525]:
fresh_embeddings = get_embeddings(candidates.index)

In [528]:
knn.fit(fresh_embeddings)

NearestNeighbors(metric='cosine')

In [474]:
# iloc -> numeric
# loc -> item_id

In [530]:
user_df = df[df.user_id == user_id]

In [531]:
user_df = user_df.set_index('item_id')

In [532]:
user_embeddings = get_embeddings(user_df.index)

In [544]:
neig_dist, neig_ind = knn.kneighbors(user_embeddings, n_neighbors=10)

In [545]:
neig_ind.shape

(151, 10)

In [546]:
recs = []
for i in neig_ind.flatten():
    recs.append(cand_idx_to_item[i])

In [547]:
len(recs)

1510

In [548]:
len(set(recs))

1157

In [549]:
history = user_df.index.values

In [550]:
len(history)

151

In [1]:
a = set([1, 2, 3])
b = set([2, 3, 4])

In [3]:
a -= b

In [4]:
a

{1}

In [551]:
match = set(history) & set(recs)

In [552]:
possible_match = set(history) & set(candidates.item_id.values)

In [553]:
len(possible_match)

86

In [580]:
last_n = 2

In [581]:
for n in [25, 30, 50, 70, 100, 200, 300, 500, 1000, 2000, 3000]:
    print(f'n: {n}')
    
    neig_dist, neig_ind = knn.kneighbors(user_embeddings[-last_n:], n_neighbors=n)
    recs = []
    for i in neig_ind.flatten():
        recs.append(cand_idx_to_item[i])
    
    match = set(history[-last_n:]) & set(recs)
    possible_match = set(history[-last_n:]) & set(candidates.item_id.values)
    
    print(f'unique/all: {len(set(recs)) / len(recs)}')
    print(f'match: {len(match)}')
    print(f'possible match: {len(possible_match)}')
    print(f'-'*10)

n: 25
unique/all: 1.0
match: 0
possible match: 1
----------
n: 30
unique/all: 1.0
match: 0
possible match: 1
----------
n: 50
unique/all: 1.0
match: 0
possible match: 1
----------
n: 70
unique/all: 0.9857142857142858
match: 0
possible match: 1
----------
n: 100
unique/all: 0.99
match: 0
possible match: 1
----------
n: 200
unique/all: 0.985
match: 0
possible match: 1
----------
n: 300
unique/all: 0.9866666666666667
match: 0
possible match: 1
----------
n: 500
unique/all: 0.979
match: 0
possible match: 1
----------
n: 1000
unique/all: 0.9635
match: 0
possible match: 1
----------
n: 2000
unique/all: 0.93925
match: 0
possible match: 1
----------
n: 3000
unique/all: 0.9165
match: 0
possible match: 1
----------


In [554]:
len(match)

2

In [482]:
history = user_df.index.values

In [None]:
set()

In [443]:
candidates[candidates.item_id == c].index.values[0]

87508

In [444]:
candidates_idx_to_item[87508]

199259

In [445]:
candidates_item_to_idx[199259]

87508

In [470]:
fresh_item_id = candidates.item_id.sample(1).values[0]

In [459]:
fresh_idx = []
for c in history:
    i = cand_item_to_idx.get(i, None)
    fresh_idx.append(i)

In [463]:
candidates[candidates.item_id == 139320]

Unnamed: 0,item_id


In [461]:
for c in history:
    print(c)

139320
16254
78910
166978
38305
2827
138487
14773
183432
128613
137317
47983
220340
20023
67863
151453
79828
17345
1968
117212
3122
119238
73431
39165
152525
208491
30818
27048
7184
185678
1305
142546
88917
221061
60658
169280
11481
47845
97271
130339
117035
158392
163059
101415
109309
41463
119358
58983
195965
185001
40266
165419
83778
106246
171991
132488
221725
116135
120006
88265
66485
102124
80075
168373
209243
194017
216530
136359
212851
216251
211188
535
19180
216503
70705
216113
192419
206949
177655
185224
45810
172328
55004
149949
3417
169615
141152
106711
218816
50266
123372
42481
187535
111648
40918
110907
203359
63
176931
199978
133269
193264
25701
99637
126051
182179
122434
179325
151999
49107
174934
213798
151406
138925
62075
112623
201758
96645
210224
133599
108672
216071
176644
71004
195403
118423
226389
51819
79790
325
5932
44036
102257
184910
67151
171360
157933
124418
41143
20380
71450
51007
8488
46049
49279
68117
145833
66080
74176
153192
199259


In [460]:
fresh_idx

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
neig_ind[user_df]

In [363]:
item_id = candidates.loc[i].values[0]

In [364]:
item_id

51

In [365]:
items.loc[item_id]

source_id                                                     11485
embeddings        [0.12811263, 0.036777616, 0.022303063, 0.00316...
mean_timespent                                             0.111732
mean_reaction                                             0.0111732
amount                                                          179
Name: 51, dtype: object

In [366]:
items.loc[neig_ind[i]]

Unnamed: 0_level_0,source_id,embeddings,mean_timespent,mean_reaction,amount
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,12210,"[0.03040367, 0.025111973, -0.069858946, -0.009...",0.148936,0.010638,94
50121,12659,"[0.037069675, -0.041974667, -0.04673124, 0.002...",0.492958,0.014085,71
212126,20996,"[0.044716157, -0.0057055512, -0.008015964, -0....",0.611111,0.055556,18
14197,18471,"[0.05832005, 0.0027167068, -0.025937278, 0.008...",0.170213,0.021277,94
140111,9540,"[0.0700784, -0.018452901, -0.04222671, 0.00734...",1.109244,0.030812,357


In [329]:
candidates[candidates.item_id == i]

Unnamed: 0,item_id
1,2


# check carefully what a fuck is going on with labels

In [280]:
user_df = df[df.user_id == user_id]

In [281]:
user_embeddings = get_embeddings(user_df.item_id.values[:self.n_items_count])

In [282]:
user_embeddings.shape

(151, 312)

In [292]:
self.knn.kneighbors(user_embeddings)

(array([[6.8249696e-01, 7.3914295e-01, 7.4066943e-01, 7.5404269e-01,
         7.6515061e-01],
        [5.2277470e-01, 5.7075566e-01, 5.7368457e-01, 5.7821119e-01,
         5.8842313e-01],
        [5.7249063e-01, 5.7613218e-01, 5.8045805e-01, 5.8817160e-01,
         5.8943647e-01],
        [6.2465531e-01, 6.5118319e-01, 6.6899240e-01, 6.7962229e-01,
         6.9155681e-01],
        [5.1917541e-01, 5.2394187e-01, 5.6831175e-01, 5.7497370e-01,
         5.8280152e-01],
        [5.3060043e-01, 5.5699486e-01, 5.8415443e-01, 6.1538774e-01,
         6.2320060e-01],
        [5.8200574e-01, 5.8470976e-01, 5.8758330e-01, 5.8915806e-01,
         6.0476720e-01],
        [5.4224759e-01, 5.6902158e-01, 5.7022178e-01, 5.9481055e-01,
         5.9512228e-01],
        [4.7959250e-01, 5.4697788e-01, 5.4958856e-01, 5.5396068e-01,
         5.6169659e-01],
        [5.4307508e-01, 5.5502892e-01, 5.6615019e-01, 5.7259989e-01,
         5.7333714e-01],
        [4.7004932e-01, 5.1780087e-01, 5.2244169e-01, 5.3665

In [283]:
dist, neighbors = self.knn.kneighbors(user_embeddings)

In [289]:
neighbors

array([[132,  14,  52, 148,  37],
       [ 32, 240, 135, 111, 156],
       [134, 198, 201, 112,  74],
       [207,  57, 275, 277,  54],
       [ 28, 111, 135, 139,  45],
       [ 45, 260, 296, 239, 134],
       [260, 166, 232,  45, 148],
       [ 28, 139, 111,  96, 195],
       [ 28,  65,  18, 139, 111],
       [111, 139,  28, 195,  96],
       [111,  96, 135,  18, 171],
       [ 56,  80, 110, 178, 134],
       [ 96, 135,  45, 265, 171],
       [ 89, 135,  96, 239, 111],
       [111,  43, 171, 239, 208],
       [239,  28, 135,  45,  89],
       [265,  79, 101, 146, 152],
       [112, 250, 283, 227, 113],
       [135, 134, 139,  32,  28],
       [235, 279, 190, 164, 134],
       [158,  74, 134, 148, 229],
       [ 28,  89,  96, 156, 260],
       [ 89, 262, 135, 139, 156],
       [248, 247, 252, 117, 246],
       [113, 156,  40,  70, 261],
       [135, 265,  96,  89, 111],
       [ 45,  28, 260, 134, 139],
       [134, 208,  65, 110, 201],
       [156,  89,  96,  32,  45],
       [ 33, 2

In [288]:
dist.shape

(151, 5)

In [285]:
neighbors.shape

(151, 5)

In [None]:
user_embeddings = get_embeddings(user_df.item_id.values[:self.n_items_count])
dist, neighbors = self.knn.kneighbors(user_embeddings)

candidate_item_ids = self.candidates.loc[neighbors.flatten()].item_id.values
candidates = items.loc[candidate_item_ids]
candidates['dist'] = dist.flatten()

seen_item_ids = list(set(user_df.item_id.values) & set(candidate_item_ids))

candidates = candidates.drop(seen_item_ids)
candidates = candidates[~candidates.index.duplicated(keep='first')]

candidates = candidates.sort_values(by='mean_timespent')

In [254]:
pred = model.predict(user_id, user_df)

In [255]:
pred

array([4105,  197, 9021, 1863, 2111, 7032, 1531, 4579, 3827, 7176, 2216,
       3842,  298,  348, 8565,  579, 8360, 3470, 7442, 4748])

In [222]:
predictions = []
for user_id, subdf in tqdm(df.groupby('user_id'), total=len(df.user_id.unique())):
    if user_id in testset:
        pred = model.predict(user_id, subdf)
        predictions.append((user_id, pred))

100%|██████████| 1000183/1000183 [08:21<00:00, 1992.79it/s]


In [223]:
map_test = dict(predictions)

In [227]:
map_test[user_id].values

array([46, 15, 22,  7, 59, 27, 47, 65, 42, 31, 17,  5,  6,  0, 37, 29, 35,
       52, 53,  2])

In [228]:
test_predictions = []
for user_id in test.user_id:
    test_predictions.append(map_test[user_id].values)
test['predictions'] = test_predictions

In [229]:
test.head(3)

Unnamed: 0,user_id,predictions
0,7,"[36, 46, 22, 7, 59, 27, 47, 65, 31, 63, 5, 6, ..."
1,8,"[60, 51, 11, 46, 15, 22, 7, 47, 65, 42, 17, 63..."
2,9,"[51, 11, 62, 46, 22, 7, 59, 27, 47, 65, 31, 17..."


In [231]:
test.tail(3)

Unnamed: 0,user_id,predictions
199997,1000166,"[54, 36, 11, 46, 15, 22, 7, 47, 65, 42, 32, 31..."
199998,1000168,"[46, 22, 7, 59, 47, 65, 42, 31, 17, 63, 5, 6, ..."
199999,1000172,"[46, 15, 22, 7, 59, 27, 47, 65, 42, 31, 17, 5,..."


In [230]:
test.to_parquet('sample_submission_knn.parquet.gzip', compression='gzip', engine='pyarrow')