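В данном ноутбуке реализуется item-based подход: для каждой пары «пользователь — рекомендованный товар» считается оценка схожести товара с историей пользователя (отдельно по просмотрам и по добавлениям в корзину).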

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../src/')

In [3]:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize
from reindexing import reindexing_clientid_itemid
import pandas as pd; pd.set_option('display.max_columns', None)
from implicit._nearest_neighbours import all_pairs_knn
pd.options.mode.chained_assignment = None

In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive')

### Считывание и предобработка данных

In [5]:
# Load the user action log.
user_actions = pd.read_parquet('../data/user_actions_august.parquet')

# Store both id columns as unsigned 32-bit integers.
for id_column in ('clientid', 'itemid'):
    user_actions[id_column] = user_actions[id_column].astype('uint32')

# Encode the action as a flag: False = 'view', True = 'to_cart'.
# NOTE(review): any other value maps to NaN, which astype('bool') turns into
# True (i.e. it would be counted as 'to_cart') -- confirm only these two occur.
action_codes = {'view': 0, 'to_cart': 1}
user_actions['action_type'] = (
    user_actions['action_type']
    .map(action_codes)
    .astype('bool')
)

print(user_actions.shape)
user_actions.head()

(57585220, 4)


Unnamed: 0,clientid,itemid,action_type,timestamp
0,179153,135391270,False,2019-08-06 15:14:45
1,179153,135391272,False,2019-08-06 15:16:29
2,179153,135391270,False,2019-08-08 09:07:29
3,179153,142132354,False,2019-08-20 17:59:58
4,179153,140487634,False,2019-08-23 16:46:39


In [6]:
# Train candidates together with the features computed so far.
train_target = pd.read_csv('../data/train_with_features.csv')

# Cast ids, label and counters to compact integer dtypes in a single pass.
train_dtypes = {
    'clientid': 'uint32',
    'jointitemid': 'uint32',
    'label': 'int8',
    'novelty_cnt': 'int32',
    'count_on_session_view': 'int32',
    'count_on_session_tocart': 'int32',
}
train_target = train_target.astype(train_dtypes)

train_target.shape

(1390438, 21)

In [7]:
# Test candidates together with the features computed so far.
test_target = pd.read_csv('../data/test_with_features.csv')

# Cast ids, label and counters to compact integer dtypes in a single pass.
test_dtypes = {
    'clientid': 'uint32',
    'jointitemid': 'uint32',
    'label': 'int8',
    'novelty_cnt': 'int32',
    'count_on_session_view': 'int32',
    'count_on_session_tocart': 'int32',
}
test_target = test_target.astype(test_dtypes)

test_target.shape

(597158, 21)

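Оставляем только просмотры товаров и переиндексируем `clientid` и `itemid`. Функция возвращает три датафрейма: действия с новой индексацией и две таблицы соответствия старых и новых индексов (для пользователей и для товаров).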

In [8]:
# Keep only the 'view' rows (action_type == False) and the id columns.
is_view = ~user_actions['action_type']
user_actions_view = (
    user_actions
    .loc[is_view]
    .drop(columns=['timestamp', 'action_type'])
    .reset_index(drop=True)
)

# Re-index client and item ids; the helper also returns the two mapping tables.
(
    user_actions_view,
    user_mapping,
    item_mapping,
) = reindexing_clientid_itemid(user_actions_view)

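Оставляем только добавления в корзину и переиндексируем `clientid` и `itemid`. Функция возвращает три датафрейма: действия с новой индексацией и две таблицы соответствия старых и новых индексов (для пользователей и для товаров).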

In [9]:
# Keep only the 'to_cart' rows (action_type == True) and the id columns.
is_tocart = user_actions['action_type']
user_actions_tocart = (
    user_actions
    .loc[is_tocart]
    .drop(columns=['timestamp', 'action_type'])
    .reset_index(drop=True)
)

# Re-index client and item ids; the helper also returns the two mapping tables.
(
    user_actions_tocart,
    user_mapping_tocart,
    item_mapping_tocart,
) = reindexing_clientid_itemid(user_actions_tocart)

### Train

In [10]:
# Train pairs expressed in the view-based index space.
# Lookup series: original id -> re-indexed id.
view_user_lookup = user_mapping.set_index('old')['new']
view_item_lookup = item_mapping.set_index('old')['new']

ib_train_target_view = train_target[['clientid', 'itemid', 'jointitemid']].copy()

# Ids that are absent from a mapping come out of .map() as NaN.
ib_train_target_view = ib_train_target_view.assign(
    user_cat=ib_train_target_view['clientid'].map(view_user_lookup),
    jointitem_cat=ib_train_target_view['jointitemid'].map(view_item_lookup),
)

In [11]:
# Train pairs expressed in the add-to-cart index space.
# Lookup series: original id -> re-indexed id.
tocart_user_lookup = user_mapping_tocart.set_index('old')['new']
tocart_item_lookup = item_mapping_tocart.set_index('old')['new']

ib_train_target_tocart = train_target[['clientid', 'itemid', 'jointitemid']].copy()

# Ids that are absent from a mapping come out of .map() as NaN.
ib_train_target_tocart = ib_train_target_tocart.assign(
    user_cat=ib_train_target_tocart['clientid'].map(tocart_user_lookup),
    jointitem_cat=ib_train_target_tocart['jointitemid'].map(tocart_item_lookup),
)

### Test

In [12]:
# Test pairs expressed in the view-based index space.
# Lookup series: original id -> re-indexed id.
view_user_lookup = user_mapping.set_index('old')['new']
view_item_lookup = item_mapping.set_index('old')['new']

ib_test_target_view = test_target[['clientid', 'itemid', 'jointitemid']].copy()

# Ids that are absent from a mapping come out of .map() as NaN.
ib_test_target_view = ib_test_target_view.assign(
    user_cat=ib_test_target_view['clientid'].map(view_user_lookup),
    jointitem_cat=ib_test_target_view['jointitemid'].map(view_item_lookup),
)

In [13]:
# Test pairs expressed in the add-to-cart index space.
# Lookup series: original id -> re-indexed id.
tocart_user_lookup = user_mapping_tocart.set_index('old')['new']
tocart_item_lookup = item_mapping_tocart.set_index('old')['new']

ib_test_target_tocart = test_target[['clientid', 'itemid', 'jointitemid']].copy()

# Ids that are absent from a mapping come out of .map() as NaN.
ib_test_target_tocart = ib_test_target_tocart.assign(
    user_cat=ib_test_target_tocart['clientid'].map(tocart_user_lookup),
    jointitem_cat=ib_test_target_tocart['jointitemid'].map(tocart_item_lookup),
)

### Создание матриц для просмотров и добавлений в корзину

In [14]:
# Views: user x item interaction matrix and item-item similarities.
n_view_users = user_mapping['new'].max() + 1
n_view_items = item_mapping['new'].max() + 1
shape_view = (n_view_users, n_view_items)

view_user_idx = user_actions_view['clientid'].values
view_item_idx = user_actions_view['itemid'].values

# (data, (row, col)) constructor: one unit per logged view; scipy sums
# repeated (user, item) pairs, so each cell holds the number of views.
user_item_view = sp.csr_matrix(
    (np.ones_like(view_user_idx), (view_user_idx, view_item_idx)),
    shape=shape_view,
)

# all_pairs_knn computes an approximate similarity matrix (N nearest neighbours).
N = 201
# Scale every item column (sklearn's default norm is l2), then transpose so
# that items become rows.
items_by_users_view = normalize(user_item_view, axis=0).T
item_similarity_matrix_view = all_pairs_knn(
    items_by_users_view,
    N,
    show_progress=True,
    num_threads=2,
).tocsr()

  0%|          | 0/2374782 [00:00<?, ?it/s]

In [15]:
# Zero the diagonal (each item's similarity with itself) in place, then drop
# the explicit zeros from the sparse structure.
view_similarity = item_similarity_matrix_view
view_similarity.setdiag(0.0)
view_similarity.eliminate_zeros()

view_similarity.shape

  self._set_arrayXarray(i, j, x)


(2374782, 2374782)

In [16]:
# Add-to-cart: user x item interaction matrix and item-item similarities.
n_tocart_users = user_mapping_tocart['new'].max() + 1
n_tocart_items = item_mapping_tocart['new'].max() + 1
shape_tocart = (n_tocart_users, n_tocart_items)

tocart_user_idx = user_actions_tocart['clientid'].values
tocart_item_idx = user_actions_tocart['itemid'].values

# (data, (row, col)) constructor: one unit per logged add-to-cart; scipy sums
# repeated (user, item) pairs, so each cell holds the number of events.
user_item_tocart = sp.csr_matrix(
    (np.ones_like(tocart_user_idx), (tocart_user_idx, tocart_item_idx)),
    shape=shape_tocart,
)

# all_pairs_knn computes an approximate similarity matrix (N nearest neighbours).
N = 201
# Scale every item column (sklearn's default norm is l2), then transpose so
# that items become rows.
items_by_users_tocart = normalize(user_item_tocart, axis=0).T
item_similarity_matrix_tocart = all_pairs_knn(
    items_by_users_tocart,
    N,
    show_progress=True,
    num_threads=2,
).tocsr()

  0%|          | 0/1069443 [00:00<?, ?it/s]

In [17]:
# Zero the diagonal (each item's similarity with itself) in place, then drop
# the explicit zeros from the sparse structure.
tocart_similarity = item_similarity_matrix_tocart
tocart_similarity.setdiag(0.0)
tocart_similarity.eliminate_zeros()

tocart_similarity.shape

  self._set_arrayXarray(i, j, x)


(1069443, 1069443)

### Расчет схожести рекомендованного товара

In [18]:
def score_item_based(data: pd.DataFrame,
                     item_similarity_matrix: sp.csr_matrix,
                     user_item: sp.csr_matrix,
                     name_score: str):
    """
    Item-based score of a recommended item for a user.

    For every (user_cat, jointitem_cat) pair the score is the dot product of
    the user's row in ``user_item`` with the recommended item's row in
    ``item_similarity_matrix``, i.e. the sum over items of
    ``user_item[user, i] * similarity[jointitem, i]``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns 'user_cat' and 'jointitem_cat' holding row
        indices into ``user_item`` and ``item_similarity_matrix``. Rows with a
        NaN in any column are not scored.
    item_similarity_matrix : sp.csr_matrix
        Item x item similarity matrix.
    user_item : sp.csr_matrix
        User x item interaction matrix; it must have as many columns as
        ``item_similarity_matrix``.
    name_score : str
        Name of the score column added to the result.

    Returns
    -------
    pd.DataFrame
        ``data`` (left untouched, a new frame is returned) with the extra
        column ``name_score``, fully duplicated rows removed and a fresh
        RangeIndex. Rows that could not be scored hold NaN in ``name_score``.
    """
    key_cols = ['user_cat', 'jointitem_cat']

    # Score every distinct pair exactly once. Keeping duplicated pairs here
    # would turn the merge below into a many-to-many join (k occurrences of a
    # pair -> k * k rows) that is only collapsed again by drop_duplicates.
    pairs = (
        data
        .dropna()[key_cols]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Integer row positions for sparse fancy indexing; the key columns in
    # `pairs` keep their dtype so the merge joins like with like.
    user_rows = pairs['user_cat'].to_numpy().astype('int64')
    item_rows = pairs['jointitem_cat'].to_numpy().astype('int64')

    row_sums = (
        user_item[user_rows]
        .multiply(item_similarity_matrix[item_rows])
        .sum(axis=1)
    )
    # .sum(axis=1) on a sparse matrix yields an (n, 1) np.matrix; flatten it
    # into a plain 1-D array before storing it as a column.
    pairs = pairs.assign(**{name_score: np.asarray(row_sums).ravel()})

    data = (
        data
        .merge(pairs, on=key_cols, how='left')
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return data

### Для train выборки

In [19]:
# The re-indexed helper columns are only needed for scoring, so they are
# dropped right after the score is computed.
helper_cols = ['user_cat', 'jointitem_cat']

# Train: score from view interactions.
df_train_view = score_item_based(
    ib_train_target_view,
    item_similarity_matrix_view,
    user_item_view,
    'item_based_view',
).drop(columns=helper_cols)

# Train: score from add-to-cart interactions.
df_train_tocart = score_item_based(
    ib_train_target_tocart,
    item_similarity_matrix_tocart,
    user_item_tocart,
    'item_based_tocart',
).drop(columns=helper_cols)

In [20]:
print(train_target.shape)

train_id_cols = ['clientid', 'itemid', 'jointitemid']
train_scores = (
    (df_train_view, 'item_based_view'),
    (df_train_tocart, 'item_based_tocart'),
)

train = train_target
for scored_frame, score_name in train_scores:
    # No `on=` is passed: pandas joins on every column the two frames share.
    train = (
        train
        .merge(scored_frame[train_id_cols + [score_name]], how='left')
        .drop_duplicates()
        .reset_index(drop=True)
    )

# fillna(0) replaces every NaN in the frame, including the unscored pairs.
train = train.fillna(0)
print(train.shape)

(1390438, 21)
(1390438, 23)


### Для test выборки

In [21]:
# The re-indexed helper columns are only needed for scoring, so they are
# dropped right after the score is computed.
helper_cols = ['user_cat', 'jointitem_cat']

# Test: score from view interactions.
df_test_view = score_item_based(
    ib_test_target_view,
    item_similarity_matrix_view,
    user_item_view,
    'item_based_view',
).drop(columns=helper_cols)

# Test: score from add-to-cart interactions.
df_test_tocart = score_item_based(
    ib_test_target_tocart,
    item_similarity_matrix_tocart,
    user_item_tocart,
    'item_based_tocart',
).drop(columns=helper_cols)

In [22]:
print(test_target.shape)

test_id_cols = ['clientid', 'itemid', 'jointitemid']
test_scores = (
    (df_test_view, 'item_based_view'),
    (df_test_tocart, 'item_based_tocart'),
)

test = test_target
for scored_frame, score_name in test_scores:
    # No `on=` is passed: pandas joins on every column the two frames share.
    test = (
        test
        .merge(scored_frame[test_id_cols + [score_name]], how='left')
        .drop_duplicates()
        .reset_index(drop=True)
    )

# fillna(0) replaces every NaN in the frame, including the unscored pairs.
test = test.fillna(0)
print(test.shape)

(597158, 21)
(597158, 23)


### Сохраняем признаки

In [23]:
# Write the enriched frames back to disk.
# NOTE: these are the same paths the targets were read from earlier in the
# notebook, so the input files are overwritten with the extended feature set.
for feature_frame, csv_path in (
    (train, '../data/train_with_features.csv'),
    (test, '../data/test_with_features.csv'),
):
    feature_frame.to_csv(csv_path, index=False)