В данном ноутбуке рассчитываются признаки схожести товаров по пользовательским сессиям.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make modules under ../src/ importable from this notebook (presumably where
# the project's `reindexing` and `cosine_similarity` modules live — verify).
sys.path.append('../src/')

In [3]:
import numpy as np
from reindexing import (
    reindexing_clientid_itemid, reindexing_sessionid_itemid
)
import scipy.sparse as sp
import datetime as dt
from cosine_similarity import calculate_session_similarity
# Show every column when a DataFrame is displayed.
import pandas as pd; pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive')

### Считывание и предобработка данных

In [5]:
# Load the training pairs and cast the id, counter and label columns to
# compact unsigned integer dtypes in a single pass.
train_target = pd.read_csv('../data/train_with_features.csv').astype({
    'clientid': 'uint32',
    'jointitemid': 'uint32',
    'novelty_cnt': 'uint32',
    'itemid': 'uint32',
    'label': 'uint8',
})

print(train_target.shape)

(1390438, 17)


In [6]:
# Load the test pairs and cast the id, counter and label columns to
# compact unsigned integer dtypes in a single pass.
test_target = pd.read_csv('../data/test_with_features.csv').astype({
    'clientid': 'uint32',
    'jointitemid': 'uint32',
    'novelty_cnt': 'uint32',
    'itemid': 'uint32',
    'label': 'uint8',
})

print(test_target.shape)

(597158, 17)


In [7]:
# Load the raw session events and compact their dtypes column by column
# (no full-frame copy is made).
sessions = pd.read_parquet('../data/sessions.parquet')

for column, dtype in (
    ('sessionid', 'uint32'),
    ('itemid', 'uint32'),
    ('action_type', 'bool'),
):
    sessions[column] = sessions[column].astype(dtype)

# Replace each event date with its proleptic Gregorian ordinal (an integer
# day number) so it fits into uint32.
event_ordinals = sessions['event_date'].apply(dt.datetime.toordinal)
sessions['event_date'] = event_ordinals.astype('uint32')

print(sessions.shape)
sessions.head()

(38956702, 4)


Unnamed: 0,sessionid,itemid,action_type,event_date
0,0,146494488,False,737290
1,0,147656978,False,737290
2,0,136271000,False,737290
3,0,147019789,False,737290
4,0,32117708,False,737290


Возвращаем три датафрейма с новой индексацией для просмотров товара.

In [8]:
# Keep only the rows whose action_type flag is False (the view events),
# dropping the date and action columns that are not needed any more.
data_view = (
    sessions.loc[~sessions['action_type']]
    .drop(columns=['event_date', 'action_type'])
    .reset_index(drop=True)
)

# Project helper: returns the re-indexed events plus the session and item
# mapping tables (the mappings expose 'old'/'new' columns used further down).
session_view, session_mapping_view, item_mapping_view = reindexing_sessionid_itemid(data_view)

Возвращаем три датафрейма с новой индексацией для добавлений товаров в корзину.

In [9]:
# Keep only the rows whose action_type flag is True (the add-to-cart events),
# dropping the date and action columns that are not needed any more.
data_tocart = (
    sessions.loc[sessions['action_type']]
    .drop(columns=['event_date', 'action_type'])
    .reset_index(drop=True)
)

# Project helper: returns the re-indexed events plus the session and item
# mapping tables (the mappings expose 'old'/'new' columns used further down).
session_tocart, session_mapping_tocart, item_mapping_tocart = reindexing_sessionid_itemid(data_tocart)

### Создадим матрицу товар/сессия.

In [10]:
# View events: one matrix row per item, one column per session.
shape_view = (
    item_mapping_view['new'].max() + 1,
    session_mapping_view['new'].max() + 1,
)

view_item_rows = session_view['itemid'].values
view_session_cols = session_view['sessionid'].values

# One unit entry per event row; csr_matrix sums entries that share the same
# (row, column) coordinates.
item_session_view = sp.csr_matrix(
    (np.ones_like(view_session_cols), (view_item_rows, view_session_cols)),
    shape=shape_view,
)
item_session_view.shape

(2001182, 5549526)

In [11]:
# Add-to-cart events: one matrix row per item, one column per session.
shape_tocart = (
    item_mapping_tocart['new'].max() + 1,
    session_mapping_tocart['new'].max() + 1,
)

tocart_item_rows = session_tocart['itemid'].values
tocart_session_cols = session_tocart['sessionid'].values

# One unit entry per event row; csr_matrix sums entries that share the same
# (row, column) coordinates.
item_session_tocart = sp.csr_matrix(
    (np.ones_like(tocart_session_cols), (tocart_item_rows, tocart_session_cols)),
    shape=shape_tocart,
)
item_session_tocart.shape

(889787, 2240637)

### Посчитаем признаки косинусной схожести для сессий

### Train

In [12]:
# Lookup from the original item id ('old') to its re-indexed id ('new') for
# view events; built once and shared by both columns.
view_item_index = item_mapping_view.set_index('old')['new']

# Training pairs without timestamp/label, with both items of each pair
# translated to view indices. Ids missing from the mapping become NaN.
train_target_view = train_target.drop(columns=['timestamp', 'label']).assign(
    item_cat=lambda pairs: pairs['itemid'].map(view_item_index),
    jointitem_cat=lambda pairs: pairs['jointitemid'].map(view_item_index),
)

In [13]:
# Lookup from the original item id ('old') to its re-indexed id ('new') for
# add-to-cart events; built once and shared by both columns.
tocart_item_index = item_mapping_tocart.set_index('old')['new']

# Training pairs without timestamp/label, with both items of each pair
# translated to add-to-cart indices. Ids missing from the mapping become NaN.
train_target_tocart = train_target.drop(columns=['timestamp', 'label']).assign(
    item_cat=lambda pairs: pairs['itemid'].map(tocart_item_index),
    jointitem_cat=lambda pairs: pairs['jointitemid'].map(tocart_item_index),
)

In [14]:
# Compute session-based similarity features for the training pairs, once on
# the view matrix and once on the add-to-cart matrix. The helper comes from
# the project's cosine_similarity module; the merges that consume its output
# read `same_items_on_session_<suffix>` and `count_on_session_<suffix>`
# columns, so the third argument is presumably that suffix (TODO confirm).
train_view = calculate_session_similarity(train_target_view, item_session_view, 'view')
train_tocart = calculate_session_similarity(train_target_tocart, item_session_tocart, 'tocart')

In [15]:
print(train_target.shape)

# Attach the view-based and then the cart-based similarity features to every
# training pair. No `on=` is passed, so pandas joins on the columns the two
# frames have in common (expected here: itemid and jointitemid).
#
# Each feature table is de-duplicated *before* its join: repeated
# (itemid, jointitemid) rows on the right side would otherwise be multiplied
# against every matching left row, only to be thrown away again by the
# drop_duplicates() that follows. The final frame is the same, but the
# intermediate join result stays small.
train = (
    train_target
    .merge(
        train_view[['itemid', 'jointitemid', 'same_items_on_session_view',
                    'count_on_session_view']].drop_duplicates(),
        how='left',
    )
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(
        train_tocart[['itemid', 'jointitemid', 'same_items_on_session_tocart',
                      'count_on_session_tocart']].drop_duplicates(),
        how='left',
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

print(train.shape)

(1390438, 17)
(1390438, 21)


In [16]:
# Rows left unmatched by the left joins carry NaN in the new feature columns.
# Note that fillna(0) zero-fills missing values in every column of the frame,
# not only in the newly added ones. The counters are then stored as uint32.
train = train.fillna(0).astype({
    'count_on_session_view': 'uint32',
    'count_on_session_tocart': 'uint32',
})

train.head(2)

Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,mean_amount_per_day_view,mean_amount_per_day_to_cart,last_day_views_cnt,last_day_to_cart_cnt,relation_ldv_mean,relation_ldtocart_mean,mnk_view,mnk_to_cart,same_items_on_session_view,count_on_session_view,same_items_on_session_tocart,count_on_session_tocart
0,7833842,31499843,138176581,1,2019-09-07 20:11:01,31.0,9.0,0.28125,737272,1.722222,1.5,0.0,0.0,0.0,0.0,0.107143,0.035714,0.0,0,0.069338,1
1,19548158,147389610,148381589,0,2019-08-31 22:32:31,24.0,9.0,0.36,737272,1.6,1.125,0.0,0.0,0.0,0.0,-0.035714,0.0,0.160623,5,0.308607,2


### Test

In [17]:
# Lookup from the original item id ('old') to its re-indexed id ('new') for
# view events; built once and shared by both columns.
view_item_index = item_mapping_view.set_index('old')['new']

# Test pairs without timestamp/label, with both items of each pair
# translated to view indices. Ids missing from the mapping become NaN.
test_target_view = test_target.drop(columns=['timestamp', 'label']).assign(
    item_cat=lambda pairs: pairs['itemid'].map(view_item_index),
    jointitem_cat=lambda pairs: pairs['jointitemid'].map(view_item_index),
)

In [18]:
# Lookup from the original item id ('old') to its re-indexed id ('new') for
# add-to-cart events; built once and shared by both columns.
tocart_item_index = item_mapping_tocart.set_index('old')['new']

# Test pairs without timestamp/label, with both items of each pair
# translated to add-to-cart indices. Ids missing from the mapping become NaN.
test_target_tocart = test_target.drop(columns=['timestamp', 'label']).assign(
    item_cat=lambda pairs: pairs['itemid'].map(tocart_item_index),
    jointitem_cat=lambda pairs: pairs['jointitemid'].map(tocart_item_index),
)

In [19]:
# Compute session-based similarity features for the test pairs, once on the
# view matrix and once on the add-to-cart matrix. The helper comes from the
# project's cosine_similarity module; the merges that consume its output read
# `same_items_on_session_<suffix>` and `count_on_session_<suffix>` columns, so
# the third argument is presumably that suffix (TODO confirm).
test_view = calculate_session_similarity(test_target_view, item_session_view, 'view')
test_tocart = calculate_session_similarity(test_target_tocart, item_session_tocart, 'tocart')

In [20]:
print(test_target.shape)

# Attach the view-based and then the cart-based similarity features to every
# test pair. No `on=` is passed, so pandas joins on the columns the two
# frames have in common (expected here: itemid and jointitemid).
#
# Each feature table is de-duplicated *before* its join: repeated
# (itemid, jointitemid) rows on the right side would otherwise be multiplied
# against every matching left row, only to be thrown away again by the
# drop_duplicates() that follows. The final frame is the same, but the
# intermediate join result stays small.
test = (
    test_target
    .merge(
        test_view[['itemid', 'jointitemid', 'same_items_on_session_view',
                   'count_on_session_view']].drop_duplicates(),
        how='left',
    )
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(
        test_tocart[['itemid', 'jointitemid', 'same_items_on_session_tocart',
                     'count_on_session_tocart']].drop_duplicates(),
        how='left',
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

print(test.shape)

(597158, 17)
(597158, 21)


In [21]:
# Rows left unmatched by the left joins carry NaN in the new feature columns.
# Note that fillna(0) zero-fills missing values in every column of the frame,
# not only in the newly added ones. The counters are then stored as uint32.
test = test.fillna(0).astype({
    'count_on_session_view': 'uint32',
    'count_on_session_tocart': 'uint32',
})

test.head(2)

Unnamed: 0,clientid,itemid,jointitemid,label,timestamp,view_cnt,to_cart_cnt,ctr,novelty_cnt,mean_amount_per_day_view,mean_amount_per_day_to_cart,last_day_views_cnt,last_day_to_cart_cnt,relation_ldv_mean,relation_ldtocart_mean,mnk_view,mnk_to_cart,same_items_on_session_view,count_on_session_view,same_items_on_session_tocart,count_on_session_tocart
0,8081929,152898248,152875664,0,2019-09-06 18:57:23,10.0,2.0,0.181818,737276,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
1,33378638,144847078,140715321,0,2019-09-02 07:09:31,211.0,19.0,0.089623,737272,7.814815,1.727273,11.0,1.0,1.407583,0.578947,0.0,-0.178571,0.023322,7,0.0,0


### Сохраняем признаки

In [24]:
# Persist the enriched frames without the index column.
# NOTE(review): these are the same paths train_target/test_target are loaded
# from at the top of the notebook, so the input files are overwritten in place.
train.to_csv('../data/train_with_features.csv', index=False)
test.to_csv('../data/test_with_features.csv', index=False)