# Implementing the item_based approach

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import datetime as dt
from sklearn.preprocessing import normalize
!pip install implicit
from implicit._nearest_neighbours import all_pairs_knn

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

## Reading and preprocessing action data, training and test data sets

In [None]:
# Re-index client and item ids into dense, zero-based codes
def newIndex(user_actions):
    """Replace raw ``clientid`` / ``itemid`` values with contiguous codes.

    Codes are assigned in order of first appearance, start at 0 and are
    stored as ``uint32``. The input frame is not modified; a re-indexed copy
    is returned instead, so calling the function twice on the same frame
    gives the same result.

    Parameters
    ----------
    user_actions : pandas.DataFrame
        Frame containing at least the columns ``clientid`` and ``itemid``.

    Returns
    -------
    tuple
        ``(reindexed, user_mapping, item_mapping)``. ``reindexed`` is a copy
        of ``user_actions`` whose id columns hold the new codes. Each mapping
        is a DataFrame with the columns ``old`` (original id) and ``new``
        (code), one row per distinct id.
    """
    def _build_mapping(column):
        # One row per distinct id; the code is the position of first appearance.
        old_ids = user_actions[column].unique()
        new_ids = np.arange(0, len(old_ids), dtype='uint32')
        return pd.DataFrame({'old': old_ids, 'new': new_ids})

    user_mapping = _build_mapping('clientid')  # clientid -> matrix row number
    item_mapping = _build_mapping('itemid')    # itemid -> matrix column number

    # assign() works on a copy, so the caller's frame keeps its original ids.
    reindexed = user_actions.assign(
        clientid=user_actions['clientid'].map(user_mapping.set_index('old')['new']),
        itemid=user_actions['itemid'].map(item_mapping.set_index('old')['new']),
    )
    return (reindexed, user_mapping, item_mapping)

In [None]:
# Load the user action table from parquet and preview its first rows.
user_actions = pd.read_parquet('../data/user_actions_august.parquet')
user_actions.head()

In [None]:
# Store both id columns as uint32.
user_actions['clientid'] = user_actions['clientid'].astype('uint32')
user_actions['itemid'] = user_actions['itemid'].astype('uint32')

# Encode the action type as a boolean flag: False = 'view', True = 'to_cart'.
# Values missing from the mapping become NaN, and NaN casts to True, so an
# unexpected action type would silently be counted as a cart addition.
# Fail loudly instead (this also catches an accidental re-run of this cell
# on an already-converted column).
action_codes = user_actions['action_type'].map({'view': 0, 'to_cart': 1})
if action_codes.isna().any():
    unknown_actions = user_actions.loc[action_codes.isna(), 'action_type'].unique()
    raise ValueError(f'Unexpected action_type values: {list(unknown_actions)!r}')
user_actions['action_type'] = action_codes.astype('bool')

user_actions.head()

In [None]:
# Show column dtypes, non-null counts and memory usage after the casts.
user_actions.info()

Select the view events and re-index their client and item ids

In [None]:
# Keep only the rows whose action flag is False (views), renumber the rows and
# drop the columns that are not needed for the user-item matrix.
new_user_actions_view = (
    user_actions.loc[~user_actions['action_type']]
    .reset_index(drop=True)
    .drop(columns=['timestamp', 'action_type'])
)

# Replace the raw ids with dense codes and keep the id <-> code mappings.
user_actions_view, user_mapping, item_mapping = newIndex(new_user_actions_view)

user_actions_view.head()

Select the add-to-cart events and re-index their client and item ids

In [None]:
# Keep only the rows whose action flag is True (cart additions), renumber the
# rows, drop the unused columns and replace the raw ids with dense codes.
user_actions_tocart, user_mapping_tocart, item_mapping_tocart = newIndex(
    user_actions.loc[user_actions['action_type']]
    .reset_index(drop=True)
    .drop(columns=['timestamp', 'action_type'])
)

## For the training sample

Map the training targets onto the view and add-to-cart index spaces

In [None]:
# Read the training targets and fix the column dtypes in a single pass.
train_target = pd.read_csv('../data/train_with_scores.csv').astype({
    'clientid': 'uint32',
    'jointitemid': 'uint32',
    'label': 'int8',
    'novelty_cnt': 'int32',
    'count_on_session_view': 'int32',
    'count_on_session_to_cart': 'int32',
})

# By views: translate raw ids into the row/column codes of the view matrix.
# Ids that are absent from the view mapping come out as NaN.
ib_train_target_view = train_target[['clientid', 'itemid', 'jointitemid']].assign(
    user_cat=lambda df: df['clientid'].map(user_mapping.set_index('old')['new']),
    jointitem_cat=lambda df: df['jointitemid'].map(item_mapping.set_index('old')['new']),
)

# By cart additions: same translation using the to-cart mappings.
ib_train_target_tocart = train_target[['clientid', 'itemid', 'jointitemid']].assign(
    user_cat=lambda df: df['clientid'].map(user_mapping_tocart.set_index('old')['new']),
    jointitem_cat=lambda df: df['jointitemid'].map(item_mapping_tocart.set_index('old')['new']),
)
ib_train_target_tocart.info()

In [None]:
# Non-null counts of user_cat / jointitem_cat show how many training rows
# could be mapped to the view index space.
ib_train_target_view.info()

## For the test sample

Map the test targets onto the view and add-to-cart index spaces

In [None]:
# Read the test targets and fix the column dtypes in a single pass.
test_target = pd.read_csv('../data/test_with_scores.csv').astype({
    'clientid': 'uint32',
    'jointitemid': 'uint32',
    'label': 'int8',
    'novelty_cnt': 'int32',
    'count_on_session_view': 'int32',
    'count_on_session_to_cart': 'int32',
})

# By views: translate raw ids into the row/column codes of the view matrix.
# Ids that are absent from the view mapping come out as NaN.
ib_test_target_view = test_target[['clientid', 'itemid', 'jointitemid']].assign(
    user_cat=lambda df: df['clientid'].map(user_mapping.set_index('old')['new']),
    jointitem_cat=lambda df: df['jointitemid'].map(item_mapping.set_index('old')['new']),
)

# By cart additions: same translation using the to-cart mappings.
ib_test_target_tocart = test_target[['clientid', 'itemid', 'jointitemid']].assign(
    user_cat=lambda df: df['clientid'].map(user_mapping_tocart.set_index('old')['new']),
    jointitem_cat=lambda df: df['jointitemid'].map(item_mapping_tocart.set_index('old')['new']),
)
ib_test_target_tocart.info()

In [None]:
# Non-null counts of user_cat / jointitem_cat show how many test rows
# could be mapped to the view index space.
ib_test_target_view.info()

## Building the user-item and item-similarity matrices for views and cart additions

In [None]:
# For views
# Matrix size: (largest client code + 1, largest item code + 1).
shape = tuple(mapping['new'].max() + 1 for mapping in (user_mapping, item_mapping))

# Sparse user-item matrix with a 1 recorded for every (client, item) view row.
view_users = user_actions_view['clientid'].values
view_items = user_actions_view['itemid'].values
user_item_view = sp.csr_matrix(arg1=(np.ones_like(view_users), (view_users, view_items)), shape=shape)

# all_pairs_knn computes an approximate similarity matrix (from the N nearest neighbours).
# The input is the column-normalised user-item matrix, transposed to items x users.
N = 201
view_items_by_users = normalize(user_item_view, axis=0).T
item_similarity_matrix_view = all_pairs_knn(
    view_items_by_users, N, show_progress=True, num_threads=2
).tocsr()

In [None]:
# Zero the diagonal (presumably so an item does not act as its own neighbour
# when scoring), then remove the explicitly stored zeros from the matrix.
item_similarity_matrix_view.setdiag(0.)
item_similarity_matrix_view.eliminate_zeros()

In [None]:
# Display the dimensions of the view-based item similarity matrix.
item_similarity_matrix_view.shape

In [None]:
# For cart additions
# Matrix size: (largest client code + 1, largest item code + 1).
shape = tuple(mapping['new'].max() + 1 for mapping in (user_mapping_tocart, item_mapping_tocart))

# Sparse user-item matrix with a 1 recorded for every (client, item) to-cart row.
tocart_users = user_actions_tocart['clientid'].values
tocart_items = user_actions_tocart['itemid'].values
user_item_tocart = sp.csr_matrix(arg1=(np.ones_like(tocart_users), (tocart_users, tocart_items)), shape=shape)

# all_pairs_knn computes an approximate similarity matrix (from the N nearest neighbours).
# The input is the column-normalised user-item matrix, transposed to items x users.
N = 201
tocart_items_by_users = normalize(user_item_tocart, axis=0).T
item_similarity_matrix_tocart = all_pairs_knn(
    tocart_items_by_users, N, show_progress=True, num_threads=2
).tocsr()

In [None]:
# Zero the diagonal (presumably so an item does not act as its own neighbour
# when scoring), then remove the explicitly stored zeros from the matrix.
item_similarity_matrix_tocart.setdiag(0.)
item_similarity_matrix_tocart.eliminate_zeros()

In [None]:
# Display the dimensions of the to-cart-based item similarity matrix.
item_similarity_matrix_tocart.shape

## Calculating the similarity of the recommended product

In [None]:
def score_item_based(data, item_similarity_matrix, user_item, name_score):
    """Attach an item-based score to every (user, candidate item) row of ``data``.

    For a pair ``(u, j)`` the score is the sum over all items ``i`` of
    ``user_item[u, i] * item_similarity_matrix[j, i]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``user_cat`` (row index into ``user_item``) and
        ``jointitem_cat`` (row index into ``item_similarity_matrix``). Pairs
        are taken only from rows that contain no NaN in any column; every
        other row ends up with a NaN score unless the same pair also occurs
        in a complete row.
    item_similarity_matrix : scipy.sparse matrix
        Item-by-item similarities, row-indexable (e.g. CSR).
    user_item : scipy.sparse matrix
        User-by-item interactions, row-indexable (e.g. CSR), with as many
        columns as ``item_similarity_matrix``.
    name_score : str
        Name of the score column to add.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus the column ``name_score``, with fully duplicated rows
        removed. ``data`` itself is not modified.
    """
    key_cols = ['user_cat', 'jointitem_cat']

    # Score each distinct pair once. Keeping duplicate pairs here would both
    # repeat the sparse computation and multiply rows in the merge below
    # (k occurrences on each side -> k*k rows before drop_duplicates).
    pairs = data.dropna()[key_cols].drop_duplicates().astype('uint32')

    if pairs.empty:
        # Nothing could be mapped: every row gets a missing score.
        return data.assign(**{name_score: np.nan}).drop_duplicates()

    user_rows = user_item[pairs['user_cat'].to_numpy(dtype=np.intp)]
    item_rows = item_similarity_matrix[pairs['jointitem_cat'].to_numpy(dtype=np.intp)]

    # sum(axis=1) yields an (n, 1) matrix; flatten it into a plain 1-D column.
    pairs[name_score] = np.asarray(user_rows.multiply(item_rows).sum(axis=1)).ravel()

    return pd.merge(data, pairs, on=key_cols, how='left').drop_duplicates()

### For the training sample

In [None]:
# Score the training pairs against each matrix, then discard the helper index
# columns so that only the original ids and the score remain.
df_train_view = score_item_based(
    ib_train_target_view, item_similarity_matrix_view, user_item_view, 'item_based_view'
).drop(columns=['user_cat', 'jointitem_cat'])

df_train_tocart = score_item_based(
    ib_train_target_tocart, item_similarity_matrix_tocart, user_item_tocart, 'item_based_tocart'
).drop(columns=['user_cat', 'jointitem_cat'])

df_train_view.info()

In [None]:
# Join the two item-based scores onto the training targets.
# The join keys are spelled out: without `on=` pandas joins on every shared
# column, so if train_target already carries score columns from a previous run
# (the result is written back to the file it was read from) those stale columns
# would become join keys and the fresh scores would be dropped silently.
key_cols = ['clientid', 'itemid', 'jointitemid']
score_cols = ['item_based_view', 'item_based_tocart']

train = train_target.drop(columns=score_cols, errors='ignore')
train = pd.merge(train, df_train_view[key_cols + ['item_based_view']], on=key_cols, how='left')
train = pd.merge(train, df_train_tocart[key_cols + ['item_based_tocart']], on=key_cols, how='left')

# Missing values (including pairs that could not be scored) become 0.
train = train.fillna(0)
train.info()

### For the test sample

In [None]:
# Score the test pairs against the view and the to-cart matrices.
df_test_view = score_item_based(ib_test_target_view, item_similarity_matrix_view, user_item_view, 'item_based_view')
df_test_tocart = score_item_based(ib_test_target_tocart, item_similarity_matrix_tocart, user_item_tocart, 'item_based_tocart')

# The helper index columns are no longer needed once the scores are computed.
df_test_view = df_test_view.drop(columns=['user_cat', 'jointitem_cat'])
df_test_tocart = df_test_tocart.drop(columns=['user_cat', 'jointitem_cat'])

# Join the two scores onto the test targets.
# The join keys are spelled out: without `on=` pandas joins on every shared
# column, so if test_target already carries score columns from a previous run
# (the result is written back to the file it was read from) those stale columns
# would become join keys and the fresh scores would be dropped silently.
key_cols = ['clientid', 'itemid', 'jointitemid']
score_cols = ['item_based_view', 'item_based_tocart']

test_scores = test_target.drop(columns=score_cols, errors='ignore')
test_scores = pd.merge(test_scores, df_test_view[key_cols + ['item_based_view']], on=key_cols, how='left')
test_scores = pd.merge(test_scores, df_test_tocart[key_cols + ['item_based_tocart']], on=key_cols, how='left')

# Missing values (including pairs that could not be scored) become 0.
test_scores = test_scores.fillna(0)
test_scores.info()

## Writing the test and training files

In [None]:
# Persist the enriched targets without the index column.
# NOTE: these are the same paths the targets were read from earlier in the
# notebook, so the input files are overwritten in place.
test_scores.to_csv("../data/test_with_scores.csv", index = False)
train.to_csv("../data/train_with_scores.csv", index = False)