In [103]:
# !pip install implicit==0.4.4

In [None]:
# !pip install catboost

# Import libs

In [104]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [105]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks')


In [106]:
cd '/content/drive/MyDrive/Colab Notebooks'

/content/drive/MyDrive/Colab Notebooks


In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
import catboost as cb

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [108]:
data = pd.read_csv('/content/drive/MyDrive/retail_train.csv')
item_features = pd.read_csv('/content/drive/MyDrive/product.csv')
user_features = pd.read_csv('/content/drive/MyDrive/hh_demographic.csv')

In [109]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


# Set global const

In [110]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

# Process features dataset

In [111]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [112]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [113]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [114]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [115]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [116]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


# Prefilter items

In [117]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=900)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 901


# Make cold-start to warm-start

In [118]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 901
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


# Init/train recommender

In [119]:
recommender = MainRecommender(data_train_matcher)  



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/901 [00:00<?, ?it/s]

In [120]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [121]:
# N = Neighbors
N_PREDICT = 50

In [122]:
%%time
# для понятности расписано все в строчку, без функций, ваша задача уметь оборачивать все это в функции
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=50))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=50))

CPU times: user 18 s, sys: 12.4 s, total: 30.4 s
Wall time: 17 s


In [123]:
%%time
result_eval_matcher['sim_user_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_users_recommendation(x, N=50))

CPU times: user 3min 25s, sys: 2min 44s, total: 6min 9s
Wall time: 3min 13s


In [124]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [125]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

### Recall@50 of matching

In [126]:
TOPK_RECALL = 50

In [127]:
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.07144021591646678),
 ('als_rec', 0.04653654329061527),
 ('sim_item_rec', 0.03950705739474679),
 ('sim_user_rec', 0.021927907333331557)]

### Precision@5 of matching

In [128]:
TOPK_PRECISION = 5

In [129]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.20120874012087206),
 ('sim_item_rec', 0.06843328684332883),
 ('sim_user_rec', 0.04611808461180879),
 ('als_rec', 0.03542538354253857)]

### Обучаем модель 2-ого уровня на выбранных кандидатах

## Подготовка данных для трейна

In [130]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [131]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [132]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[12810391, 1016800, 926905, 904129, 917033, 55..."
1,2021,"[863762, 1013928, 9835223, 6534077, 1025535, 9..."


In [133]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [134]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [135]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,12810391
0,2070,1016800
0,2070,926905
0,2070,904129


### Check warm start

In [136]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (107550, 2) Users: 2151 Items: 898


In [137]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


In [138]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [139]:
df_ranker_train.target.value_counts()

0.0    97147
1.0     9215
Name: target, dtype: int64

In [140]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,12810391,1.0
1,2070,1016800,0.0


## Подготавливаем фичи для обучения модели

### Описательные фичи

In [141]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [142]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [143]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,12810391,1.0,3452,MEAT,National,PORK,ENHANCED,,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1016800,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [144]:
df_ranker_train['manufacturer'].value_counts()

69      20579
2       10219
103      5404
1208     3701
397      2273
        ...  
754         9
407         8
289         6
516         3
2357        2
Name: manufacturer, Length: 288, dtype: int64

### Поведенческие фичи

In [145]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=[USER_COL, ITEM_COL]).agg('quantity').sum().rename('user_item_quantity_value'), how='left',on=[USER_COL, ITEM_COL])
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=[USER_COL, ITEM_COL]).agg(USER_COL).count().rename('user_item_user_freq'), how='left',on=[USER_COL, ITEM_COL])

In [146]:
df_ranker_train.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,hh_comp_desc,household_size_desc,kid_category_desc,total_item_sales_value,total_quantity_value,user_freq,total_user_sales_value,item_freq_per_basket,user_item_quantity_value,user_item_user_freq
0,2070,12810391,1.0,3452,MEAT,National,PORK,ENHANCED,,45-54,...,Unknown,1,None/Unknown,5954.24,363,1996,5754.86,0.001334,8.0,6.0
1,2070,1016800,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,Unknown,1,None/Unknown,2234.13,745,1996,5754.86,0.001835,10.0,5.0
2,2070,926905,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,45-54,...,Unknown,1,None/Unknown,2420.71,786,1996,5754.86,0.002255,9.0,5.0


In [147]:
pur_week = data.copy()
pur_week['week'] = pur_week['day'] // 7 +1
pur_week.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,week
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0,1
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0,1
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,1


*Среднее Кол-во покупок в неделю, средняя сумма покупок в неделю за товар*

In [148]:
pur_week_item = pur_week.groupby(['item_id', 'week']).agg({'quantity': 'sum', 'sales_value': 'sum'}).rename(columns={"quantity": "count_item", "sales_value": "sum_item"})
pur_week_item = pur_week_item.groupby(['item_id']).agg({'count_item': 'mean', 'sum_item': 'mean'})
df_ranker_train = pd.merge(df_ranker_train, pur_week_item, on= 'item_id', how='left')

*Список купленных товаров*

In [149]:
pur=df_ranker_train.loc[df_ranker_train['target'] == 1, ['user_id', 'item_id']]

In [150]:
pur = pur.groupby(['user_id']).agg({'item_id': lambda x: list(x)}).rename(columns={"item_id": "purchases"})

In [151]:
df_ranker_train= pd.merge(df_ranker_train, pur, on= 'user_id', how='left')

## Обучение модели ранжирования

In [152]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [153]:
categorical = X_train.dtypes[X_train.dtypes == "object"].index.tolist()
X_train[categorical] = X_train[categorical].astype(str)

In [155]:
params = {'silent': True,
          'objective' : 'CrossEntropy',
          "eval_metric": "Precision",
          'n_estimators': 650,
          "max_depth": 6

} 


model = cb.CatBoostClassifier(**params)
model.fit(X_train, y_train, categorical)

<catboost.core.CatBoostClassifier at 0x7fcd327f9050>

Оптимизация


In [156]:
# !pip install bayesian-optimization

In [158]:
# from bayes_opt import BayesianOptimization

In [None]:
params = {
    'silent': True,
          'objective' : 'CrossEntropy',
          "eval_metric": "Precision",
          'n_estimators': 650,
          "max_depth": 6
}

In [None]:
def optimize_params(learning_rate):
    X_train = df_ranker_train.drop('target', axis=1)
    y_train = df_ranker_train[['target']]
    categorical = X_train.dtypes[X_train.dtypes == "object"].index.tolist()
    X_train[categorical] = X_train[categorical].astype(str)
    # params["max_depth"] = int(max_depth)
    # params["min_data_in_leaf"] = int(min_data_in_leaf)
    params["learning_rate"] = learning_rate
    # params['n_estimators'] = int(n_estimators)
    model = cb.CatBoostClassifier(**params)
    model.fit(X_train, y_train, categorical)

    train_preds = model.predict_proba(X_train)
    df_ranker_predict = df_ranker_train.copy()
    df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
    result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
    result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
    result_eval_ranker.head(2)
    result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))
    res =sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)
    return res[0][1]
    

In [None]:
params_search = {

    "learning_rate" : (0.01, 0.5)
    
}

cat_params_search = BayesianOptimization(
    optimize_params,
    pbounds=params_search,
    random_state=27
)
cat_params_search.maximize(
    init_points=3, n_iter=5, acq='ei'
)

In [None]:
optimal_cat_params =  cat_params_search.max

In [None]:
optimal_cat_params

In [None]:
#  !pip install optuna

In [None]:
import optuna

In [None]:
def objective(trial):
    X_train = df_ranker_train.drop('target', axis=1)
    y_train = df_ranker_train[['target']]
    categorical = X_train.dtypes[X_train.dtypes == "object"].index.tolist()
    X_train[categorical] = X_train[categorical].astype(str)
    param = {
          'objective' : 'CrossEntropy',
          "eval_metric": "Precision",
          'n_estimators': 524,
           "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1)
       
    }

   
    model = cb.CatBoostClassifier(**param)
    model.fit(X_train, y_train, categorical)

    train_preds = model.predict_proba(X_train)
    df_ranker_predict = df_ranker_train.copy()
    df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
    train_preds = model.predict_proba(X_train)
    df_ranker_predict = df_ranker_train.copy()
    df_ranker_predict['proba_item_purchase'] = train_preds[:,1]
    result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
    result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
    result_eval_ranker.head(2)
    result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))
    res =sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)
    return res[0][1]

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=10)

In [159]:
train_preds = model.predict_proba(X_train)
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

*Подбор фичей*


In [None]:
from sklearn.inspection import permutation_importance

importance = permutation_importance(
    model, X_train,  y_train, scoring="precision", n_jobs=-1, random_state=27
)

In [None]:
importance_scores = pd.DataFrame({
    "features": X_train.columns,
    "importance-mean": importance.importances_mean,
    "importance-std": importance.importances_std,
})
importance_scores = importance_scores.sort_values(
    by="importance-mean", ascending=False
)
importance_scores = importance_scores.reset_index(drop=True)
decrease_scores = importance_scores[importance_scores["importance-mean"]<=0]
decrease_scores = decrease_scores.reset_index(drop=True)
decrease_scores

# Evaluation on test dataset

In [160]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [161]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

CPU times: user 2.72 s, sys: 41.8 ms, total: 2.76 s
Wall time: 2.75 s


In [162]:
sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1675490196078412)]

## Eval re-ranked matched result on test dataset

In [163]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [166]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [167]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.28887728459529854)
('own_rec', 0.1675490196078412)


  return flags.sum() / len(recommended_list)


# Оценка на тесте для выполнения курсового проекта

In [168]:
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/retail_test1.csv')

In [169]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [170]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [171]:
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [172]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.23513513513513293)


  return flags.sum() / len(recommended_list)


('reranked_own_rec', 0.23513513513513293)
