In [1]:
import pandas as pd
import numpy as np

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import pre_filter_items, train_test_split
from src.recommenders import MainRecommender, SecondLevelRecommender, DataTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

In [3]:
data = pre_filter_items(data, item_features=item_features, take_n_popular=5000)

In [4]:
transformer = DataTransformer(data, user_features, item_features)

In [5]:
data_train_1, data_valid_1, data_train_2, data_valid_2 = train_test_split(data)

In [6]:
recommender = MainRecommender(data_train_1)

In [7]:
recommender.fit(n_factors=40)



In [8]:
result_1 = transformer.valid_items(data_valid_1, data_train_1)

In [9]:
result_1 = recommender.df_als_predictions(result_1)

Precision and recall evaluated at level 1 validation data

In [10]:
transformer.eval_recall_at_k(result_1, 'als_candidates')

0.3545502234053404

In [11]:
transformer.eval_precision_at_k(result_1, 'als_candidates')

0.14072963518240716

ALS precision at level 2 validation data

In [12]:
result_2 = transformer.valid_items(data_valid_2, data_train_1)
result_2 = recommender.df_als_predictions(result_2)
transformer.eval_precision_at_k(result_2, 'als_candidates')

0.11211477151965932

In [13]:
data = transformer.data

In [14]:
transformer.total_purchases

user_id
1        64
2        33
3        36
4        28
5        23
       ... 
2496     44
2497    164
2498    110
2499     56
2500     79
Name: total_purchases, Length: 2478, dtype: int64

In [18]:
users_lvl_2 = pd.DataFrame(result_1['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, 50))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.tail(4)

Unnamed: 0,user_id,item_id,flag
2000,2500,12808385,1
2000,2500,1122928,1
2000,2500,1114483,1
2000,2500,925854,1


In [20]:
data_train_lvl_2 = data_valid_1.copy()

In [21]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [23]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1,856942,1.0,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown
1,1,856942,1.0,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,FRUIT/BREAKFAST BREAD,16 OZ,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown


In [24]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']

In [25]:
categorical = ['manufacturer',
               'department',
               'brand',
               'commodity_desc',
               'age_desc',
               'income_desc',
               'household_size_desc']

In [26]:
X_train[categorical] = X_train[categorical].astype('category')

In [28]:
recommender_second = SecondLevelRecommender(categorical)

In [30]:
# recommender_second.fit(X_train, y_train)

In [None]:
train_preds = recommender_second.predict(X_train)

In [None]:
train_preds

In [None]:
targets_lvl_2['preds'] = train_preds

In [None]:
targets_lvl_2.sort_values(['user_id', 'preds'], ascending=[True, False], inplace=True)

In [None]:
lgb_candidates = targets_lvl_2.groupby('user_id').head(5).groupby('user_id')['item_id'].unique().reset_index()

In [None]:
valid_lvl_2 = valid_lvl_2.merge(lgb_candidates, on='user_id', how='left')

In [None]:
valid_lvl_2.rename(columns={'item_id': 'lgb_candidates'}, inplace=True)

In [None]:
valid_lvl_2[valid_lvl_2.als_candidates.notna()].\
apply(lambda row: precision_at_k(row['lgb_candidates'], row['actual'], k=5), axis=1).mean()