In [1]:
import pandas as pd
import numpy as np

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import pre_filter_items
from src.recommenders import MainRecommender, SecondLevelRecommender, DataTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

In [3]:
data = pre_filter_items(data, item_features=item_features, take_n_popular=5000)

In [4]:
transformer = DataTransformer(data, user_features, item_features)

In [5]:
data_train_1, data_valid_1, data_train_2, data_valid_2 = transformer.train_test_split()

In [6]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [7]:
recommender = MainRecommender(data_train_1)

In [8]:
recommender.fit(n_factors=40)



In [9]:
result_1 = transformer.valid_items(data_valid_1, data_train_1)

In [10]:
result_1 = recommender.df_als_predictions(result_1)

Precision and recall evaluated at level 1 validation data

In [11]:
transformer.eval_recall_at_k(result_1, 'als_candidates')

0.36191916549835906

In [12]:
transformer.eval_precision_at_k(result_1, 'als_candidates')

0.14342828585706982

In [9]:
als_candidates = result_lvl_1[['user_id', 'als_candidates']]

In [10]:
valid_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index().rename(columns={'item_id': 'actual'})

In [11]:
valid_lvl_2 = valid_lvl_2.merge(als_candidates, on='user_id', how='left')

In [12]:
valid_lvl_2[valid_lvl_2.als_candidates.notna()].\
apply(lambda row: precision_at_k(row['als_candidates'], row['actual'], k=5), axis=1).mean()

0.12100643651257993

In [13]:
prices = data.groupby('item_id')['sales_value'].max()

item_features = item_features.merge(prices, on='item_id', how='left').rename(columns={'sales_value': 'price'})

avg_price_by_cat = item_features.groupby('commodity_desc')['price'].mean()
item_features = item_features.merge(avg_price_by_cat, on='commodity_desc', how='left').rename(columns={'price_x': 'price', 'price_y': 'avg_price'})

item_features_test = item_features[['item_id',
                                    'manufacturer',
                                    'department',
                                    'brand',
                                    'commodity_desc',
                                    'price',
                                    'avg_price']]



In [14]:
basket_price = data.groupby('basket_id')['sales_value'].sum()

In [15]:
data = data.merge(basket_price, on='basket_id', how='left').rename(columns={'sales_value_x': 'sales_value', 'sales_value_y': 'basket_price'})

In [16]:
avg_basket_per_user = data.groupby('user_id').basket_price.mean()

In [17]:
user_features = user_features.merge(avg_basket_per_user, on='user_id', how='left')

In [18]:
x = data.groupby(['user_id', 'week_no']).basket_id.count().reset_index().groupby('user_id').agg(['count', 'sum'])
x.columns = x.columns.droplevel(0)
avg_purchases_per_week = x.iloc[:, 2:].apply(lambda row: row['sum']/row['count'], axis=1)
avg_purchases_per_week.name = 'avg_purchases_per_week'
user_features = user_features.merge(avg_purchases_per_week, on='user_id', how='left')

user_features_test = user_features[['user_id',
                                    'age_desc',
                                    'income_desc',
                                    'household_size_desc',
                                    'avg_purchases_per_week',
                                    'basket_price']]

In [19]:
users_lvl_2 = pd.DataFrame(result_lvl_1['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, 50))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.tail(4)

Unnamed: 0,user_id,item_id,flag
2000,2500,9676938,1
2000,2500,9677100,1
2000,2500,1029688,1
2000,2500,823576,1


In [20]:
data_train_lvl_2 = data_val_lvl_1.copy()

In [21]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [22]:
targets_lvl_2 = targets_lvl_2.merge(item_features_test, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features_test, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,price,avg_price,age_desc,income_desc,household_size_desc,avg_purchases_per_week,basket_price
0,1,856942,1.0,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,15.0,7.213012,65+,35-49K,2,7.548387,32.268932
1,1,856942,1.0,159,GROCERY,National,BAKED BREAD/BUNS/ROLLS,15.0,7.213012,65+,35-49K,2,7.548387,32.268932


In [23]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']

In [24]:
categorical = ['manufacturer',
               'department',
               'brand',
               'commodity_desc',
               'age_desc',
               'income_desc',
               'household_size_desc']

In [25]:
X_train[categorical] = X_train[categorical].astype('category')

In [26]:
recommender_second = SecondLevelRecommender(X_train, y_train, categorical)

In [27]:
recommender_second.fit()

In [28]:
train_preds = recommender_second.predict(X_train)

In [29]:
train_preds

array([0.43614854, 0.43614854, 0.21786214, ..., 0.00917355, 0.05676807,
       0.07600654])

In [30]:
targets_lvl_2['preds'] = train_preds

In [31]:
targets_lvl_2.sort_values(['user_id', 'preds'], ascending=[True, False], inplace=True)

In [32]:
lgb_candidates = targets_lvl_2.groupby('user_id').head(5).groupby('user_id')['item_id'].unique().reset_index()

In [33]:
valid_lvl_2 = valid_lvl_2.merge(lgb_candidates, on='user_id', how='left')

In [34]:
valid_lvl_2.rename(columns={'item_id': 'lgb_candidates'}, inplace=True)

In [35]:
valid_lvl_2[valid_lvl_2.als_candidates.notna()].\
apply(lambda row: precision_at_k(row['lgb_candidates'], row['actual'], k=5), axis=1).mean()

0.19448020284766787