In [1]:
import pandas as pd
import numpy as np

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.metrics import precision_at_k, recall_at_k
from src.utils import pre_filter_items, train_test_split
from src.recommenders import MainRecommender, SecondLevelRecommender, DataTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

In [3]:
data = pre_filter_items(data, item_features=item_features, take_n_popular=5000)

In [4]:
transformer = DataTransformer(data, user_features, item_features)

In [5]:
transformer.transform()

In [6]:
data = transformer.data

In [7]:
data

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,month,weekend
0,2375,26984851516,1,1085983,1,2.99,364,-0.40,1642,1,...,18 OZ,,,,,,,,1,False
1,1364,26984896261,1,937406,1,2.50,31742,-0.99,1520,1,...,12OZ,65+,B,100-124K,Homeowner,Single Female,1,None/Unknown,1,False
2,1172,26985025264,1,1000493,1,4.44,396,-0.89,946,1,...,,25-34,B,50-74K,Unknown,Single Male,1,None/Unknown,1,False
3,1172,26985025264,1,1075214,1,5.99,396,-3.00,946,1,...,430.8 SQFT,25-34,B,50-74K,Unknown,Single Male,1,None/Unknown,1,False
4,1172,26985025264,1,5569230,1,2.33,396,-2.26,946,1,...,12 OZ,25-34,B,50-74K,Unknown,Single Male,1,None/Unknown,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620584,1754,41653239425,663,1101010,1,3.02,343,-0.50,1609,95,...,,,,,,,,,4,False
620585,1754,41653239425,663,5569230,4,10.00,343,-8.76,1609,95,...,12 OZ,,,,,,,,4,False
620586,2078,41653241879,663,871570,1,2.50,343,-1.29,2129,95,...,16 OZ,,,,,,,,4,False
620587,2078,41653241879,663,1074754,1,2.68,343,0.00,2129,95,...,18 OZ,,,,,,,,4,False


In [10]:
categorical = transformer.categorical

In [9]:
data_train_1, data_valid_1, data_train_2, data_valid_2 = train_test_split(data)

In [11]:
recommender = MainRecommender(data_train_1)

In [12]:
recommender.fit(n_factors=40)



In [13]:
result_1 = transformer.valid_items(data_valid_1, data_train_1)

In [14]:
result_1 = recommender.df_als_predictions(result_1)

Precision and recall evaluated at level 1 validation data

In [17]:
transformer.eval_recall_at_k(result_1, 'als_candidates')

0.3575070235766455

In [18]:
transformer.eval_precision_at_k(result_1, 'als_candidates')

0.14442778610694487

In [19]:
users_lvl_2 = pd.DataFrame(data_train_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

train_users = data_train_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, 200))

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

Level 2 dataframe

In [26]:
targets_lvl_2 = data_train_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

targets_lvl_2 = targets_lvl_2.merge(transformer.item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(transformer.user_features, on='user_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(transformer.purchases_in_category,  on=['user_id', 'commodity_desc'], how='left')
targets_lvl_2.purchases_in_category = targets_lvl_2.purchases_in_category.fillna(0)

targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,weekend_purchases_ratio,user_avg_basket_price,purchases_per_month,purchases_in_category
0,2021,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,,,,0.0
1,2021,1106523,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,,,,0.0
2,2021,1013928,0.0,6046,DRUG GM,National,PREPAID WIRELESS&ACCESSORIES,PREPAID WIRELESS CARDS,,,,,,,,,,,,8.0
3,2021,1097398,0.0,111,DRUG GM,National,CIGARETTES,CIGARETTES,CTN,,,,,,,,,,,4.0
4,2021,5569230,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,,,,,,,,,,,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403491,227,995478,0.0,1646,PRODUCE,National,SALAD MIX,SALAD SPINACH,6 OZ,,,,,,,,,,,0.0
403492,227,10149640,0.0,1011,GROCERY,National,BATH TISSUES,TOILET TISSUE,,,,,,,,,,,,0.0
403493,227,965842,0.0,5817,PRODUCE,National,TOMATOES,TOMATOES CHERRY,12 OZ,,,,,,,,,,,0.0
403494,227,825994,0.0,3126,PRODUCE,National,VALUE ADDED FRUIT,INSTORE CUT FRUIT,,,,,,,,,,,,0.0


In [27]:
transformer.user_features

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,weekend_purchases_ratio,user_avg_basket_price,purchases_per_month
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,Unknown,1,0.0,25.112969,16.00
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,Unknown,7,0.0,23.442326,10.75
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,0.0,20.305402,21.75
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13,0.0,23.554480,31.25
4,45-54,B,50-74K,Homeowner,Single Female,1,Unknown,16,0.0,3.995000,2.00
...,...,...,...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,Unknown,2494,0.0,24.751250,6.00
797,45-54,A,75-99K,Homeowner,Unknown,3,1,2496,0.0,38.765000,11.00
798,45-54,U,35-49K,Unknown,Single Male,1,Unknown,2497,0.0,19.357988,41.00
799,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,Unknown,2498,0.0,9.943545,27.50


In [36]:
targets_lvl_2 = data_train_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [38]:
targets_lvl_2

Unnamed: 0,user_id,item_id,target
0,2021,1029743,0.0
1,2021,1106523,0.0
2,2021,1013928,0.0
3,2021,1097398,0.0
4,2021,5569230,0.0
...,...,...,...
403491,227,995478,0.0
403492,227,10149640,0.0
403493,227,965842,0.0
403494,227,825994,0.0


In [24]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2['target']

In [25]:
categorical = ['manufacturer',
               'department',
               'brand',
               'commodity_desc',
               'age_desc',
               'income_desc',
               'household_size_desc']

In [26]:
X_train[categorical] = X_train[categorical].astype('category')

In [28]:
recommender_second = SecondLevelRecommender(categorical)

In [30]:
# recommender_second.fit(X_train, y_train)

In [None]:
train_preds = recommender_second.predict(X_train)

In [None]:
train_preds

In [None]:
targets_lvl_2['preds'] = train_preds

In [None]:
targets_lvl_2.sort_values(['user_id', 'preds'], ascending=[True, False], inplace=True)

In [None]:
lgb_candidates = targets_lvl_2.groupby('user_id').head(5).groupby('user_id')['item_id'].unique().reset_index()

In [None]:
valid_lvl_2 = valid_lvl_2.merge(lgb_candidates, on='user_id', how='left')

In [None]:
valid_lvl_2.rename(columns={'item_id': 'lgb_candidates'}, inplace=True)

In [None]:
valid_lvl_2[valid_lvl_2.als_candidates.notna()].\
apply(lambda row: precision_at_k(row['lgb_candidates'], row['actual'], k=5), axis=1).mean()