### ДЗ_3. Подбор оптимальных гиперпараметров для ALS

- Попробуйте улучшить базовый вариант ALS, изменяя следующие параметры
  - regularization, iterations
  - factors
  - Вес (TF_IDF, BM25  взвешивание)
  
- Посчитайте метрики (Precision@5, MAP@5) для разных наборов гиперпараметров и выберете лучший набор

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


import numpy as np
import pandas as pd
from matplotlib.pyplot import cm
import pickle

from scipy.spatial.distance import cdist

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import seaborn as sns

In [2]:
data = pd.read_csv('retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(3)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


In [4]:
# Товары с частотой приобретения менее 5000 заменим ПУСТЫШКОЙ

popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)

top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()

In [5]:
# Заведем фиктивный item_id (если юзер покупал товары не из топ-5000, то он "купил" такой товар)
data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 999999

user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробовать другие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(3)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15778533,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Введем словари-идентификаторы 

userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

In [85]:
def get_recommendations(user, model, sparse_user_item, N=5):
    res = [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[user], 
                                    user_items=sparse_user_item[userid_to_id[user]],   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]]
    return res

In [86]:
def precision_at_k(recommended_list, bought_list, k=5):
    
    bought_list = np.array(bought_list)
    recommended_list = np.array(recommended_list)
    
    bought_list = bought_list  # Тут нет [:k] !!
    recommended_list = recommended_list[:k]
    
    flags = np.isin(recommended_list, bought_list)
    
    precision = flags.sum() / len(recommended_list)
    
    
    return precision


In [29]:
%%time

#  СОБЕРЕМ МОДЕЬ 

model = AlternatingLeastSquares(factors=100, 
                                regularization=0.001,
                                iterations=15, 
                                calculate_training_loss=True, 
                                num_threads=4,
                                random_state=42)

model.fit(sparse_user_item,  # На вход item-user matrix
          show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: total: 19.9 s
Wall time: 2.29 s


In [107]:
def made_model(factors=100, regularization=0.001, iterations=15, model_in=sparse_user_item):
    model = AlternatingLeastSquares(factors=factors, 
                                    regularization=regularization,
                                    iterations=iterations, 
                                    calculate_training_loss=True, 
                                    num_threads=4,
                                    random_state=42)

    try:model.fit(model_in,  # На вход item-user matrix
              show_progress=False)
    except: print('--')
    
    try:
        result['als'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))
        rrr = result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()
    except:
        print('--')
        rrr = 0

    return rrr

In [30]:
%%time
    
result['als'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, N=5))
result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()

CPU times: total: 26.1 s
Wall time: 2.96 s


0.18050930460332765

In [80]:
tfidf_user_item_matrix = tfidf_weight(user_item_matrix).tocsr() # Применяется к item-user матрице ! 

In [81]:
bm25_user_item_matrix = bm25_weight(user_item_matrix.T).T.tocsr() # Применяется к item-user матрице ! 

In [100]:
factors = [1, 10, 50, 100, 150]
regularization = [0.001, 0.05]
iterations = [10 , 15, 30]
model = [sparse_user_item, tfidf_user_item_matrix, csr_matrix(bm25_user_item_matrix).tocsr()]

In [108]:
max_prec = 0
for z_model in model:
    for i_fact in factors:
        for j_reg in regularization:
            for k_iteration in iterations:
                print(f'model,  factors = {i_fact} ,  regularization = {j_reg},   iterations = {k_iteration}. precision_at_k = {made_model(i_fact, j_reg, k_iteration, z_model)}')

model,  factors = 1 ,  regularization = 0.001,   iterations = 10. precision_at_k = 0.2100881488736506
model,  factors = 1 ,  regularization = 0.001,   iterations = 15. precision_at_k = 0.2100881488736506
model,  factors = 1 ,  regularization = 0.001,   iterations = 30. precision_at_k = 0.2100881488736506
model,  factors = 1 ,  regularization = 0.05,   iterations = 10. precision_at_k = 0.2100881488736506
model,  factors = 1 ,  regularization = 0.05,   iterations = 15. precision_at_k = 0.2100881488736506
model,  factors = 1 ,  regularization = 0.05,   iterations = 30. precision_at_k = 0.2100881488736506
model,  factors = 10 ,  regularization = 0.001,   iterations = 10. precision_at_k = 0.19157688540646128
model,  factors = 10 ,  regularization = 0.001,   iterations = 15. precision_at_k = 0.19108716944172074
model,  factors = 10 ,  regularization = 0.001,   iterations = 30. precision_at_k = 0.1891283055827591
model,  factors = 10 ,  regularization = 0.05,   iterations = 10. precision_at_k

In [111]:
# Наилучшие параметры:
# model= tfidf_weight,  factors = 50 ,  regularization = 0.05,   iterations = 30. precision_at_k = 0.242605288932416