# Данные
http://jmcauley.ucsd.edu/data/amazon/links.html, SMALL выборка из категории Amazon Fashion	
- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a disctionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product

In [115]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import warnings
warnings.filterwarnings("ignore")

# Предобработка

In [112]:
df = pd.read_json("datasets/23/AMAZON_FASHION_5.json", lines=True)
df = df.loc[df.astype(str).drop_duplicates().index]
print(df.shape)
df.head(3)

(3108, 12)


Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,B000K2PJ4K,,5,Great product and price!,"09 4, 2015",ALJ66O1Y6SLHA,Tonya B.,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Five Stars,1441324800,True,
1,B000K2PJ4K,,5,Great product and price!,"09 4, 2015",ALJ66O1Y6SLHA,Tonya B.,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Five Stars,1441324800,True,
2,B000K2PJ4K,,5,Great product and price!,"09 4, 2015",ALJ66O1Y6SLHA,Tonya B.,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Five Stars,1441324800,True,


In [113]:
grouped_purchased = df.groupby(["reviewerID", "asin"]).size().to_frame("Quantity").reset_index()
grouped_purchased.rename(columns={'reviewerID': 'CustomerID', 'asin': 'StockCode'}, inplace=True)
grouped_purchased.head(3)

Unnamed: 0,CustomerID,StockCode,Quantity
0,A10RXRZE0TAKPU,B0014F7B98,1
1,A10RXRZE0TAKPU,B001IKJOLW,1
2,A10RXRZE0TAKPU,B0058YEJ5K,1


In [70]:
customers = list(np.sort(grouped_purchased["CustomerID"].unique())) 
products = list(grouped_purchased["StockCode"].unique()) 
quantity = list(grouped_purchased["Quantity"])

rows = grouped_purchased["CustomerID"].astype('category').cat.codes 
cols = grouped_purchased["StockCode"].astype('category').cat.codes 
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

len(customers), len(products), len(quantity)

(406, 31, 3042)

In [71]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] 
num_purchases = len(purchases_sparse.nonzero()[0]) 
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

75.83028762116638

# Моделирование (base-line)

In [114]:
import random
import pickle
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

In [73]:
def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy()
    test_set[test_set != 0] = 1
    training_set = ratings.copy() 
    nonzero_inds = training_set.nonzero() 
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) 
    random.seed(0) 
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) 
    samples = random.sample(nonzero_pairs, num_samples) 
    user_inds = [index[0] for index in samples] 
    item_inds = [index[1] for index in samples] 
    training_set[user_inds, item_inds] = 0 
    training_set.eliminate_zeros() 
    return training_set, test_set, list(set(user_inds))  

In [74]:
product_train, product_test, product_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [116]:
model = LightFM(loss='warp')
model.fit_partial(product_train, epochs=40, num_threads=2)

# with open('saved_model','wb') as f:
#             saved_model={'model':model}
#             pickle.dump(saved_model, f)


train_auc = auc_score(model, product_train).mean()
test_auc = auc_score(model, product_test).mean()

print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

AUC: train 1.00, test 0.95.


# Рекомендация

In [77]:
grouped_purchased.head(1)

Unnamed: 0,CustomerID,StockCode,Quantity
0,A10RXRZE0TAKPU,B0014F7B98,1


In [79]:
ziped = np.array(list(zip(grouped_purchased.StockCode.astype('category'), cols)))
mapping = pd.Series(ziped[:, 0], index=ziped[:, 1].astype('int32')).drop_duplicates()

In [80]:
mapping.head()

6     B0014F7B98
11    B001IKJOLW
14    B0058YEJ5K
15    B005AGO4LU
16    B0092UF54A
dtype: object

In [117]:
# Формируем таблицу товаров. К сожалению у нас нет описания товара, поэтому берем reviewText
item_lookup = df[['asin', 'reviewerID', 'reviewText']].drop_duplicates()
item_lookup.index = item_lookup.asin
item_lookup = item_lookup[['reviewText']]
item_lookup.head(3)

Unnamed: 0_level_0,reviewText
asin,Unnamed: 1_level_1
B000K2PJ4K,Great product and price!
B000K2PJ4K,Waaay too small. Will use for futur children!
B000K2PJ4K,Stays vibrant after many washes


In [121]:
model.item_embeddings.shape

(31, 10)

In [122]:
from sklearn.metrics.pairwise import cosine_similarity

def display_item_to_items_recommendations(model, item_id):

    products_arr = np.array(products) 
    
    item_id = np.where(products_arr == item_id)[0][0]
    print(item_id)
  
    return item_lookup['reviewText'][cosine_similarity( # первый элемент массива?
            model.item_embeddings)[item_id].argsort()[-5:][::-1]]



# Рекомендалка в деле 

In [123]:
display_item_to_items_recommendations(model, 'B000YFSR4W')

12


asin
B000V0IBDM    Relieved my Plantar Fascitis for 3 Days. Then ...
B000K2PJ4K          Waaay too small. Will use for future child.
B000K2PJ4K    My son really likes the pink. Ones which I was...
B000YFSR5G    Good product for the price.  Used very day and...
B000YFSR5G    Fit perfectly. I bought dark grey, and they di...
Name: reviewText, dtype: object

В итоге, к товару B000YFSR4W получили рекомендация купить другие товары с идентификаторами B000V0IBDM, B000K2PJ4K, B000YFSR5G etc. Из датасета мы не знаем какие это товары. Стоит повторить эксперимент с названиями и описаниями товаров