Simple model using approach described here: https://developers.google.com/machine-learning/recommendation/collaborative/matrix

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os
import datetime

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [3]:
# os.chdir('..')
os.chdir('drive/My Drive/Colab Notebooks/Github/fashion-recommendations') 

In [4]:
from fashion_recommendations.metrics.average_precision import mapk

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
train_set_orig = pd.read_csv('data/splits/train_subset.tsv', sep='\t', dtype={'article_id': str})
print(train_set_orig.shape)
train_set_orig.head()

(588758, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2


In [7]:
test_set = pd.read_csv('data/splits/test_subset.tsv', sep='\t', dtype={'article_id': str})
print(test_set.shape)
test_set.head()

(32995, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-15,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,0.061,2
1,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
2,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
3,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,610776105,0.008458,2
4,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,929745001,0.050831,2


### Map `customer_id` and `article_id` to indices

Test set doesn't include anyone not in training set

In [8]:
unique_customers_df = train_set_orig[['customer_id']].drop_duplicates().reset_index(drop=True)

customer_id_to_idx = dict(
    zip(
        unique_customers_df['customer_id'],
        unique_customers_df.index
    )
)

len(customer_id_to_idx)

9208

In [9]:
articles_df = pd.read_csv(
    'data/articles.csv', 
    dtype={'article_id': str}, 
    usecols=['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'detail_desc']
)
print(articles_df.shape)
articles_df.head()

(105542, 5)


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."


In [10]:
article_id_to_idx = dict(
    zip(
        articles_df['article_id'],
        articles_df.index
    )
)

In [11]:
train_set_orig['customer_id_idx'] = train_set_orig['customer_id'].map(customer_id_to_idx)
train_set_orig['article_id_idx'] = train_set_orig['article_id'].map(article_id_to_idx)

train_set_orig.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,0,40179
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,0,10520
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1,1,36302
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1,1,8963
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2,2,41390


### Create development set

In [12]:
train_set_orig['t_dat'] = pd.to_datetime(train_set_orig['t_dat'])

In [13]:
end = train_set_orig['t_dat'].max()
endm1 = end - datetime.timedelta(days=7)

endm1, end

(Timestamp('2020-09-07 00:00:00'), Timestamp('2020-09-14 00:00:00'))

In [14]:
train_set_orig[train_set_orig['t_dat'] < endm1]['customer_id'].nunique()

9148

In [15]:
train_set_orig[train_set_orig['t_dat'] >= endm1]['customer_id'].nunique()

1998

In [16]:
# Remove customers from dev set who are not in the new training set
customers_to_remove = (
    set(train_set_orig[train_set_orig['t_dat'] >= endm1]['customer_id']) - 
    set(train_set_orig[train_set_orig['t_dat'] < endm1]['customer_id'])
)

In [17]:
len(customers_to_remove)

60

In [18]:
train_set = train_set_orig.copy()[train_set_orig['t_dat'] < endm1]

dev_set = train_set_orig.copy()[train_set_orig['t_dat'] >= endm1]

train_set.shape, dev_set.shape

((580395, 7), (8363, 7))

In [19]:
dev_set = dev_set.copy()[~dev_set['customer_id'].isin(customers_to_remove)]
print(dev_set.shape)
dev_set.head()

(8131, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
580395,2020-09-07,00639e775b90554156986100685c4d408723c77e411e37...,891591007,0.084729,2,732,100648
580396,2020-09-07,00639e775b90554156986100685c4d408723c77e411e37...,891591001,0.084729,2,732,100645
580397,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,909924004,0.033881,2,1057,103196
580398,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,921266005,0.016932,2,1057,104310
580399,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,857163001,0.013542,2,1057,93169


In [20]:
dev_set_by_customer = dev_set.groupby('customer_id').apply(lambda x: list(x['article_id_idx'])).reset_index().rename(columns={0: 'article_id_idx'})
dev_set_by_customer.head()

Unnamed: 0,customer_id,article_id_idx
0,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,[102443]
1,0054c50274d19af58d53ef3ce0c004bea446c80bd51cf2...,"[72000, 16023, 71167]"
2,00639e775b90554156986100685c4d408723c77e411e37...,"[100648, 100645, 100645]"
3,00798bd464457d23d6af401715fe32d5c676ad9ee4010d...,[103885]
4,0099238196d8f71659fceaa115b36e400398bcfc169b5f...,"[95789, 95789, 3091, 99184, 99184, 93416, 9341..."


In [21]:
customer_id_idx_ordered = dev_set_by_customer['customer_id'].map(customer_id_to_idx).tolist()

In [22]:
dev_actuals = dev_set_by_customer['article_id_idx'].to_list()

### Define loss function

In [23]:
labels = torch.tensor([1, 1, 0, 0])
preds = torch.tensor([0.9, 0.2, 0.3, 0.4])

In [24]:
obs_mask = torch.where(labels == 1)
obs_mask

(tensor([0, 1]),)

In [25]:
unobs_mask = torch.where(labels == 0)
unobs_mask

(tensor([2, 3]),)

In [26]:
w_0 = 2

In [27]:
obs_loss = ((labels[obs_mask] - preds[obs_mask])**2).sum()
obs_loss

tensor(0.6500)

In [28]:
unobs_loss = ((labels[unobs_mask] - preds[unobs_mask])**2).sum()
unobs_loss

tensor(0.2500)

In [29]:
total_loss = obs_loss + w_0 * unobs_loss

In [30]:
loss = total_loss / len(labels)
loss

tensor(0.2875)

In [31]:
def weighted_matrix_factorisation_loss(preds, labels, w_0=1):
    obs_mask = torch.where(labels == 1)

    unobs_mask = torch.where(labels == 0)

    obs_loss = ((labels[obs_mask] - preds[obs_mask])**2).sum()

    unobs_loss = (preds[unobs_mask]**2).sum()

    total_loss = obs_loss + w_0 * unobs_loss

    loss = total_loss / len(labels)
    
    return loss

In [32]:
weighted_matrix_factorisation_loss(preds, labels, w_0=2)

tensor(0.2875)

### Format data

Since all the users in the test said are included in training we utilise the full training set.

In [33]:
train_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,0,40179
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,0,10520
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1,1,36302
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1,1,8963
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2,2,41390


- Keep distinct purchases only
- Discard date, price and channel columns

In [34]:
endm1

Timestamp('2020-09-07 00:00:00')

In [36]:
train_set = train_set.copy()[train_set['t_dat'] > endm1 - datetime.timedelta(days=7*52)]

In [64]:
train_set_processed = train_set.copy()[['customer_id_idx', 'article_id_idx']].drop_duplicates(subset=['customer_id_idx', 'article_id_idx'])
train_set_processed.shape

(257390, 2)

In [65]:
train_set_processed.head()

Unnamed: 0,customer_id_idx,article_id_idx
277801,6410,70955
277802,3335,7280
277803,3335,65588
277804,3335,65587
277805,4475,65002


For each customer sample up to X non-purchased items to use as negative samples

In [66]:
purchases_by_customer_id_idx = train_set_processed.groupby('customer_id_idx')['article_id_idx'].apply(lambda x: list(x)).reset_index()
purchases_by_customer_id_idx.head()

Unnamed: 0,customer_id_idx,article_id_idx
0,0,"[59458, 1469, 60253, 60259, 93585, 91841, 6727..."
1,1,"[80078, 95087, 86044, 62125, 74638, 77037, 323..."
2,2,"[77551, 61625, 53904, 60070, 77253, 83853, 785..."
3,3,"[43702, 3709, 2746, 74781, 64711, 55866, 73808..."
4,4,"[81829, 81828, 73676, 64855, 86490, 70787, 74,..."


In [67]:
def negative_samples(excl):
    samples = set(np.random.choice(len(article_id_to_idx), replace=False, size=500))
    neg_samples = samples - set(excl)
    return list(neg_samples)

In [144]:
# negative_samples(purchases_by_customer_id_idx.iloc[0]['article_id_idx'])

In [69]:
%%time
purchases_by_customer_id_idx['negative_samples'] = purchases_by_customer_id_idx['article_id_idx'].apply(negative_samples)

CPU times: user 20.6 s, sys: 723 ms, total: 21.3 s
Wall time: 21.1 s


In [70]:
purchases_by_customer_id_idx.head()

Unnamed: 0,customer_id_idx,article_id_idx,negative_samples
0,0,"[59458, 1469, 60253, 60259, 93585, 91841, 6727...","[21504, 73731, 12292, 52227, 6153, 62473, 4199..."
1,1,"[80078, 95087, 86044, 62125, 74638, 77037, 323...","[20481, 68611, 28678, 83975, 20489, 78858, 583..."
2,2,"[77551, 61625, 53904, 60070, 77253, 83853, 785...","[93184, 9222, 21513, 8202, 90129, 21523, 1022,..."
3,3,"[43702, 3709, 2746, 74781, 64711, 55866, 73808...","[73728, 68610, 59395, 80903, 29709, 14, 7182, ..."
4,4,"[81829, 81828, 73676, 64855, 86490, 70787, 74,...","[47105, 26627, 47108, 98309, 82948, 91147, 522..."


In [71]:
purchases_by_customer_id_idx['negative_samples'].apply(len).value_counts()

500    7776
499     901
498     126
497      18
496       4
Name: negative_samples, dtype: int64

In [72]:
negative_samples_df = purchases_by_customer_id_idx[['customer_id_idx', 'negative_samples']].explode('negative_samples')
negative_samples_df.head()

Unnamed: 0,customer_id_idx,negative_samples
0,0,21504
0,0,73731
0,0,12292
0,0,52227
0,0,6153


In [73]:
negative_samples_df.isnull().sum()

customer_id_idx     0
negative_samples    0
dtype: int64

In [74]:
negative_samples_df.dropna(subset=['negative_samples'], inplace=True)

In [75]:
negative_samples_df.rename(columns={'negative_samples': 'article_id_idx'}, inplace=True)
negative_samples_df['purchased'] = 0

In [76]:
negative_samples_df.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
0,0,21504,0
0,0,73731,0
0,0,12292,0
0,0,52227,0
0,0,6153,0


In [77]:
positive_samples_df = train_set_processed.copy()[['customer_id_idx', 'article_id_idx']]
positive_samples_df['purchased'] = 1

positive_samples_df.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
277801,6410,70955,1
277802,3335,7280,1
277803,3335,65588,1
277804,3335,65587,1
277805,4475,65002,1


In [78]:
training_data_full = positive_samples_df.append(negative_samples_df)
training_data_full.shape

(4668667, 3)

In [79]:
training_data_full['purchased'].mean()

0.05513136833275965

Can shuffle in `DataLoader`

In [80]:
training_data_full.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
277801,6410,70955,1
277802,3335,7280,1
277803,3335,65588,1
277804,3335,65587,1
277805,4475,65002,1


In [81]:
class PurchasesDataset(Dataset):

    def __init__(self):
        self.customer_id_idx = training_data_full['customer_id_idx'].values
        self.article_id_idx = training_data_full['article_id_idx'].values
        self.purchased = training_data_full['purchased'].values

    def __len__(self):
        return self.customer_id_idx.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        return self.customer_id_idx[idx], self.article_id_idx[idx], self.purchased[idx]

In [82]:
class MatrixFactorisation(nn.Module):

    def __init__(self):
        super(MatrixFactorisation, self).__init__()
        
        self.user_embeddings = nn.Embedding(num_embeddings=len(customer_id_to_idx), embedding_dim=100)
        
        self.item_embeddings = nn.Embedding(num_embeddings=len(article_id_to_idx), embedding_dim=100)

        torch.nn.init.xavier_uniform_(self.user_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.item_embeddings.weight)

    def forward(self, user_id, item_id):
        
        user_emb = self.user_embeddings(user_id)
        item_emb = self.item_embeddings(item_id)
        
        scores = torch.diagonal(
            torch.matmul(user_emb, item_emb.T)
        )

        scores = nn.Sigmoid()(scores)
        
        return scores

In [83]:
matrix_fact_model = MatrixFactorisation()

In [84]:
matrix_fact_model.to(device)

MatrixFactorisation(
  (user_embeddings): Embedding(9208, 100)
  (item_embeddings): Embedding(105542, 100)
)

In [85]:
train_dataset = PurchasesDataset()

In [86]:
BATCH_SIZE = 2048

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)  

In [87]:
# # Selecting a subset of data
# subset = torch.utils.data.Subset(train_dataset, [1])

# train_loader = DataLoader(subset, batch_size=BATCH_SIZE)  

In [88]:
optimizer = torch.optim.Adam(params=matrix_fact_model.parameters(), lr=0.1)

Switched to BCELoss instead of weighted MSE. Had more success with this.

In [89]:
criterion = nn.BCELoss()

In [90]:
MAX_EPOCHS = 10
W_0 = 1
training_losses = []
dev_maps = []

for epoch in range(MAX_EPOCHS):
    
    for data in tqdm(train_loader):
        
        user_id, item_id, label = data
        
        user_id, item_id, label = user_id.to(device), item_id.to(device), label.to(device)

        optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

        pred = matrix_fact_model(user_id, item_id)
        
        # loss = weighted_matrix_factorisation_loss(pred, label, w_0=W_0)

        loss = criterion(pred, label.float())

        loss.backward()  
        
        optimizer.step()
        
    # Compute training loss
    
    total_train_loss = 0

    matrix_fact_model.eval()

    with torch.no_grad():
        for data in DataLoader(train_dataset, batch_size=BATCH_SIZE):
            
            user_id, item_id, label = data
        
            user_id, item_id, label = user_id.to(device), item_id.to(device), label.to(device)

            optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

            pred = matrix_fact_model(user_id, item_id)

            # loss = weighted_matrix_factorisation_loss(pred, label, w_0=W_0)

            loss = criterion(pred, label.float())

            total_train_loss += loss.item()

        mean_train_loss = total_train_loss / len(train_dataset)

        print(f"Training loss: {mean_train_loss}")
        training_losses.append(mean_train_loss)
    
    # Compute dev MAP@12

    pred = torch.matmul(matrix_fact_model.user_embeddings.weight, matrix_fact_model.item_embeddings.weight.T)

    predictions = torch.topk(pred, 12, dim=1).indices.tolist()

    dev_map12 = mapk(dev_actuals, predictions, k=12)

    print(f"MAP@12: {dev_map12}")
    dev_maps.append(dev_map12)

    matrix_fact_model.train()

100%|██████████| 2280/2280 [00:30<00:00, 75.86it/s]


Training loss: 0.0035067099433207477
MAP@12: 4.422821760283061e-05


100%|██████████| 2280/2280 [00:28<00:00, 79.75it/s] 


Training loss: 0.0034387138043914735
MAP@12: 0.0001614125182236637


100%|██████████| 2280/2280 [00:27<00:00, 82.71it/s] 


Training loss: 0.003363332001787468
MAP@12: 0.0003719470244238047


100%|██████████| 2280/2280 [00:27<00:00, 82.70it/s] 


Training loss: 0.0033105195088619197
MAP@12: 0.00011857480897728575


100%|██████████| 2280/2280 [00:27<00:00, 81.95it/s] 


Training loss: 0.003272328172785798
MAP@12: 1.9110958223445322e-05


100%|██████████| 2280/2280 [00:28<00:00, 79.37it/s] 


Training loss: 0.003249286345825887
MAP@12: 0.00011132842291621592


100%|██████████| 2280/2280 [00:28<00:00, 78.99it/s] 


Training loss: 0.003231796609728308
MAP@12: 0.0003411277603959063


100%|██████████| 2280/2280 [00:27<00:00, 82.72it/s] 


Training loss: 0.0032112713238001012
MAP@12: 0.00026388014004113074


100%|██████████| 2280/2280 [00:27<00:00, 82.61it/s]


Training loss: 0.0031850224969123785
MAP@12: 0.0002335802742149491


100%|██████████| 2280/2280 [00:28<00:00, 79.78it/s] 


Training loss: 0.003175771425521209
MAP@12: 0.00027893467328451847


### For a given item look at most similar items

In [133]:
matrix_fact_model.user_embeddings.weight

Parameter containing:
tensor([[ 7.8545e+00, -1.5250e+00,  7.7149e+00,  ...,  1.7752e+01,
          3.5420e+00,  6.0339e+00],
        [ 2.3610e+00,  4.7162e+00,  6.7235e+00,  ...,  4.9019e+00,
         -4.1044e+00,  3.1383e+00],
        [ 1.2069e+00, -7.3722e+00,  2.9536e+00,  ...,  9.5571e-01,
         -6.8044e-01,  1.2618e+00],
        ...,
        [ 1.9128e-02, -2.0981e-02, -1.4380e-02,  ..., -1.6451e-02,
         -1.8899e-02, -9.9659e-03],
        [-1.6112e-03, -9.4653e-03, -1.7701e-02,  ..., -1.4851e-02,
          1.7871e-02,  2.3728e-02],
        [-2.0579e-02,  1.8098e-02, -1.2059e-02,  ...,  8.2411e-03,
          6.3704e-03, -8.0350e-03]], device='cuda:0', requires_grad=True)

In [91]:
articles_df.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."


In [92]:
articles_df['article_id_idx'] = articles_df['article_id'].map(article_id_to_idx)

In [93]:
articles_df.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.,0
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.,1
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.,2
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde...",3
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde...",4


In [105]:
np.random.seed(3)
sample_article_id_idx = articles_df.sample(n=1)['article_id_idx'].item()
sample_article_id_idx

75601

In [106]:
from sklearn.metrics.pairwise import cosine_similarity

In [107]:
%%time
sim = cosine_similarity(
    matrix_fact_model.item_embeddings.weight.cpu().detach().numpy()[sample_article_id_idx].reshape(1, -1), 
    matrix_fact_model.item_embeddings.weight.cpu().detach().numpy()
)

CPU times: user 69 ms, sys: 20.2 ms, total: 89.2 ms
Wall time: 73.5 ms


In [108]:
k = 10

closest_k = np.flip(np.argsort(sim)).flatten()[:k]
closest_k

array([75601, 52092, 64906, 94384, 53663, 69706, 42360, 35740, 55846,
       21771])

In [109]:
articles_df.iloc[closest_k]

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
75601,782138001,Lea PQ highshaft,Boots,Shoes,"Suede boots with gently rounded toes, soft, su...",75601
52092,700586001,SKINNY EMILY,Trousers,Garment Lower body,"Slim-fit jeans in washed, superstretch, flexib...",52092
64906,743218008,Bracelet Lincoln Italy,Bracelet,Accessories,Multi-strand bracelet in imitation leather wit...,64906
94384,862219003,Parrot cropped trousers,Trousers,Garment Lower body,Ankle-length suit trousers in woven fabric wit...,94384
53663,705382001,Speed Caramella earring,Earring,Accessories,Long metal earrings with plastic pendants in v...,53663
69706,757915001,Swish Super Push,Bikini top,Swimwear,"Bikini top with underwired, gathered, thickly ...",69706
42360,672460006,MARS Treggings,Trousers,Garment Lower body,Treggings in stretch cotton twill with an elas...,42360
35740,648254001,Bobby,Skirt,Garment Lower body,Calf-length skirt in a patterned crêpe weave w...,35740
55846,711846002,Noa mesh bag,Bag,Accessories,Bag in cotton mesh with round plastic handles ...,55846
21771,595318002,Flirty pompom terry pk,Hair string,Accessories,Hair elastics decorated with pompoms.,21771


### Look at recommendations for a given user

In [119]:
sample_customer = train_set_processed.sample(n=1)['customer_id_idx'].item()
sample_customer

7266

In [120]:
pred = torch.matmul(matrix_fact_model.user_embeddings.weight[sample_customer, :], matrix_fact_model.item_embeddings.weight.T)

In [121]:
closest_k = torch.topk(pred, k=10).indices
closest_k

tensor([ 85106, 101366, 101220,  77601,  25806,   2236,  88346,  74290,  89628,
         85223], device='cuda:0')

In [124]:
# Previous purchases
train_set_processed[train_set_processed['customer_id_idx'] == sample_customer].merge(
    articles_df,
    on='article_id_idx'
)

Unnamed: 0,customer_id_idx,article_id_idx,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,7266,77253,788575001,Maja cargo Slim HW Denim,Trousers,Garment Lower body,"Jeans in washed, stretch denim with a high wai..."
1,7266,81405,805525001,Nottingham trucker,Jacket,Garment Upper body,"Short, boxy jacket in cotton twill with a coll..."
2,7266,73048,770315005,Alpha essential top,Top,Garment Upper body,"Top in airy jersey crêpe with a V-neck, short ..."
3,7266,40321,664074077,Charlie Top,Top,Garment Upper body,Straight-cut top in airy jersey crêpe with a b...
4,7266,85106,818024001,Toulon jumper,Sweater,Garment Upper body,"Wide top in a stretchy, fine knit with a light..."
5,7266,87171,826492006,Dame,Sweater,Garment Upper body,"Boxy-style jumper in a soft, fine knit contain..."
6,7266,17129,572998005,Beverly HW Loose Mom Fit Dnm,Trousers,Garment Lower body,"5-pocket, ankle-length jeans in washed denim w..."
7,7266,87358,827411001,Chubba Chubb Highwaist brazili,Swimwear bottom,Swimwear,"Fully lined, waist-high bikini bottoms with ga..."
8,7266,45795,684209013,Simple as That Triangle Top,Bikini top,Swimwear,"Lined, non-wired, triangle bikini top with a w..."
9,7266,45802,684209027,Simple as That Triangle Top,Bikini top,Swimwear,"Lined, non-wired, triangle bikini top with a w..."


In [132]:
articles_df.iloc[closest_k.cpu().detach().numpy()]

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
85106,818024001,Toulon jumper,Sweater,Garment Upper body,"Wide top in a stretchy, fine knit with a light...",85106
101366,896152001,Amelie,T-shirt,Garment Upper body,"Top in a soft, fine knit containing some wool ...",101366
101220,895002002,Steam smock top,Top,Garment Upper body,Short top in crinkled jersey with a smocked bo...,101220
77601,790368001,Pantha PU leggings,Trousers,Garment Lower body,Leggings in stretch fabric. High waist with co...,77601
25806,614854013,Fiona brazilian (Acacia) 4p,Underwear bottom,Underwear,Brazilian briefs in soft jersey with lace trim...,25806
2236,399223001,Curvy Jeggings HW Ankle,Trousers,Garment Lower body,"Jeggings in washed, superstretch denim with a ...",2236
88346,832473005,Asa smock top,Top,Garment Upper body,Tube top in smocked cotton jersey with a frill...,88346
74290,776237011,Shake it in Balconette,Bikini top,Swimwear,"Lined balconette bikini top with underwired, p...",74290
89628,839227001,Big satin scrunchie,Hair string,Accessories,Elasticated scrunchie covered in satin with a ...,89628
85223,818614007,Samantha seamless bandeau,Bra,Underwear,Strapless bandeau bra in jersey with padded cu...,85223


### Test set evaluation

In [134]:
pred = torch.matmul(matrix_fact_model.user_embeddings.weight, matrix_fact_model.item_embeddings.weight.T)

In [135]:
pred.shape

torch.Size([9208, 105542])

In [136]:
pred

tensor([[-7.3850e+02, -5.8736e+02, -3.0175e+02,  ..., -3.2758e+02,
         -6.7557e+02, -2.4729e+02],
        [-7.9312e+02, -7.1850e+02, -2.0791e+02,  ..., -7.2305e+02,
         -9.8646e+02, -4.3774e+02],
        [-6.6568e+02, -6.7205e+02, -6.9188e+02,  ..., -7.9726e+02,
         -1.0750e+03, -3.5086e+02],
        ...,
        [-5.5384e-01,  6.9027e-01, -1.2639e-01,  ..., -2.6822e-01,
         -5.3654e-01, -7.8011e-02],
        [ 5.4282e-01, -2.5723e-01, -1.4271e-01,  ..., -1.9621e-02,
         -3.0582e-02, -8.6693e-02],
        [ 3.7253e-01,  1.0631e+00,  4.7211e-01,  ...,  3.7026e-01,
          3.5100e-01,  1.9151e-02]], device='cuda:0', grad_fn=<MmBackward0>)

In [137]:
%%time
predictions = torch.topk(pred, 12, dim=1).indices.tolist()

CPU times: user 147 ms, sys: 989 µs, total: 148 ms
Wall time: 148 ms


In [138]:
test_set['article_id_idx'] = test_set['article_id'].map(article_id_to_idx)

In [139]:
test_set_by_customer = test_set.groupby('customer_id').apply(lambda x: list(x['article_id_idx'])).reset_index().rename(columns={0: 'article_id_idx'})
test_set_by_customer.head()

Unnamed: 0,customer_id,article_id_idx
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[78503]
1,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,"[74893, 74893, 24872, 104987, 9801, 9801, 6912..."
2,00282135561702f5b3b750fa3382d8fd83ce5d761a507e...,"[92815, 86041]"
3,00356a94bb9bed341f6dba58ad722974b01a1cbd9f06ef...,"[104211, 17044, 17044, 16024, 29250, 98445, 66..."
4,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,"[87467, 99966, 42626, 79834, 97540, 103280, 79..."


In [140]:
customers_ordered = test_set_by_customer['customer_id'].map(customer_id_to_idx).tolist()

In [141]:
actuals = test_set_by_customer['article_id_idx'].to_list()

In [143]:
mapk(actuals, predictions, k=12)

0.0005853456357773256

CF doesn't do better than simple heuristic (top 12 most popular). Possibly explains why earlier attempts at training embeddings failed.