Simple model using approach described here: https://developers.google.com/machine-learning/recommendation/collaborative/matrix

In [1]:
import os
import datetime

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
os.chdir('..')

In [3]:
from fashion_recommendations.metrics.average_precision import mapk

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [5]:
train_set_orig = pd.read_csv('data/splits/train_subset.tsv', sep='\t', dtype={'article_id': str})
print(train_set_orig.shape)
train_set_orig.head()

(588758, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2


In [6]:
test_set = pd.read_csv('data/splits/test_subset.tsv', sep='\t', dtype={'article_id': str})
print(test_set.shape)
test_set.head()

(32995, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-15,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,0.061,2
1,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
2,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
3,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,610776105,0.008458,2
4,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,929745001,0.050831,2


### Map `customer_id` and `article_id` to indices

Test set doesn't include anyone not in training set

In [7]:
unique_customers_df = train_set_orig[['customer_id']].drop_duplicates().reset_index(drop=True)

customer_id_to_idx = dict(
    zip(
        unique_customers_df['customer_id'],
        unique_customers_df.index
    )
)

len(customer_id_to_idx)

9208

In [8]:
articles_df = pd.read_csv(
    'data/articles.csv', 
    dtype={'article_id': str}, 
    usecols=['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'detail_desc']
)
print(articles_df.shape)
articles_df.head()

(105542, 5)


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."


In [9]:
article_id_to_idx = dict(
    zip(
        articles_df['article_id'],
        articles_df.index
    )
)

In [10]:
train_set_orig['customer_id_idx'] = train_set_orig['customer_id'].map(customer_id_to_idx)
train_set_orig['article_id_idx'] = train_set_orig['article_id'].map(article_id_to_idx)

train_set_orig.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,0,40179
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,0,10520
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1,1,36302
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1,1,8963
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2,2,41390


### Create development set

In [11]:
train_set_orig['t_dat'] = pd.to_datetime(train_set_orig['t_dat'])

In [12]:
end = train_set_orig['t_dat'].max()
endm1 = end - datetime.timedelta(days=7)

endm1, end

(Timestamp('2020-09-07 00:00:00'), Timestamp('2020-09-14 00:00:00'))

In [13]:
train_set_orig[train_set_orig['t_dat'] < endm1]['customer_id'].nunique()

9148

In [14]:
train_set_orig[train_set_orig['t_dat'] >= endm1]['customer_id'].nunique()

1998

In [15]:
# Remove customers from dev set who are not in the new training set
customers_to_remove = (
    set(train_set_orig[train_set_orig['t_dat'] >= endm1]['customer_id']) - 
    set(train_set_orig[train_set_orig['t_dat'] < endm1]['customer_id'])
)

In [16]:
len(customers_to_remove)

60

In [17]:
train_set = train_set_orig.copy()[train_set_orig['t_dat'] < endm1]

dev_set = train_set_orig.copy()[train_set_orig['t_dat'] >= endm1]

train_set.shape, dev_set.shape

((580395, 7), (8363, 7))

In [18]:
dev_set = dev_set.copy()[~dev_set['customer_id'].isin(customers_to_remove)]
print(dev_set.shape)
dev_set.head()

(8131, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
580395,2020-09-07,00639e775b90554156986100685c4d408723c77e411e37...,891591007,0.084729,2,732,100648
580396,2020-09-07,00639e775b90554156986100685c4d408723c77e411e37...,891591001,0.084729,2,732,100645
580397,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,909924004,0.033881,2,1057,103196
580398,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,921266005,0.016932,2,1057,104310
580399,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,857163001,0.013542,2,1057,93169


In [19]:
dev_set_by_customer = dev_set.groupby('customer_id').apply(lambda x: list(x['article_id_idx'])).reset_index().rename(columns={0: 'article_id_idx'})
dev_set_by_customer.head()

Unnamed: 0,customer_id,article_id_idx
0,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,[102443]
1,0054c50274d19af58d53ef3ce0c004bea446c80bd51cf2...,"[72000, 16023, 71167]"
2,00639e775b90554156986100685c4d408723c77e411e37...,"[100648, 100645, 100645]"
3,00798bd464457d23d6af401715fe32d5c676ad9ee4010d...,[103885]
4,0099238196d8f71659fceaa115b36e400398bcfc169b5f...,"[95789, 95789, 3091, 99184, 99184, 93416, 9341..."


In [20]:
customer_id_idx_ordered = dev_set_by_customer['customer_id'].map(customer_id_to_idx).tolist()

In [21]:
dev_actuals = dev_set_by_customer['article_id_idx'].to_list()

### Define loss function

In [22]:
labels = torch.tensor([1, 1, 0, 0])
preds = torch.tensor([0.9, 0.2, 0.3, 0.4])

In [23]:
obs_mask = torch.where(labels == 1)
obs_mask

(tensor([0, 1]),)

In [24]:
unobs_mask = torch.where(labels == 0)
unobs_mask

(tensor([2, 3]),)

In [25]:
w_0 = 2

In [26]:
obs_loss = ((labels[obs_mask] - preds[obs_mask])**2).sum()
obs_loss

tensor(0.6500)

In [27]:
unobs_loss = ((labels[unobs_mask] - preds[unobs_mask])**2).sum()
unobs_loss

tensor(0.2500)

In [28]:
total_loss = obs_loss + w_0 * unobs_loss

In [29]:
loss = total_loss / len(labels)
loss

tensor(0.2875)

In [30]:
def weighted_matrix_factorisation_loss(preds, labels, w_0=1):
    obs_mask = torch.where(labels == 1)

    unobs_mask = torch.where(labels == 0)

    obs_loss = ((labels[obs_mask] - preds[obs_mask])**2).sum()

    unobs_loss = (preds[unobs_mask]**2).sum()

    total_loss = obs_loss + w_0 * unobs_loss

    loss = total_loss / len(labels)
    
    return loss

In [31]:
weighted_matrix_factorisation_loss(preds, labels, w_0=2)

tensor(0.2875)

### Format data

Since all the users in the test said are included in training we utilise the full training set.

In [32]:
train_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,0,40179
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,0,10520
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1,1,36302
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1,1,8963
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2,2,41390


- Keep distinct purchases only
- Discard date, price and channel columns

In [33]:
train_set_processed = train_set.copy()[['customer_id_idx', 'article_id_idx']].drop_duplicates(subset=['customer_id_idx', 'article_id_idx'])
train_set_processed.shape

(490712, 2)

In [34]:
train_set_processed.head()

Unnamed: 0,customer_id_idx,article_id_idx
0,0,40179
1,0,10520
2,1,36302
3,1,8963
4,2,41390


For each customer sample up to X non-purchased items to use as negative samples

In [35]:
purchases_by_customer_id_idx = train_set_processed.groupby('customer_id_idx')['article_id_idx'].apply(lambda x: list(x)).reset_index()
purchases_by_customer_id_idx.head()

Unnamed: 0,customer_id_idx,article_id_idx
0,0,"[40179, 10520, 18197, 59458, 1469, 60253, 6025..."
1,1,"[36302, 8963, 33359, 40263, 41176, 30842, 1581..."
2,2,"[41390, 46302, 46085, 38438, 37854, 44072, 466..."
3,3,"[31688, 12233, 14832, 31682, 31690, 17615, 204..."
4,4,"[15136, 22636, 33743, 50091, 74, 42605, 7230, ..."


In [36]:
def negative_samples(excl):
    samples = set(np.random.choice(len(article_id_to_idx), replace=False, size=10))
    neg_samples = samples - set(excl)
    return list(neg_samples)

In [37]:
negative_samples(purchases_by_customer_id_idx.iloc[0]['article_id_idx'])

[38018, 868, 83620, 94150, 54984, 80712, 47146, 20530, 83321, 20826]

In [38]:
%%time
purchases_by_customer_id_idx['negative_samples'] = purchases_by_customer_id_idx['article_id_idx'].apply(negative_samples)

CPU times: user 13.9 s, sys: 27.5 ms, total: 13.9 s
Wall time: 14 s


In [39]:
purchases_by_customer_id_idx.head()

Unnamed: 0,customer_id_idx,article_id_idx,negative_samples
0,0,"[40179, 10520, 18197, 59458, 1469, 60253, 6025...","[103427, 80422, 103207, 51402, 9707, 4330, 246..."
1,1,"[36302, 8963, 33359, 40263, 41176, 30842, 1581...","[79872, 64168, 45165, 64718, 50990, 20319, 103..."
2,2,"[41390, 46302, 46085, 38438, 37854, 44072, 466...","[79648, 12481, 24676, 81029, 59366, 103564, 87..."
3,3,"[31688, 12233, 14832, 31682, 31690, 17615, 204...","[39049, 13194, 64489, 49037, 61779, 82452, 536..."
4,4,"[15136, 22636, 33743, 50091, 74, 42605, 7230, ...","[64993, 69218, 259, 26151, 67817, 59309, 55533..."


In [40]:
purchases_by_customer_id_idx['negative_samples'].apply(len).value_counts()

10    9101
9       47
Name: negative_samples, dtype: int64

In [41]:
negative_samples_df = purchases_by_customer_id_idx[['customer_id_idx', 'negative_samples']].explode('negative_samples')
negative_samples_df.head()

Unnamed: 0,customer_id_idx,negative_samples
0,0,103427
0,0,80422
0,0,103207
0,0,51402
0,0,9707


In [42]:
negative_samples_df.rename(columns={'negative_samples': 'article_id_idx'}, inplace=True)
negative_samples_df['purchased'] = 0

In [43]:
negative_samples_df.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
0,0,103427,0
0,0,80422,0
0,0,103207,0
0,0,51402,0
0,0,9707,0


In [44]:
positive_samples_df = train_set_processed.copy()[['customer_id_idx', 'article_id_idx']]
positive_samples_df['purchased'] = 1

positive_samples_df.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
0,0,40179,1
1,0,10520,1
2,1,36302,1
3,1,8963,1
4,2,41390,1


In [45]:
training_data_full = positive_samples_df.append(negative_samples_df)
training_data_full.shape

  training_data_full = positive_samples_df.append(negative_samples_df)


(582145, 3)

In [46]:
training_data_full['purchased'].mean()

0.8429377560573397

Can shuffle in `DataLoader`

In [47]:
training_data_full.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
0,0,40179,1
1,0,10520,1
2,1,36302,1
3,1,8963,1
4,2,41390,1


In [48]:
class PurchasesDataset(Dataset):

    def __init__(self):
        self.customer_id_idx = training_data_full['customer_id_idx'].values
        self.article_id_idx = training_data_full['article_id_idx'].values
        self.purchased = training_data_full['purchased'].values

    def __len__(self):
        return self.customer_id_idx.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        return self.customer_id_idx[idx], self.article_id_idx[idx], self.purchased[idx]

In [49]:
class MatrixFactorisation(nn.Module):

    def __init__(self):
        super(MatrixFactorisation, self).__init__()
        
        self.user_embeddings = nn.Embedding(num_embeddings=len(customer_id_to_idx), embedding_dim=100)
        
        self.item_embeddings = nn.Embedding(num_embeddings=len(article_id_to_idx), embedding_dim=100)

    def forward(self, user_id, item_id):
        
        user_emb = self.user_embeddings(user_id)
        item_emb = self.item_embeddings(item_id)
        
        scores = torch.diagonal(
            torch.matmul(user_emb, item_emb.T)
        )
        
        return scores

In [66]:
matrix_fact_model = MatrixFactorisation()

In [67]:
train_dataset = PurchasesDataset()

In [68]:
BATCH_SIZE = 2048

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)  

In [69]:
# # Selecting a subset of data
# subset = torch.utils.data.Subset(train_dataset, [1])

# train_loader = DataLoader(subset, batch_size=BATCH_SIZE)  

In [70]:
optimizer = torch.optim.Adam(params=matrix_fact_model.parameters(), lr=0.001)

In [71]:
MAX_EPOCHS = 10
training_losses = []
dev_maps = []

for epoch in range(MAX_EPOCHS):
    
    for data in tqdm(train_loader):
        
        user_id, item_id, label = data
        
        user_id, item_id, label = user_id.to(device), item_id.to(device), label.to(device)

        optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

        pred = matrix_fact_model(user_id, item_id)
        
        loss = weighted_matrix_factorisation_loss(pred, label, w_0=1)

        loss.backward()  
        
        optimizer.step()
        
    # Compute training loss
    
    total_train_loss = 0

    matrix_fact_model.eval()

    with torch.no_grad():
        for data in DataLoader(train_dataset, batch_size=BATCH_SIZE):
            
            user_id, item_id, label = data
        
            user_id, item_id, label = user_id.to(device), item_id.to(device), label.to(device)

            optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

            pred = matrix_fact_model(user_id, item_id)

            loss = weighted_matrix_factorisation_loss(pred, label, w_0=100)

            total_train_loss += loss.item()

        mean_train_loss = total_train_loss / len(train_dataset)

        print(f"Training loss: {mean_train_loss}")
        training_losses.append(mean_train_loss)
    
    matrix_fact_model.train()
    
# Compute dev MAP@12

pred = torch.matmul(matrix_fact_model.user_embeddings.weight, matrix_fact_model.item_embeddings.weight.T)

recommendations = torch.argsort(input=pred, dim=1, descending=True)

predictions = recommendations[customer_id_idx_ordered, :12].tolist()

dev_map12 = mapk(dev_actuals, predictions, k=12)

print(f"MAP@12: {dev_map12}")
dev_maps.append(dev_map12)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:28<00:00, 10.06it/s]


Training loss: 0.6864453676328911


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:32<00:00,  8.65it/s]


Training loss: 0.5573882387203504


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:33<00:00,  8.49it/s]


Training loss: 0.4500347234203708


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:35<00:00,  8.03it/s]


Training loss: 0.3619071859273522


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:33<00:00,  8.44it/s]


Training loss: 0.28996176327847245


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:31<00:00,  9.18it/s]


Training loss: 0.23141957549974126


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:31<00:00,  8.95it/s]


Training loss: 0.18402715968572114


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:26<00:00, 10.90it/s]


Training loss: 0.14554411742297516


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:25<00:00, 11.09it/s]


Training loss: 0.1146071857538595


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 285/285 [00:25<00:00, 11.08it/s]


Training loss: 0.08977562148054835
MAP@12: 0.0


In [45]:
matrix_fact_model.user_embeddings.weight

Parameter containing:
tensor([[ 0.4442, -0.8025, -0.1586,  ..., -0.1448,  0.5957,  0.4347],
        [-0.3507, -0.2127,  0.3654,  ...,  0.0209,  0.1289, -0.5420],
        [ 0.2725,  0.2459,  0.0296,  ...,  0.7285, -0.0817,  0.1481],
        ...,
        [-0.3547, -0.1531,  0.2274,  ...,  0.0315, -0.1099,  0.4630],
        [-0.1552, -0.2548,  0.4993,  ..., -0.1012,  0.1220, -0.2450],
        [ 0.0562, -0.0025, -0.0429,  ..., -0.0906, -0.0200,  0.1363]],
       requires_grad=True)

In [47]:
articles_df.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."


In [48]:
articles_df['article_id_idx'] = articles_df['article_id'].map(article_id_to_idx)

In [49]:
articles_df.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.,0
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.,1
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.,2
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde...",3
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde...",4


In [50]:
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
%%time
sim = cosine_similarity(
    matrix_fact_model.item_embeddings.weight.detach().numpy()[3].reshape(1, -1), 
    matrix_fact_model.item_embeddings.weight.detach().numpy()
)

CPU times: user 8.65 ms, sys: 3.24 ms, total: 11.9 ms
Wall time: 8.4 ms


In [52]:
k = 10

closest_k = np.flip(np.argsort(sim)).flatten()[:k]
closest_k

array([     3,  82703, 101990,  25474,  63585,  93517,  85960,  15959,
        45779,  44749])

In [53]:
articles_df[
    articles_df['article_id_idx'].isin(closest_k)
]

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde...",3
15959,568556001,LACE SKIRT,Skirt,Garment Lower body,Short imitation suede skirt in a double layer ...,15959
25474,613081001,Jacket Oversize 19.99,Jacket,Garment Upper body,Oversized denim jacket with a collar and smoot...,25474
44749,681120001,DUNGAREE L/L SLIM STEVIE,Dungarees,Garment Full body,Dungarees in washed stretch denim with a chest...,44749
45779,684186002,Nottie RW wide cropped,Trousers,Garment Lower body,Ankle-length twill trousers with a regular wai...,45779
63585,739590002,Timeles Cheeky Brief,Swimwear bottom,Swimwear,Fully lined bikini bottoms with a low waist an...,63585
82703,809570003,Regular Straight 89,Trousers,Garment Lower body,"5-pocket jeans in soft, stretch denim with wor...",82703
85960,821771001,Brenda DEAL set,Shorts,Garment Lower body,Set with a T-shirt in cotton jersey and pair o...,85960
93517,858487001,Manila denim shorts,Shorts,Garment Lower body,"Shorts in soft, stretch denim with an elastica...",93517
101990,901316001,Special Occ SCALA dress,Dress,Garment Full body,"Dress in crisp tulle with an embroidered yoke,...",101990


In [54]:
train_set_processed.groupby('customer_id_idx')['article_id'].count()

customer_id_idx
0        13
1        54
2       136
3        99
4        40
       ... 
9203      3
9204      1
9205      5
9206      6
9207     13
Name: article_id, Length: 9208, dtype: int64

In [55]:
train_set_processed[train_set_processed['customer_id_idx'] == 0]

Unnamed: 0,customer_id,article_id,article_id_idx,customer_id_idx
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,40179,0
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,10520,0
109623,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,578020002,18197,0
383502,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,723529001,59458,0
383503,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,351484002,1469,0
424938,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808001,60253,0
424939,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808007,60259,0
441803,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,858883002,93585,0
441804,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,851400006,91841,0
441805,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,750424014,67274,0


In [56]:
matrix_fact_model.item_embeddings.weight.shape

torch.Size([105542, 10])

In [104]:
matrix_fact_model.user_embeddings.weight.shape

torch.Size([9208, 10])

In [120]:
pred = torch.matmul(matrix_fact_model.user_embeddings.weight[3, :], matrix_fact_model.item_embeddings.weight.T).detach().numpy()

In [121]:
k = 10

closest_k = np.flip(np.argsort(pred))[:k]
closest_k

array([74339, 35787,  2635,   159,  5532, 93755,  3240, 40032,  2660,
       10673])

In [123]:
train_set_processed[train_set_processed['customer_id_idx'] == 3].merge(
    articles_df,
    on='article_id'
)

Unnamed: 0,customer_id,article_id,article_id_idx_x,customer_id_idx,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx_y
0,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0633130013,31688,3,Zebra sweater TOP PRODUCT,Sweater,Garment Upper body,"Top in lightweight, printed sweatshirt fabric ...",31688
1,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0552471001,12233,3,KELLY S.8,Shirt,Garment Upper body,Longer shirt in a cotton weave with a collar a...,12233
2,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0564358010,14832,3,PAUL R-NECK,Sweater,Garment Upper body,"Jumper in a soft, fine cotton knit with a roun...",14832
3,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0633130007,31682,3,Zebra sweater TOP PRODUCT,Sweater,Garment Upper body,"Top in lightweight, printed sweatshirt fabric ...",31682
4,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0633130015,31690,3,Zebra sweater TOP PRODUCT,Sweater,Garment Upper body,"Top in lightweight, printed sweatshirt fabric ...",31690
...,...,...,...,...,...,...,...,...,...
94,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0337991001,1300,3,Sigge Dressed Belt,Belt,Accessories,Leather belt with a metal buckle. Width 3 cm.,1300
95,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0616598002,26260,3,Slim Straight 5pkt Midprice,Trousers,Garment Lower body,5-pocket jeans in washed denim with a regular ...,26260
96,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0337991002,1301,3,Sigge Dressed Belt,Belt,Accessories,Leather belt with a metal buckle. Width 3 cm.,1301
97,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,0714026005,56486,3,Manson slim trs TVP J,Trousers,Garment Lower body,Suit trousers in woven fabric with a concealed...,56486


In [124]:
articles_df[articles_df['article_id_idx'].isin(closest_k)]

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
159,187949032,Padded pyjama,Pyjama jumpsuit/playsuit,Nightwear,Lightly padded all-in-one pyjamas in soft cott...,159
2635,418545020,ALGOT 2-p body,Bodysuit,Garment Upper body,Wrapover bodysuits in soft organic cotton jers...,2635
2660,421387032,Isak graphic tee,T-shirt,Garment Upper body,"Running top in fast-drying, breathable functio...",2660
3240,451380016,Bruce denim,Trousers,Garment Lower body,5-pocket jeans in washed stretch denim with a ...,3240
5532,497225022,SB Aston shorts,Swimwear bottom,Swimwear,"Swim shorts with an elasticated, drawstring wa...",5532
10673,542473002,Amelie,Shirt,Garment Upper body,"Shirt in woven fabric with a collar, buttons d...",10673
35787,648374001,Kendal fancy slipon co-lab SG,Other shoe,Shoes,Satin trainers with elastic gores in the sides...,35787
40032,663378005,Palmer chelsea PQ,Boots,Shoes,Chelsea boots with elastic gores in the sides ...,40032
74339,776532002,Bruno beanie,Beanie,Accessories,Hat knitted in a soft cotton blend with ear fl...,74339
93755,859476002,Wilma tank,Vest top,Garment Upper body,Sleeveless top in ribbed cotton jersey with a ...,93755


### Test set evaluation

In [46]:
pred = torch.matmul(matrix_fact_model.user_embeddings.weight, matrix_fact_model.item_embeddings.weight.T)

In [47]:
pred.shape

torch.Size([9208, 105542])

In [48]:
pred

tensor([[-1.0947, -3.2868, -4.0063,  ..., -1.0582, -3.7142, -3.5334],
        [ 2.8145, -1.1084, -2.8709,  ..., -1.4172, -0.1076,  1.4909],
        [-0.2448, -0.3599, -2.9249,  ...,  4.9391, -4.7780,  0.9843],
        ...,
        [ 3.9658, -4.4764,  1.6260,  ...,  3.0291,  0.7243,  0.3286],
        [ 1.2575,  1.1697,  0.1332,  ..., -3.8633,  0.0971,  0.4681],
        [ 0.2794, -0.3962,  0.8730,  ..., -0.2340,  0.4901, -1.9022]],
       grad_fn=<MmBackward0>)

In [49]:
%%time
recommendations = torch.argsort(input=pred, dim=1, descending=True)

CPU times: user 1min 37s, sys: 35.5 s, total: 2min 13s
Wall time: 2min 38s


In [50]:
test_set['article_id_idx'] = test_set['article_id'].map(article_id_to_idx)

In [51]:
test_set_by_customer = test_set.groupby('customer_id').apply(lambda x: list(x['article_id_idx'])).reset_index().rename(columns={0: 'article_id_idx'})
test_set_by_customer.head()

Unnamed: 0,customer_id,article_id_idx
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[78503]
1,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,"[74893, 74893, 24872, 104987, 9801, 9801, 6912..."
2,00282135561702f5b3b750fa3382d8fd83ce5d761a507e...,"[92815, 86041]"
3,00356a94bb9bed341f6dba58ad722974b01a1cbd9f06ef...,"[104211, 17044, 17044, 16024, 29250, 98445, 66..."
4,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,"[87467, 99966, 42626, 79834, 97540, 103280, 79..."


In [52]:
customers_ordered = test_set_by_customer['customer_id'].map(customer_id_to_idx).tolist()

In [53]:
predictions = recommendations[customers_ordered, :12].tolist()

In [54]:
actuals = test_set_by_customer['article_id_idx'].to_list()

In [55]:
from fashion_recommendations.metrics.average_precision import mapk

In [61]:
mapk(actuals, predictions, k=5)

3.258036490008689e-05