Simple model using approach described here: https://developers.google.com/machine-learning/recommendation/collaborative/matrix

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os
import datetime

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [3]:
# os.chdir('..')
os.chdir('drive/My Drive/Colab Notebooks/Github/fashion-recommendations') 

In [4]:
from fashion_recommendations.metrics.average_precision import mapk

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
train_set_orig = pd.read_csv('data/splits/train_subset.tsv', sep='\t', dtype={'article_id': str})
print(train_set_orig.shape)
train_set_orig.head()

(588758, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,650193004,0.050831,1
3,2018-09-20,016d8f0519d9e0572b3abebeab87408bad7a5c3a284016...,527687006,0.101678,1
4,2018-09-20,02bfe1a5248beb9cd28ad4ac630a6d75e78d9a3e14551a...,668767002,0.016932,2


In [7]:
train_set_orig['t_dat'] = pd.to_datetime(train_set_orig['t_dat'])

In [8]:
article_id_to_keep = train_set_orig[train_set_orig['t_dat'] > '2020-01-01']['article_id'].unique()

In [9]:
print(train_set_orig.shape)
train_set_orig = train_set_orig.copy()[train_set_orig['article_id'].isin(article_id_to_keep)]
print(train_set_orig.shape)

(588758, 5)
(395219, 5)


In [10]:
test_set = pd.read_csv('data/splits/test_subset.tsv', sep='\t', dtype={'article_id': str})
print(test_set.shape)
test_set.head()

(32995, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-15,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,0.061,2
1,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
2,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,778745010,0.033881,2
3,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,610776105,0.008458,2
4,2020-09-15,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,929745001,0.050831,2


### Map `customer_id` and `article_id` to indices

Test set doesn't include anyone not in training set

In [11]:
unique_customers_df = train_set_orig[['customer_id']].drop_duplicates().reset_index(drop=True)

customer_id_to_idx = dict(
    zip(
        unique_customers_df['customer_id'],
        unique_customers_df.index
    )
)

len(customer_id_to_idx)

9122

In [12]:
articles_df = pd.read_csv(
    'data/articles.csv', 
    dtype={'article_id': str}, 
    usecols=['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'detail_desc']
)
print(articles_df.shape)
articles_df.head()

(105542, 5)


Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."


In [13]:
articles_df = articles_df.copy()[articles_df['article_id'].isin(article_id_to_keep)]
articles_df.reset_index(drop=True, inplace=True)

In [14]:
article_id_to_idx = dict(
    zip(
        articles_df['article_id'],
        articles_df.index
    )
)

In [15]:
train_set_orig['customer_id_idx'] = train_set_orig['customer_id'].map(customer_id_to_idx)
train_set_orig['article_id_idx'] = train_set_orig['article_id'].map(article_id_to_idx)

train_set_orig.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
10,2018-09-20,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,564358010,0.033441,2,0,1810
17,2018-09-20,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,575347003,0.013373,2,0,2100
27,2018-09-20,03126e45c08e82cd192b32cbddbeba63387e11e5531059...,573652001,0.033881,2,1,2072
40,2018-09-20,08e5ec1f652c85fc59be204b60f262b6d1f45e4ce11872...,573937001,0.032186,2,2,2081
43,2018-09-20,09bca14258b7e9b9ee968adffe54d45c1e370e8ed79a22...,579541001,0.016932,1,3,2175


In [16]:
train_set_orig.isnull().sum()

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
customer_id_idx     0
article_id_idx      0
dtype: int64

### Create development set

In [17]:
train_set_orig['t_dat'] = pd.to_datetime(train_set_orig['t_dat'])

In [18]:
end = train_set_orig['t_dat'].max()
endm1 = end - datetime.timedelta(days=7)

endm1, end

(Timestamp('2020-09-07 00:00:00'), Timestamp('2020-09-14 00:00:00'))

In [19]:
train_set_orig[train_set_orig['t_dat'] < endm1]['customer_id'].nunique()

9052

In [20]:
train_set_orig[train_set_orig['t_dat'] >= endm1]['customer_id'].nunique()

1998

In [21]:
# Remove customers from dev set who are not in the new training set
customers_to_remove = (
    set(train_set_orig[train_set_orig['t_dat'] >= endm1]['customer_id']) - 
    set(train_set_orig[train_set_orig['t_dat'] < endm1]['customer_id'])
)

In [22]:
len(customers_to_remove)

70

In [23]:
train_set = train_set_orig.copy()[train_set_orig['t_dat'] < endm1]

dev_set = train_set_orig.copy()[train_set_orig['t_dat'] >= endm1]

train_set.shape, dev_set.shape

((386856, 7), (8363, 7))

In [24]:
dev_set = dev_set.copy()[~dev_set['customer_id'].isin(customers_to_remove)]
print(dev_set.shape)
dev_set.head()

(8099, 7)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
580395,2020-09-07,00639e775b90554156986100685c4d408723c77e411e37...,891591007,0.084729,2,5247,26239
580396,2020-09-07,00639e775b90554156986100685c4d408723c77e411e37...,891591001,0.084729,2,5247,26236
580397,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,909924004,0.033881,2,3889,27666
580398,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,921266005,0.016932,2,3889,28226
580399,2020-09-07,016f3b7c2b7652870b4b2dbdcb1805a7c8c262036f5511...,857163001,0.013542,2,3889,21691


In [25]:
dev_set_by_customer = dev_set.groupby('customer_id').apply(lambda x: list(x['article_id_idx'])).reset_index().rename(columns={0: 'article_id_idx'})
dev_set_by_customer.head()

Unnamed: 0,customer_id,article_id_idx
0,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,[27260]
1,0054c50274d19af58d53ef3ce0c004bea446c80bd51cf2...,"[10779, 1903, 10447]"
2,00639e775b90554156986100685c4d408723c77e411e37...,"[26239, 26236, 26236]"
3,00798bd464457d23d6af401715fe32d5c676ad9ee4010d...,[28006]
4,0099238196d8f71659fceaa115b36e400398bcfc169b5f...,"[23305, 23305, 616, 25352, 25352, 21835, 21835..."


In [26]:
customer_id_idx_ordered = dev_set_by_customer['customer_id'].map(customer_id_to_idx).tolist()

In [27]:
dev_actuals = dev_set_by_customer['article_id_idx'].to_list()

### Define loss function

In [None]:
labels = torch.tensor([1, 1, 0, 0])
preds = torch.tensor([0.9, 0.2, 0.3, 0.4])

In [None]:
obs_mask = torch.where(labels == 1)
obs_mask

(tensor([0, 1]),)

In [None]:
unobs_mask = torch.where(labels == 0)
unobs_mask

(tensor([2, 3]),)

In [None]:
w_0 = 2

In [None]:
obs_loss = ((labels[obs_mask] - preds[obs_mask])**2).sum()
obs_loss

tensor(0.6500)

In [None]:
unobs_loss = ((labels[unobs_mask] - preds[unobs_mask])**2).sum()
unobs_loss

tensor(0.2500)

In [None]:
total_loss = obs_loss + w_0 * unobs_loss

In [None]:
loss = total_loss / len(labels)
loss

tensor(0.2875)

In [None]:
def weighted_matrix_factorisation_loss(preds, labels, w_0=1):
    obs_mask = torch.where(labels == 1)

    unobs_mask = torch.where(labels == 0)

    obs_loss = ((labels[obs_mask] - preds[obs_mask])**2).sum()

    unobs_loss = (preds[unobs_mask]**2).sum()

    total_loss = obs_loss + w_0 * unobs_loss

    loss = total_loss / len(labels)
    
    return loss

In [None]:
weighted_matrix_factorisation_loss(preds, labels, w_0=2)

tensor(0.2875)

### Format data

Since all the users in the test said are included in training we utilise the full training set.

In [28]:
train_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,customer_id_idx,article_id_idx
10,2018-09-20,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,564358010,0.033441,2,0,1810
17,2018-09-20,02d796ea767fa2e94fc6228fe70d8af1a570da973c32f7...,575347003,0.013373,2,0,2100
27,2018-09-20,03126e45c08e82cd192b32cbddbeba63387e11e5531059...,573652001,0.033881,2,1,2072
40,2018-09-20,08e5ec1f652c85fc59be204b60f262b6d1f45e4ce11872...,573937001,0.032186,2,2,2081
43,2018-09-20,09bca14258b7e9b9ee968adffe54d45c1e370e8ed79a22...,579541001,0.016932,1,3,2175


- Keep distinct purchases only
- Discard date, price and channel columns

In [29]:
endm1

Timestamp('2020-09-07 00:00:00')

In [30]:
# train_set = train_set.copy()[train_set['t_dat'] > endm1 - datetime.timedelta(days=7*52)]

In [31]:
train_set_processed = train_set.copy()[['customer_id_idx', 'article_id_idx']].drop_duplicates(subset=['customer_id_idx', 'article_id_idx'])
train_set_processed.shape

(323603, 2)

In [32]:
train_set_processed.head()

Unnamed: 0,customer_id_idx,article_id_idx
10,0,1810
17,0,2100
27,1,2072
40,2,2081
43,3,2175


For each customer sample up to X non-purchased items to use as negative samples

In [33]:
purchases_by_customer_id_idx = train_set_processed.groupby('customer_id_idx')['article_id_idx'].apply(lambda x: list(x)).reset_index()
purchases_by_customer_id_idx.head()

Unnamed: 0,customer_id_idx,article_id_idx
0,0,"[1810, 2100, 1731, 1730, 1728, 218, 7551, 1726..."
1,1,"[2072, 4444, 8648, 10352, 5645, 2309, 2358, 19..."
2,2,"[2081, 794, 5791, 9058, 8476, 2034, 3569, 2628..."
3,3,"[2175, 3017, 2552, 878, 3055, 2905, 5516, 4359..."
4,4,"[2974, 9135, 2050, 4032, 4031, 3637, 485, 616,..."


In [34]:
articles_to_sample_from = articles_df['article_id'].map(article_id_to_idx)

In [35]:
def negative_samples(excl):
    samples = set(np.random.choice(articles_to_sample_from, replace=False, size=1000))
    neg_samples = samples - set(excl)
    return list(neg_samples)

In [36]:
# negative_samples(purchases_by_customer_id_idx.iloc[0]['article_id_idx'])

In [37]:
purchases_by_customer_id_idx.shape

(9052, 2)

In [38]:
%%time
purchases_by_customer_id_idx['negative_samples'] = purchases_by_customer_id_idx['article_id_idx'].apply(negative_samples)

CPU times: user 5.6 s, sys: 160 ms, total: 5.75 s
Wall time: 5.73 s


In [39]:
purchases_by_customer_id_idx.head()

Unnamed: 0,customer_id_idx,article_id_idx,negative_samples
0,0,"[1810, 2100, 1731, 1730, 1728, 218, 7551, 1726...","[2050, 18438, 2054, 24584, 18440, 10, 12297, 2..."
1,1,"[2072, 4444, 8648, 10352, 5645, 2309, 2358, 19...","[28673, 20482, 18438, 14344, 2058, 16395, 1434..."
2,2,"[2081, 794, 5791, 9058, 8476, 2034, 3569, 2628...","[6144, 6146, 8195, 6148, 20484, 20486, 6151, 2..."
3,3,"[2175, 3017, 2552, 878, 3055, 2905, 5516, 4359...","[16386, 24579, 16389, 16390, 22534, 18440, 615..."
4,4,"[2974, 9135, 2050, 4032, 4031, 3637, 485, 616,...","[24579, 12291, 26629, 2056, 24586, 10253, 2663..."


In [40]:
purchases_by_customer_id_idx['negative_samples'].apply(len).value_counts()

1000    3978
999     2386
998     1233
997      664
996      340
995      173
994      104
993       67
992       36
991       21
990       20
988        8
986        5
989        5
984        3
987        3
985        2
982        2
979        1
981        1
Name: negative_samples, dtype: int64

In [41]:
negative_samples_df = purchases_by_customer_id_idx[['customer_id_idx', 'negative_samples']].explode('negative_samples')
negative_samples_df.head()

Unnamed: 0,customer_id_idx,negative_samples
0,0,2050
0,0,18438
0,0,2054
0,0,24584
0,0,18440


In [42]:
negative_samples_df.isnull().sum()

customer_id_idx     0
negative_samples    0
dtype: int64

In [43]:
negative_samples_df.dropna(subset=['negative_samples'], inplace=True)

In [44]:
negative_samples_df.rename(columns={'negative_samples': 'article_id_idx'}, inplace=True)
negative_samples_df['purchased'] = 0

In [45]:
negative_samples_df.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
0,0,2050,0
0,0,18438,0
0,0,2054,0
0,0,24584,0
0,0,18440,0


In [46]:
positive_samples_df = train_set_processed.copy()[['customer_id_idx', 'article_id_idx']]
positive_samples_df['purchased'] = 1

positive_samples_df.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
10,0,1810,1
17,0,2100,1
27,1,2072,1
40,2,2081,1
43,3,2175,1


In [47]:
training_data_full = positive_samples_df.append(negative_samples_df)
training_data_full.shape

(9364350, 3)

In [48]:
training_data_full['purchased'].mean()

0.034556909983074105

Can shuffle in `DataLoader`

In [49]:
training_data_full.head()

Unnamed: 0,customer_id_idx,article_id_idx,purchased
10,0,1810,1
17,0,2100,1
27,1,2072,1
40,2,2081,1
43,3,2175,1


In [50]:
class PurchasesDataset(Dataset):

    def __init__(self):
        self.customer_id_idx = training_data_full['customer_id_idx'].values
        self.article_id_idx = training_data_full['article_id_idx'].values
        self.purchased = training_data_full['purchased'].values

    def __len__(self):
        return self.customer_id_idx.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        return self.customer_id_idx[idx], self.article_id_idx[idx], self.purchased[idx]

In [76]:
class MatrixFactorisation(nn.Module):

    def __init__(self):
        super(MatrixFactorisation, self).__init__()
        
        self.user_embeddings = nn.Embedding(num_embeddings=len(customer_id_to_idx), embedding_dim=256)
        
        self.item_embeddings = nn.Embedding(num_embeddings=len(article_id_to_idx), embedding_dim=256)

        torch.nn.init.xavier_uniform_(self.user_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.item_embeddings.weight)

    def forward(self, user_id, item_id):
        
        user_emb = self.user_embeddings(user_id)
        item_emb = self.item_embeddings(item_id)

        user_emb = nn.Dropout(0.4)(user_emb)
        item_emb = nn.Dropout(0.4)(item_emb)
        
        scores = torch.diagonal(
            torch.matmul(user_emb, item_emb.T)
        )

        scores = nn.Sigmoid()(scores)
        
        return scores

In [77]:
matrix_fact_model = MatrixFactorisation()

In [78]:
matrix_fact_model.to(device)

MatrixFactorisation(
  (user_embeddings): Embedding(9122, 256)
  (item_embeddings): Embedding(28709, 256)
)

In [79]:
train_dataset = PurchasesDataset()

In [80]:
BATCH_SIZE = 2048

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)  

In [81]:
# # Selecting a subset of data
# subset = torch.utils.data.Subset(train_dataset, [1])

# train_loader = DataLoader(subset, batch_size=BATCH_SIZE)  

In [82]:
optimizer = torch.optim.Adam(params=matrix_fact_model.parameters(), lr=0.001)

Switched to BCELoss instead of weighted MSE. Had more success with this.

In [83]:
criterion = nn.BCELoss()

In [84]:
MAX_EPOCHS = 10
W_0 = 1
training_losses = []
dev_maps = []

for epoch in range(MAX_EPOCHS):
    
    for data in tqdm(train_loader):
        
        user_id, item_id, label = data
        
        user_id, item_id, label = user_id.to(device), item_id.to(device), label.to(device)

        optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

        pred = matrix_fact_model(user_id, item_id)
        
        # loss = weighted_matrix_factorisation_loss(pred, label, w_0=W_0)

        loss = criterion(pred, label.float())

        loss.backward()  
        
        optimizer.step()
        
    # Compute training loss
    
    total_train_loss = 0

    matrix_fact_model.eval()

    with torch.no_grad():
        for data in DataLoader(train_dataset, batch_size=BATCH_SIZE):
            
            user_id, item_id, label = data
        
            user_id, item_id, label = user_id.to(device), item_id.to(device), label.to(device)

            optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

            pred = matrix_fact_model(user_id, item_id)

            # loss = weighted_matrix_factorisation_loss(pred, label, w_0=W_0)

            loss = nn.BCELoss(reduction='sum')(pred, label.float())

            total_train_loss += loss.item()

        mean_train_loss = total_train_loss / len(train_dataset)

        print(f"Training loss: {mean_train_loss}")
        training_losses.append(mean_train_loss)
    
    # Compute dev MAP@12

    pred = torch.matmul(matrix_fact_model.user_embeddings.weight, matrix_fact_model.item_embeddings.weight.T)

    predictions = torch.topk(pred, 1000, dim=1).indices.tolist()

    dev_map12 = mapk(dev_actuals, predictions, k=12)

    dev_precision1k = np.mean(
        [len(set(pred).intersection(set(act))) / 1000 for pred, act in zip(predictions, dev_actuals)]
    )

    print(f"MAP@12: {dev_map12}")
    dev_maps.append(dev_map12)

    print(f"Prec@1k: {dev_precision1k}")

    matrix_fact_model.train()

100%|██████████| 4573/4573 [00:39<00:00, 116.69it/s]


Training loss: 0.11877576458834571
MAP@12: 0.0017025165456182052
Prec@1k: 0.0003864107883817428


100%|██████████| 4573/4573 [00:39<00:00, 116.34it/s]


Training loss: 0.11569844325958906
MAP@12: 0.0017957032162167015
Prec@1k: 0.00038485477178423235


100%|██████████| 4573/4573 [00:39<00:00, 116.89it/s]


Training loss: 0.10981509478999138
MAP@12: 0.0013492213181009861
Prec@1k: 0.0003843360995850623


100%|██████████| 4573/4573 [00:39<00:00, 116.96it/s]


Training loss: 0.1016217784511349
MAP@12: 0.0014295719090967063
Prec@1k: 0.00037033195020746886


100%|██████████| 4573/4573 [00:39<00:00, 116.87it/s]


Training loss: 0.09257814446576813
MAP@12: 0.001156466193606828
Prec@1k: 0.00036410788381742735


100%|██████████| 4573/4573 [00:39<00:00, 116.93it/s]


Training loss: 0.08431730556957269
MAP@12: 0.0013135688414474722
Prec@1k: 0.0003516597510373444


100%|██████████| 4573/4573 [00:39<00:00, 115.81it/s]


Training loss: 0.07711818614301344
MAP@12: 0.001242371744372337
Prec@1k: 0.00033973029045643153


100%|██████████| 4573/4573 [00:39<00:00, 116.80it/s]


Training loss: 0.07084172991771188
MAP@12: 0.001172223910380895
Prec@1k: 0.00032987551867219925


100%|██████████| 4573/4573 [00:39<00:00, 116.02it/s]


Training loss: 0.06581653517150222
MAP@12: 0.0007288348116185994
Prec@1k: 0.0003226141078838175


100%|██████████| 4573/4573 [00:39<00:00, 117.17it/s]


Training loss: 0.0615676522075293
MAP@12: 0.0008767354354318393
Prec@1k: 0.00032365145228215774


### Idea: Use purchase counts rather than unique purchases and frame as regression

### For a given item look at most similar items

In [None]:
matrix_fact_model.user_embeddings.weight

Parameter containing:
tensor([[-2.6022e+00, -5.8713e-01, -8.6654e+00,  ..., -8.3222e+00,
          5.2864e+00,  1.1209e+01],
        [-1.6626e+00,  1.0944e+00, -5.3767e+00,  ...,  1.7319e+00,
         -3.2302e+00,  5.2481e+00],
        [ 9.2689e+00,  1.5309e+00, -7.1217e-01,  ..., -1.5036e+00,
          6.8088e+00, -2.6902e+00],
        ...,
        [ 2.9843e-03, -3.1525e-03,  1.4968e-02,  ..., -1.0128e-03,
         -8.8798e-03, -1.3996e-02],
        [-2.0700e-02,  3.0138e-03, -1.3882e-02,  ..., -4.1250e-03,
         -2.0305e-02,  1.6165e-02],
        [-1.1830e-02,  2.2714e-02, -1.3435e-02,  ..., -8.6281e-04,
          7.2083e-03,  4.0233e-03]], device='cuda:0', requires_grad=True)

In [None]:
articles_df.head()

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Jersey top with narrow shoulder straps.
2,110065002,OP T-shirt (Idro),Bra,Underwear,"Microfibre T-shirt bra with underwired, moulde..."
3,111565001,20 den 1p Stockings,Underwear Tights,Socks & Tights,"Semi shiny nylon stockings with a wide, reinfo..."
4,111586001,Shape Up 30 den 1p Tights,Leggings/Tights,Garment Lower body,Tights with built-in support to lift the botto...


In [None]:
articles_df['article_id_idx'] = articles_df['article_id'].map(article_id_to_idx)

In [None]:
np.random.seed(3)
sample_article_id_idx = articles_df.sample(n=1)['article_id_idx'].item()
sample_article_id_idx

337

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
%%time
sim = cosine_similarity(
    matrix_fact_model.item_embeddings.weight.cpu().detach().numpy()[sample_article_id_idx].reshape(1, -1), 
    matrix_fact_model.item_embeddings.weight.cpu().detach().numpy()
)
print(sim)

[[0.30767092 0.21928097 0.36967278 ... 0.32739282 0.34107    0.43882087]]
CPU times: user 28 ms, sys: 21.3 ms, total: 49.3 ms
Wall time: 34.4 ms


In [None]:
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

In [None]:
%%time
cos(matrix_fact_model.item_embeddings.weight[sample_article_id_idx], matrix_fact_model.item_embeddings.weight)

CPU times: user 1.64 ms, sys: 18 µs, total: 1.65 ms
Wall time: 1.56 ms


tensor([0.3077, 0.2193, 0.3697,  ..., 0.3274, 0.3411, 0.4388], device='cuda:0',
       grad_fn=<DivBackward0>)

In [None]:
cos = nn.CosineSimilarity(dim=0, eps=1e-6)

In [None]:
%%time
sim_mat = cos(matrix_fact_model.item_embeddings.weight, matrix_fact_model.item_embeddings.weight)

CPU times: user 2.06 ms, sys: 27 µs, total: 2.09 ms
Wall time: 4.91 ms


In [None]:
sim_mat.shape

torch.Size([100])

https://stackoverflow.com/questions/50411191/how-to-compute-the-cosine-similarity-in-pytorch-for-all-rows-in-a-matrix-with-re

In [None]:
a = matrix_fact_model.item_embeddings.weight
b = matrix_fact_model.item_embeddings.weight

# Given that cos_sim(u, v) = dot(u, v) / (norm(u) * norm(v))
#                          = dot(u / norm(u), v / norm(v))
# We fist normalize the rows, before computing their dot products via transposition:
a_norm = a / a.norm(dim=1)[:, None]
b_norm = b / b.norm(dim=1)[:, None]
res = torch.mm(a_norm, b_norm.transpose(0,1))

In [None]:
res.shape

torch.Size([28709, 28709])

In [None]:
res[sample_article_id_idx, :]

tensor([0.3077, 0.2193, 0.3697,  ..., 0.3274, 0.3411, 0.4388], device='cuda:0',
       grad_fn=<SliceBackward0>)

### TODO

- Increase size of embeddings
- Frame as regression with purchase counts as labels
- Candidate generation using similarity of mean historical basket to item embeddings (i.e. related-item)
- Candidate generation taking CF output (user-item dot product)
- Double check details of approach e.g. negative sampling strategy
- If CF fails can still try softmax model but with historical purchases as inputs (looong)

In [None]:
k = 10

closest_k = np.flip(np.argsort(sim)).flatten()[:k]
closest_k

array([75601, 52092, 64906, 94384, 53663, 69706, 42360, 35740, 55846,
       21771])

In [None]:
articles_df.iloc[closest_k]

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
75601,782138001,Lea PQ highshaft,Boots,Shoes,"Suede boots with gently rounded toes, soft, su...",75601
52092,700586001,SKINNY EMILY,Trousers,Garment Lower body,"Slim-fit jeans in washed, superstretch, flexib...",52092
64906,743218008,Bracelet Lincoln Italy,Bracelet,Accessories,Multi-strand bracelet in imitation leather wit...,64906
94384,862219003,Parrot cropped trousers,Trousers,Garment Lower body,Ankle-length suit trousers in woven fabric wit...,94384
53663,705382001,Speed Caramella earring,Earring,Accessories,Long metal earrings with plastic pendants in v...,53663
69706,757915001,Swish Super Push,Bikini top,Swimwear,"Bikini top with underwired, gathered, thickly ...",69706
42360,672460006,MARS Treggings,Trousers,Garment Lower body,Treggings in stretch cotton twill with an elas...,42360
35740,648254001,Bobby,Skirt,Garment Lower body,Calf-length skirt in a patterned crêpe weave w...,35740
55846,711846002,Noa mesh bag,Bag,Accessories,Bag in cotton mesh with round plastic handles ...,55846
21771,595318002,Flirty pompom terry pk,Hair string,Accessories,Hair elastics decorated with pompoms.,21771


### Look at recommendations for a given user

In [None]:
sample_customer = train_set_processed.sample(n=1)['customer_id_idx'].item()
sample_customer

7266

In [None]:
pred = torch.matmul(matrix_fact_model.user_embeddings.weight[sample_customer, :], matrix_fact_model.item_embeddings.weight.T)

In [None]:
closest_k = torch.topk(pred, k=10).indices
closest_k

tensor([ 85106, 101366, 101220,  77601,  25806,   2236,  88346,  74290,  89628,
         85223], device='cuda:0')

In [None]:
# Previous purchases
train_set_processed[train_set_processed['customer_id_idx'] == sample_customer].merge(
    articles_df,
    on='article_id_idx'
)

Unnamed: 0,customer_id_idx,article_id_idx,article_id,prod_name,product_type_name,product_group_name,detail_desc
0,7266,77253,788575001,Maja cargo Slim HW Denim,Trousers,Garment Lower body,"Jeans in washed, stretch denim with a high wai..."
1,7266,81405,805525001,Nottingham trucker,Jacket,Garment Upper body,"Short, boxy jacket in cotton twill with a coll..."
2,7266,73048,770315005,Alpha essential top,Top,Garment Upper body,"Top in airy jersey crêpe with a V-neck, short ..."
3,7266,40321,664074077,Charlie Top,Top,Garment Upper body,Straight-cut top in airy jersey crêpe with a b...
4,7266,85106,818024001,Toulon jumper,Sweater,Garment Upper body,"Wide top in a stretchy, fine knit with a light..."
5,7266,87171,826492006,Dame,Sweater,Garment Upper body,"Boxy-style jumper in a soft, fine knit contain..."
6,7266,17129,572998005,Beverly HW Loose Mom Fit Dnm,Trousers,Garment Lower body,"5-pocket, ankle-length jeans in washed denim w..."
7,7266,87358,827411001,Chubba Chubb Highwaist brazili,Swimwear bottom,Swimwear,"Fully lined, waist-high bikini bottoms with ga..."
8,7266,45795,684209013,Simple as That Triangle Top,Bikini top,Swimwear,"Lined, non-wired, triangle bikini top with a w..."
9,7266,45802,684209027,Simple as That Triangle Top,Bikini top,Swimwear,"Lined, non-wired, triangle bikini top with a w..."


In [None]:
articles_df.iloc[closest_k.cpu().detach().numpy()]

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,detail_desc,article_id_idx
85106,818024001,Toulon jumper,Sweater,Garment Upper body,"Wide top in a stretchy, fine knit with a light...",85106
101366,896152001,Amelie,T-shirt,Garment Upper body,"Top in a soft, fine knit containing some wool ...",101366
101220,895002002,Steam smock top,Top,Garment Upper body,Short top in crinkled jersey with a smocked bo...,101220
77601,790368001,Pantha PU leggings,Trousers,Garment Lower body,Leggings in stretch fabric. High waist with co...,77601
25806,614854013,Fiona brazilian (Acacia) 4p,Underwear bottom,Underwear,Brazilian briefs in soft jersey with lace trim...,25806
2236,399223001,Curvy Jeggings HW Ankle,Trousers,Garment Lower body,"Jeggings in washed, superstretch denim with a ...",2236
88346,832473005,Asa smock top,Top,Garment Upper body,Tube top in smocked cotton jersey with a frill...,88346
74290,776237011,Shake it in Balconette,Bikini top,Swimwear,"Lined balconette bikini top with underwired, p...",74290
89628,839227001,Big satin scrunchie,Hair string,Accessories,Elasticated scrunchie covered in satin with a ...,89628
85223,818614007,Samantha seamless bandeau,Bra,Underwear,Strapless bandeau bra in jersey with padded cu...,85223


### Test set evaluation

In [None]:
pred = torch.matmul(matrix_fact_model.user_embeddings.weight, matrix_fact_model.item_embeddings.weight.T)

In [None]:
pred.shape

torch.Size([9208, 105542])

In [None]:
pred

tensor([[-7.3850e+02, -5.8736e+02, -3.0175e+02,  ..., -3.2758e+02,
         -6.7557e+02, -2.4729e+02],
        [-7.9312e+02, -7.1850e+02, -2.0791e+02,  ..., -7.2305e+02,
         -9.8646e+02, -4.3774e+02],
        [-6.6568e+02, -6.7205e+02, -6.9188e+02,  ..., -7.9726e+02,
         -1.0750e+03, -3.5086e+02],
        ...,
        [-5.5384e-01,  6.9027e-01, -1.2639e-01,  ..., -2.6822e-01,
         -5.3654e-01, -7.8011e-02],
        [ 5.4282e-01, -2.5723e-01, -1.4271e-01,  ..., -1.9621e-02,
         -3.0582e-02, -8.6693e-02],
        [ 3.7253e-01,  1.0631e+00,  4.7211e-01,  ...,  3.7026e-01,
          3.5100e-01,  1.9151e-02]], device='cuda:0', grad_fn=<MmBackward0>)

In [None]:
%%time
predictions = torch.topk(pred, 12, dim=1).indices.tolist()

CPU times: user 147 ms, sys: 989 µs, total: 148 ms
Wall time: 148 ms


In [None]:
test_set['article_id_idx'] = test_set['article_id'].map(article_id_to_idx)

In [None]:
test_set_by_customer = test_set.groupby('customer_id').apply(lambda x: list(x['article_id_idx'])).reset_index().rename(columns={0: 'article_id_idx'})
test_set_by_customer.head()

Unnamed: 0,customer_id,article_id_idx
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[78503]
1,0015f37f752a41a75c3be6f3f92deedc4c87d039f1758e...,"[74893, 74893, 24872, 104987, 9801, 9801, 6912..."
2,00282135561702f5b3b750fa3382d8fd83ce5d761a507e...,"[92815, 86041]"
3,00356a94bb9bed341f6dba58ad722974b01a1cbd9f06ef...,"[104211, 17044, 17044, 16024, 29250, 98445, 66..."
4,00462904b288681a9facb555f75dd2cf4d0f730a6e0ea7...,"[87467, 99966, 42626, 79834, 97540, 103280, 79..."


In [None]:
customers_ordered = test_set_by_customer['customer_id'].map(customer_id_to_idx).tolist()

In [None]:
actuals = test_set_by_customer['article_id_idx'].to_list()

In [None]:
mapk(actuals, predictions, k=12)

0.0005853456357773256

CF doesn't do better than simple heuristic (top 12 most popular). Possibly explains why earlier attempts at training embeddings failed.