In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from scipy import sparse
import numpy as np
from torch import nn
from torch.nn import functional as F
import faiss
from metric import mapk
import pandas as pd
import schema
from utils import extract_transactions_train, extract_transactions_valid
import datetime
from logzero import logger

In [2]:
class BPRDataset(Dataset):
    """Dataset of (user, positive item, negative item) triples for BPR training.

    Element ``idx`` is built from row ``idx`` of ``transactions``: the user and
    the item of that row, plus one item drawn uniformly at random among the
    items the user has no recorded transaction with. The negative item is
    re-sampled on every access.
    """

    def __init__(self, n_user: int, n_item: int, transactions: np.ndarray):
        """
        Parameters
        ----------
        n_user
            number of users
        n_item
            number of items
        transactions
            (n_transactions, 2) 2d array; column 0 is the user index and
            column 1 is the item index
        """
        self.n_user = n_user
        self.n_item = n_item
        self.transactions = transactions
        # Sparse user x item indicator, used to reject already-seen items
        # while sampling negatives.
        self.transactions_matrix = sparse.lil_matrix((self.n_user, self.n_item), dtype='int')
        self.transactions_matrix[transactions[:,0], transactions[:,1]] = 1

    def __len__(self):
        """Number of positive (user, item) rows."""
        return len(self.transactions)

    def __getitem__(self, idx):
        """Return ``(user, pos_item, neg_item)`` as 0-d tensors.

        Raises
        ------
        ValueError
            If the user has a transaction with every item, so that no negative
            item exists (rejection sampling would otherwise never terminate).
        """
        user, pos_item = self.transactions[idx]

        # `rows[user]` lists the columns stored for this user; every stored
        # entry is a 1, so its length is the number of distinct seen items.
        if len(self.transactions_matrix.rows[user]) >= self.n_item:
            raise ValueError(
                f"user {user} has interacted with all {self.n_item} items; "
                "cannot sample a negative item"
            )

        while True:
            # Use torch's RNG rather than the global NumPy one: it is seeded
            # per DataLoader worker and is controlled by torch.manual_seed.
            neg_item = torch.randint(int(self.n_item), (1,)).item()
            if self.transactions_matrix[user, neg_item] == 0:
                break
        return torch.tensor(user), torch.tensor(pos_item), torch.tensor(neg_item)

In [3]:
class BPRModel(nn.Module):
    """Two-tower model that scores (user, item) pairs by a dot product.

    Each tower concatenates a learned id embedding with the fixed feature row
    of that id and projects the result to ``embedding_dim`` with one linear
    layer.
    """

    def __init__(self, user_features: np.ndarray, item_features: np.ndarray, embedding_dim: int):
        """
        Parameters
        ----------
        user_features
            2d array with one row of features per user
        item_features
            2d array with one row of features per item
        embedding_dim
            size of the final user / item representation
        """
        super().__init__()
        self.n_user = len(user_features)
        self.n_item = len(item_features)

        # Registered as buffers so they follow the module across devices
        # (`.cuda()`, `.to(...)`), instead of being pinned to CUDA here.
        # `persistent=False` keeps them out of `state_dict`.
        self.register_buffer(
            'user_features',
            torch.tensor(user_features, dtype=torch.float32),
            persistent=False,
        )
        self.register_buffer(
            'item_features',
            torch.tensor(item_features, dtype=torch.float32),
            persistent=False,
        )

        self.embedding_dim = embedding_dim

        user_feature_dim = user_features.shape[1]
        item_feature_dim = item_features.shape[1]

        # Width of the learned per-id embeddings that are concatenated with
        # the fixed features before the projection.
        USER_EMB = 128
        ITEM_EMB = 128
        self.user_embedding = nn.Embedding(self.n_user, USER_EMB)
        self.item_embedding = nn.Embedding(self.n_item, ITEM_EMB)

        self.user_fc = nn.Sequential(
            nn.Linear(USER_EMB + user_feature_dim, embedding_dim),
        )
        self.item_fc = nn.Sequential(
            nn.Linear(ITEM_EMB + item_feature_dim, embedding_dim),
        )

    def forward(self, users, pos_items, neg_items):
        """Return the dot-product scores ``(pos, neg)``, one value per row of the batch."""
        x_users = self.forward_user(users)
        x_pos_items = self.forward_item(pos_items)
        x_neg_items = self.forward_item(neg_items)

        pos = (x_users * x_pos_items).sum(dim=1)
        neg = (x_users * x_neg_items).sum(dim=1)
        return pos, neg

    def forward_user(self, users):
        """Map a 1-d tensor of user indices to ``(len(users), embedding_dim)`` vectors."""
        user_emb = self.user_embedding(users)
        return self.user_fc(torch.cat([user_emb, self.user_features[users]], dim=1))

    def forward_item(self, items):
        """Map a 1-d tensor of item indices to ``(len(items), embedding_dim)`` vectors."""
        item_emb = self.item_embedding(items)
        return self.item_fc(torch.cat([item_emb, self.item_features[items]], dim=1))

In [4]:
# Load the pre-transformed tables, keeping only the columns named in `schema`.
# NOTE(review): pickle files execute arbitrary code on load; only read trusted files.
transactions = pd.read_pickle('input/transformed/transactions_train.pkl')[schema.TRANSACTIONS]
articles = pd.read_pickle('input/transformed/articles.pkl')[schema.ARTICLES]
customers = pd.read_pickle('input/transformed/customers.pkl')[schema.CUSTOMERS]

# Both splits are anchored on the same date. With these arguments the helpers
# log valid = [2020-09-16, 2020-09-23) and train = [2020-09-09, 2020-09-16).
split_date = datetime.date(2020, 9, 16)
transactions_valid = extract_transactions_valid(transactions, split_date)
transactions_train = extract_transactions_train(transactions, split_date, 7)

[I 220306 22:01:20 utils:14] valid: [2020-09-16, 2020-09-23)
[I 220306 22:01:20 utils:16] # of records: 240311
[I 220306 22:01:20 utils:27] train: [2020-09-09, 2020-09-16)
[I 220306 22:01:20 utils:29] # of records: 255241


In [5]:
# One row per validation customer, with the list of article indices they bought.
val = (
    transactions_valid
    .groupby('customer_id_idx')['article_id_idx']
    .apply(list)
    .reset_index()
)

In [6]:
def create_customer_features(customers):
    """Build the numeric customer feature matrix.

    Drops ``customer_id_idx``, adds an ``age_is_null`` indicator, fills missing
    ``age`` with the mean age, and one-hot encodes ``club_member_status_idx``
    and ``fashion_news_frequency_idx``. The input frame is not modified.

    Parameters
    ----------
    customers
        DataFrame containing at least the columns named above.

    Returns
    -------
    np.ndarray
        2d array with one row per row of ``customers``.
    """
    df = customers.drop('customer_id_idx', axis=1).assign(
        # Computed first, from the still-unfilled `age` column.
        age_is_null=lambda d: d['age'].isnull().astype(int),
        age=lambda d: d['age'].fillna(d['age'].mean()),
    )
    # Pin the dummy dtype: pandas >= 2.0 defaults to bool, and mixing bool with
    # the float `age` column makes `.values` an object array that
    # `torch.tensor` cannot convert.
    encoded = pd.get_dummies(
        df,
        columns=['club_member_status_idx', 'fashion_news_frequency_idx'],
        dtype=np.uint8,
    )
    return encoded.values

def create_article_features(articles):
    """One-hot encode every article column except ``article_id_idx``.

    The input frame is not modified.

    Parameters
    ----------
    articles
        DataFrame containing ``article_id_idx`` plus the columns to encode.

    Returns
    -------
    np.ndarray
        2d array with one row per row of ``articles``; the indicator columns
        of each input column appear in the input column order.
    """
    df = articles.drop('article_id_idx', axis=1)
    if df.shape[1] == 0:
        # Nothing to encode; `pd.concat` would reject an empty list.
        return df.values
    # Encode each column once and concatenate a single time, instead of
    # re-building the frame per column. The dtype is pinned because
    # pandas >= 2.0 would otherwise produce bool indicators.
    encoded = [pd.get_dummies(df[c], prefix=c, dtype=np.uint8) for c in df.columns]
    return pd.concat(encoded, axis=1).values

# Side-information matrices: one row per customer / per article.
user_features = create_customer_features(customers)
item_features = create_article_features(articles)

# Row counts of the matrices above, used to size the dataset.
n_user, n_item = len(user_features), len(item_features)

In [7]:
# (customer index, article index) pairs of the training window.
train_pairs = transactions_train[['customer_id_idx', 'article_id_idx']].values
dataset = BPRDataset(n_user, n_item, train_pairs)

# 256 is the size of the final user / item representation.
model = BPRModel(user_features, item_features, embedding_dim=256).cuda()

In [8]:
def _calc_representations(model, forward_fn, n_rows, batch_size):
    """Apply ``forward_fn`` to the ids ``0..n_rows-1`` in batches and stack the outputs."""
    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    representations = []
    # Inference only: skip building the autograd graph for every batch.
    with torch.no_grad():
        for ids in torch.arange(n_rows).split(batch_size):
            representations.append(forward_fn(ids.to(device)).cpu().numpy())
    return np.vstack(representations)


def calc_user_representations(model, batch_size: int = 256):
    """Return the representation of every user as one array.

    Parameters
    ----------
    model
        module exposing ``n_user`` and ``forward_user``
    batch_size
        number of users pushed through the model at a time

    Returns
    -------
    np.ndarray
        array with ``model.n_user`` rows; row ``i`` is the output of
        ``model.forward_user`` for user index ``i``
    """
    return _calc_representations(model, model.forward_user, model.n_user, batch_size)


def calc_item_representations(model, batch_size: int = 256):
    """Return the representation of every item as one array.

    Parameters
    ----------
    model
        module exposing ``n_item`` and ``forward_item``
    batch_size
        number of items pushed through the model at a time

    Returns
    -------
    np.ndarray
        array with ``model.n_item`` rows; row ``i`` is the output of
        ``model.forward_item`` for item index ``i``
    """
    return _calc_representations(model, model.forward_item, model.n_item, batch_size)


In [9]:
# Shuffled mini-batches of (user, positive item, negative item) triples;
# the last incomplete batch of each pass is dropped.
dataloader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

# Parameters are moved to the GPU before the optimizer is built from them.
model = model.cuda()
# Adam with its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

def loss_fn(pos_output, neg_output):
    """BPR loss: mean over the batch of ``-log(sigmoid(pos - neg))``.

    Uses ``F.logsigmoid`` instead of ``sigmoid().log()``: for a large negative
    margin the sigmoid underflows to 0 and its log becomes ``-inf``, which
    makes the loss (and its gradients) non-finite.

    Parameters
    ----------
    pos_output
        scores of the positive items
    neg_output
        scores of the negative items, same shape as ``pos_output``

    Returns
    -------
    torch.Tensor
        scalar loss
    """
    return -F.logsigmoid(pos_output - neg_output).mean()

N_EPOCHS = 1000  # upper bound; the run is meant to be interrupted manually
TOP_K = 12       # number of items retrieved per user for the MAP metric

# Loop-invariant set-up: one GPU resource object serves every epoch, and only
# the validation users are ever scored.
gpu_resources = faiss.StandardGpuResources()
val_user_ids = val.customer_id_idx.values

for _ in range(N_EPOCHS):
    # ---- train for one pass over the positive pairs ----
    model.train()
    losses = []
    for users, items_pos, items_neg in dataloader:
        users, items_pos, items_neg = users.cuda(), items_pos.cuda(), items_neg.cuda()

        pos_output, neg_output = model(users, items_pos, items_neg)
        loss = loss_fn(pos_output, neg_output)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    logger.info(f"loss: {np.mean(losses)}")

    # ---- evaluate: inner-product retrieval of the top items per validation user ----
    model.eval()
    user_representations = calc_user_representations(model)
    item_representations = calc_item_representations(model)

    # The index is rebuilt each epoch because the item vectors have changed.
    index = faiss.index_factory(model.embedding_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
    index = faiss.index_cpu_to_gpu(gpu_resources, 0, index)
    index.add(item_representations)
    # Query only the users that are scored; row i belongs to val_user_ids[i].
    _, idxs = index.search(user_representations[val_user_ids], TOP_K)

    logger.info(f"map: {mapk(val.article_id_idx, idxs)}")

[I 220306 22:03:11 4263974808:23] loss: 0.61825029340497
[I 220306 22:03:28 4263974808:34] map: 0.0003894452399743309
[I 220306 22:05:13 4263974808:23] loss: 0.330833975689013
[I 220306 22:05:30 4263974808:34] map: 0.0014804654942221897
[I 220306 22:07:15 4263974808:23] loss: 0.2529191033113206
[I 220306 22:07:32 4263974808:34] map: 0.0025998038703893725
[I 220306 22:09:12 4263974808:23] loss: 0.22053255628020976
[I 220306 22:09:28 4263974808:34] map: 0.0032219052313018676
[I 220306 22:11:08 4263974808:23] loss: 0.20087394573181167
[I 220306 22:11:23 4263974808:34] map: 0.0025355163628633342
[I 220306 22:13:04 4263974808:23] loss: 0.188961113506504
[I 220306 22:13:19 4263974808:34] map: 0.0038136364482528004
[I 220306 22:15:00 4263974808:23] loss: 0.19098171355795845
[I 220306 22:15:15 4263974808:34] map: 0.004024329668795174
[I 220306 22:16:55 4263974808:23] loss: 0.18410418204656814
[I 220306 22:17:11 4263974808:34] map: 0.0028141686376750093
[I 220306 22:18:51 4263974808:23] loss: 0

KeyboardInterrupt: 