In [1]:
from collections import defaultdict
import time
import os
import random
import requests
import tqdm

from IPython.core.debugger import set_trace
import numpy as np
import pandas as pd
from ranger import Ranger
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.utils.data as td
import torch.optim as to

In [2]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Torch device every network is moved to.
device = 'cpu'

# Directory where the downloaded data files are stored.
data_dir = "data"

# File names of the training ratings and of the test positives/negatives.
train_rating = "ml-1m.train.rating"
test_negative = "ml-1m.test.negative"

# Negatives drawn per training positive / negatives listed per test line.
train_negative_samples = 4
test_negative_samples = 99

# Model sizes (a hidden size of 32 was also tried).
embedding_dim = 64
hidden_dim = 128
N = 10  # memory size for state_repr

# Training config (a batch size of 512 was also tried).
batch_size = 16
top_k = 10

## Data


This notebook uses the MovieLens 1M data files from the Neural Collaborative Filtering repository: https://github.com/hexiangnan/neural_collaborative_filtering

In [3]:
# Download the MovieLens 1M train/test files into `data_dir`, skipping any file
# that is already present.
os.makedirs(data_dir, exist_ok=True)

for file_name in [train_rating, test_negative]:
    file_path = os.path.join(data_dir, file_name)
    if os.path.exists(file_path):
        print("Skip loading " + file_name)
        continue

    print("Load " + file_name)
    r = requests.get(
        "https://raw.githubusercontent.com/hexiangnan/neural_collaborative_filtering/master/Data/" + file_name,
        timeout=60,
    )
    # Fail loudly on an HTTP error instead of saving an error page as data.
    r.raise_for_status()

    # Write to a temporary name and rename afterwards, so an interrupted
    # download never leaves a truncated file that a later run would skip.
    partial_path = file_path + ".part"
    with open(partial_path, "wb") as tf:
        tf.write(r.content)
    os.replace(partial_path, file_path)

Skip loading ml-1m.train.rating
Skip loading ml-1m.test.negative


In [4]:
def preprocess_train(path=None):
    """Load the training ratings and keep only the positive interactions.

    Only rows with a rating strictly greater than 3 are kept as positives.

    Args:
        path: Tab-separated ratings file whose first three columns are
            user, item and rating. Defaults to
            ``os.path.join(data_dir, train_rating)``.

    Returns:
        A tuple ``(train_data, train_mat, user_num, item_num)`` where
        ``train_data`` is a list of ``[user, item]`` pairs (duplicates kept),
        ``train_mat`` is a ``(user_num, item_num)`` float32 DOK matrix holding
        1.0 for every positive pair, and ``user_num`` / ``item_num`` are the
        largest positive user / item id plus one.

    Raises:
        ValueError: If the file contains no rating greater than 3.
    """
    if path is None:
        path = os.path.join(data_dir, train_rating)

    ratings = pd.read_csv(path, sep='\t', header=None,
                          names=['user', 'item', 'rating'], usecols=[0, 1, 2],
                          dtype={0: np.int32, 1: np.int32, 2: np.int8})

    positives = ratings.loc[ratings['rating'] > 3, ['user', 'item']]
    if positives.empty:
        raise ValueError("no positive interactions (rating > 3) found in " + str(path))

    user_num = int(positives['user'].max()) + 1
    item_num = int(positives['item'].max()) + 1

    # Build the matrix through COO instead of poking entries into the DOK
    # object with dict.update(), which depends on dok_matrix being a dict
    # subclass (an implementation detail). Pairs are de-duplicated first
    # because COO -> DOK conversion would otherwise sum repeated entries.
    unique_pairs = positives.drop_duplicates()
    train_mat = sp.coo_matrix(
        (
            np.ones(len(unique_pairs), dtype=np.float32),
            (unique_pairs['user'].to_numpy(), unique_pairs['item'].to_numpy()),
        ),
        shape=(user_num, item_num),
        dtype=np.float32,
    ).todok()

    train_data = positives.values.tolist()
    return train_data, train_mat, user_num, item_num

# Positive (user, item) pairs, their DOK matrix, and the embedding table sizes.
train_data, train_mat, user_num, item_num = preprocess_train()

In [5]:
def preprocess_test(path=None, negative_samples=None):
    """Read the test file into a flat list of ``[user, item]`` pairs.

    Each non-blank line holds a ``(user, positive_item)`` tuple followed by
    tab-separated negative item ids. For every line the positive pair is
    emitted first, followed by one pair per negative, in file order.

    Args:
        path: File to read. Defaults to
            ``os.path.join(data_dir, test_negative)``.
        negative_samples: Number of negatives expected on every line.
            Defaults to the notebook-level ``test_negative_samples``.

    Returns:
        A list of ``[user, item]`` pairs.

    Raises:
        AssertionError: If a line does not contain exactly
            ``negative_samples + 1`` tab-separated fields.
    """
    # Local import keeps this cell self-contained.
    import ast

    if path is None:
        path = os.path.join(data_dir, test_negative)
    if negative_samples is None:
        negative_samples = test_negative_samples

    test_data = []
    with open(path) as tnf:
        for line in tnf:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            parts = line.split('\t')
            assert len(parts) == negative_samples + 1, \
                "expected %d fields, got %d" % (negative_samples + 1, len(parts))
            # literal_eval parses the "(user,item)" tuple without executing
            # arbitrary code from the downloaded file, unlike eval().
            user, positive = ast.literal_eval(parts[0])
            test_data.append([user, positive])
            test_data.extend([user, int(negative)] for negative in parts[1:])

    return test_data

# Flat list of [user, item] pairs: each user's positive followed by its negatives.
valid_data = preprocess_test()

In [6]:
class MLDataset(td.Dataset):
    """Dataset of (user, item, label) rows with optional negative sampling.

    ``positive_data`` is a list of ``[user, item]`` pairs. When
    ``negative_samples`` is greater than zero, every call to ``reset`` draws
    that many random items per pair that are not keys of ``positive_mat`` and
    labels them 0, while the given pairs are labelled 1. When it is zero (or
    negative) the given pairs are used as they are and all labelled 0.
    """

    def __init__(self, positive_data, item_num, positive_mat, negative_samples=99):
        super().__init__()
        self.positive_data = positive_data
        self.item_num = item_num
        self.positive_mat = positive_mat
        self.negative_samples = negative_samples

        # Build the first version of `self.data` right away.
        self.reset()

    def reset(self):
        """Rebuild ``self.data`` as an array of (user, item, label) rows."""
        print("Resetting dataset")
        positives = self.positive_data
        if self.negative_samples > 0:
            negatives = self.sample_negatives()
            rows = positives + negatives
            labels = [1] * len(positives) + [0] * len(negatives)
        else:
            rows = positives
            labels = [0] * len(positives)

        label_column = np.array(labels).reshape(-1, 1)
        self.data = np.concatenate([np.array(rows), label_column], axis=1)

    def sample_negatives(self):
        """Draw ``negative_samples`` unseen items for every positive pair.

        Returns:
            A list of ``[user, item]`` pairs, grouped in the order of
            ``positive_data``.
        """
        sampled = []
        for user, _ in self.positive_data:
            accepted = 0
            while accepted < self.negative_samples:
                candidate = np.random.randint(self.item_num)
                if (user, candidate) in self.positive_mat:
                    # Known positive for this user: draw again.
                    continue
                sampled.append([user, candidate])
                accepted += 1
        return sampled

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict with keys "user", "item" and "label"."""
        user_id, item_id, target = self.data[idx]
        return {
            "user": user_id,
            "item": item_id,
            "label": np.float32(target),
        }

class SamplerWithReset(td.RandomSampler):
    """Random sampler that calls ``reset()`` on its data source per iteration."""

    def __iter__(self):
        dataset = self.data_source
        # Refresh the dataset before a new random index order is handed out.
        dataset.reset()
        index_iterator = super().__iter__()
        return index_iterator

In [7]:
# Training data: every positive pair plus `train_negative_samples` freshly
# sampled negatives. The custom sampler re-samples the negatives whenever a new
# iterator is created and fully determines the (random) order, so no `shuffle`
# argument is passed.
train_dataset = MLDataset(
    train_data,
    item_num,
    train_mat,
    train_negative_samples
)
train_loader = td.DataLoader(
    train_dataset,
    batch_size=1,
    num_workers=8,
    sampler=SamplerWithReset(train_dataset)
)

# `valid_data` already lists, for every user, the held-out positive followed by
# its pre-sampled negatives, so no additional negatives must be drawn here.
# With negative_samples=0 the dataset keeps the rows as they are (all labels
# are 0); each batch of `test_negative_samples + 1` rows then corresponds to
# one user, with the positive item in the first row.
valid_dataset = MLDataset(valid_data, item_num, train_mat, negative_samples=0)
valid_loader = td.DataLoader(
    valid_dataset,
    batch_size=test_negative_samples + 1,
    shuffle=False,
    num_workers=0
)

Resetting dataset
Resetting dataset


In [8]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state) tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        # Index of the slot the next transition is written to.
        self.position = 0

    def push(self, state, action, reward, next_state):
        """Store a transition, overwriting the oldest one once the buffer is full."""
        transition = (state, action, reward, next_state)
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and stack them field by field.

        Returns:
            Four arrays ``(state, action, reward, next_state)``, each stacked
            along a new leading axis of length ``batch_size``.
        """
        chosen = random.sample(self.buffer, batch_size)
        state, action, reward, next_state = (np.stack(field) for field in zip(*chosen))
        return state, action, reward, next_state

    def __len__(self):
        return len(self.buffer)

In [9]:
def _to_flat_array(values):
    """Return a scalar, sequence, ndarray or torch tensor as a 1-D numpy array."""
    if hasattr(values, "detach"):
        # Torch tensor: drop autograd tracking and move to host memory first.
        values = values.detach().cpu().numpy()
    return np.atleast_1d(np.asarray(values)).ravel()


def update_item_representation(memory, user, new_item, label):
    """Append ``new_item`` to a user's item history when ``label`` is truthy.

    The user's row of ``memory`` is shifted left by one slot (dropping the
    oldest entry) and ``new_item`` is written into the last slot. Rows whose
    label is falsy are left untouched. ``memory`` is modified in place.

    Args:
        memory: 2-D numpy array indexed as ``memory[user, slot]``.
        user: User id, or a batch of user ids (sequence, ndarray or tensor).
        new_item: Item id(s), broadcastable against ``user``.
        label: Flag(s) deciding whether the update happens, broadcastable
            against ``user``.
    """
    users, items, labels = np.broadcast_arrays(
        _to_flat_array(user), _to_flat_array(new_item), _to_flat_array(label)
    )
    # Entries are processed in order, so a user that occurs several times in
    # one batch receives its items one after another.
    for user_id, item_id, flag in zip(users, items, labels):
        if flag:
            row = int(user_id)
            memory[row] = np.append(memory[row, 1:], item_id)

## Model

In [10]:
class State_Repr_Module(nn.Module):
    """Builds a state vector from a user embedding and the user's item history.

    The state is the concatenation of the user embedding, the element-wise
    product of the user embedding with the averaged history embedding, and the
    averaged history embedding itself, i.e. ``3 * embedding_dim`` features.

    Args:
        user_num: Number of rows of the user embedding table.
        item_num: Number of rows of the item embedding table.
        embedding_dim: Size of every embedding vector.
        hidden_dim: Accepted for signature compatibility; not used.
    """

    def __init__(self, user_num, item_num, embedding_dim, hidden_dim):
        super().__init__()
        self.user_embeddings = nn.Embedding(user_num, embedding_dim)
        self.item_embeddings = nn.Embedding(item_num, embedding_dim)
        # Averages over the history axis (pool to length 1).
        self.drr_ave = nn.AdaptiveAvgPool1d(1)

        self.initialize()

    def initialize(self):
        """Xavier-initialise both embedding tables."""
        nn.init.xavier_uniform_(self.user_embeddings.weight)
        nn.init.xavier_uniform_(self.item_embeddings.weight)

    def forward(self, user, item, memory):
        """Compute the state representation for a batch of users.

        Args:
            user: 1-D integer tensor of user ids.
            item: Accepted for signature compatibility; not used.
            memory: 2-D numpy array indexed as ``memory[user, slot]`` holding
                item ids, where -1 marks an empty slot.

        Returns:
            Tensor of shape ``(len(user), 3 * embedding_dim)``.
        """
        user_embedding = self.user_embeddings(user)

        # Use the module's own device rather than a notebook-level global.
        table_device = self.item_embeddings.weight.device
        history = torch.as_tensor(
            memory[user.detach().cpu().numpy()], device=table_device
        ).long()

        # Empty slots (-1) contribute a zero vector: look up a valid dummy id
        # for them and mask the result out afterwards.
        is_filled = history != -1
        lookup_ids = torch.where(is_filled, history, torch.zeros_like(history))
        history_embeddings = self.item_embeddings(lookup_ids)
        history_embeddings = history_embeddings * is_filled.unsqueeze(-1).to(history_embeddings.dtype)

        # (batch, slots, dim) -> (batch, dim, slots) so pooling runs over slots.
        drr_ave = self.drr_ave(history_embeddings.permute((0, 2, 1))).squeeze(-1)

        return torch.cat((user_embedding, user_embedding * drr_ave, drr_ave), 1)

In [11]:
class Actor_DRR(nn.Module):
    """Actor network mapping a state vector to an action embedding.

    Args:
        user_num: Accepted for signature compatibility; not used.
        item_num: Accepted for signature compatibility; not used.
        embedding_dim: Size of the produced action embedding; the expected
            state size is ``3 * embedding_dim``.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, user_num, item_num, embedding_dim, hidden_dim):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(embedding_dim * 3, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim)
        )

        self.initialize()

    def initialize(self):
        """Xavier-initialise the linear weights and zero their biases."""
        for layer in self.layers:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                layer.bias.data.zero_()

    def forward(self, state):
        """Return the action embedding, squashed into [-1, 1] by tanh."""
        action_embedding = torch.tanh(self.layers(state))
        return action_embedding

    def get_action(self, state, state_repr, items=None):
        """Pick, for every state, the candidate item with the highest score.

        The score of an item is the dot product of its embedding (taken from
        ``state_repr.item_embeddings``) with the action embedding.

        Args:
            state: Batch of state vectors, shape ``(batch, 3 * embedding_dim)``.
            state_repr: Object exposing an ``item_embeddings`` embedding table.
            items: 1-D tensor of candidate item ids. Defaults to every row of
                the item embedding table.

        Returns:
            1-D tensor with one entry per state: the position of the best
            candidate within ``items`` (equal to the item id when ``items`` is
            left at its default).
        """
        item_table = state_repr.item_embeddings
        if items is None:
            # Derive the candidates from the table itself instead of a
            # notebook-level global captured at class-definition time.
            items = torch.arange(item_table.num_embeddings, device=item_table.weight.device)

        # (num_candidates, dim) @ (dim, batch) -> (num_candidates, batch)
        scores = torch.matmul(item_table(items), self(state).T)
        return scores.argmax(0)

In [12]:
class Critic_DRR(nn.Module):
    """Critic network scoring a (state, action) pair with a single value.

    Args:
        state_repr_dim: Number of state features.
        action_emb_dim: Number of action-embedding features.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, state_repr_dim, action_emb_dim, hidden_dim):
        super().__init__()

        joint_dim = state_repr_dim + action_emb_dim
        self.linear1 = nn.Linear(joint_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        self.initialize()

    def initialize(self, init_w=3e-3):
        """Draw the output layer's weight and bias uniformly from [-init_w, init_w]."""
        for parameter in (self.linear3.weight, self.linear3.bias):
            parameter.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        """Return one value per row of the concatenated state and action."""
        joint = torch.cat((state, action), dim=1)
        hidden = F.relu(self.linear1(joint))
        return self.linear3(hidden)

## Training

In [14]:
# State encoder, critic and actor. All of them live on the same `device`.
state_repr = State_Repr_Module(user_num, item_num, embedding_dim, hidden_dim).to(device)
value_net = Critic_DRR(embedding_dim * 3, embedding_dim, hidden_dim).to(device)
policy_net = Actor_DRR(user_num, item_num, embedding_dim, hidden_dim).to(device)

# Target networks start as exact copies of the online networks.
target_value_net = Critic_DRR(embedding_dim * 3, embedding_dim, hidden_dim).to(device)
target_policy_net = Actor_DRR(user_num, item_num, embedding_dim, hidden_dim).to(device)

target_value_net.load_state_dict(value_net.state_dict())
target_policy_net.load_state_dict(policy_net.state_dict())

value_criterion = nn.MSELoss()
# Only the critic and the actor are optimised; `state_repr` has no optimizer.
value_optimizer = Ranger(value_net.parameters(), lr=1e-4)
policy_optimizer = Ranger(policy_net.parameters(), lr=1e-4)

In [14]:
def ddpg_update(batch_size=16,
                gamma=0.6,
                min_value=-np.inf,
                max_value=np.inf,
                soft_tau=1e-2):
    """Run one DDPG step on a batch sampled from the global replay buffer.

    Uses the notebook-level ``replay_buffer``, online and target networks,
    optimizers, ``value_criterion`` and ``device``.

    Args:
        batch_size: Number of transitions sampled from ``replay_buffer``.
        gamma: Discount factor applied to the target critic's value.
        min_value: Lower clamp for the critic target.
        max_value: Upper clamp for the critic target.
        soft_tau: Interpolation weight of the online parameters in the
            soft update of the target networks.
    """
    def _soft_update(target_net, online_net):
        # target <- (1 - tau) * target + tau * online, parameter by parameter.
        for target_param, param in zip(target_net.parameters(), online_net.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

    transitions = replay_buffer.sample(batch_size)
    state, action, reward, next_state = (
        torch.FloatTensor(field).to(device) for field in transitions
    )

    # Actor objective: maximise the critic's value of the actor's own action.
    actor_loss = -value_net(state, policy_net(state)).mean()

    # Critic objective: regress onto the clamped one-step target built from
    # the target networks; the target is treated as a constant.
    next_action = target_policy_net(next_state)
    target_q = target_value_net(next_state, next_action.detach())
    td_target = torch.clamp(reward + gamma * target_q, min_value, max_value)
    critic_loss = value_criterion(value_net(state, action), td_target.detach())

    policy_optimizer.zero_grad()
    actor_loss.backward()
    policy_optimizer.step()

    # zero_grad() here also discards the critic gradients that the actor
    # loss's backward pass accumulated.
    value_optimizer.zero_grad()
    critic_loss.backward()
    value_optimizer.step()

    _soft_update(target_value_net, value_net)
    _soft_update(target_policy_net, policy_net)

In [16]:
# Replay buffer for the DDPG updates.
replay_buffer_size = 10000
replay_buffer = ReplayBuffer(replay_buffer_size)

# Per-user history of the last N items; -1 marks an empty slot.
memory = np.full([user_num, N], -1.0)

# Fetch the first batch and encode it as the initial state.
it = iter(train_loader)
batch = next(it)
user, item, label = (batch[key] for key in ("user", "item", "label"))
state = state_repr(user, item, memory)

Resetting dataset


In [18]:
# Interaction / training loop: act on the current state, observe the reward,
# store the transition and run a DDPG update once enough samples are buffered.
preds = []

for j in tqdm.tqdm(range(1000)):
    action_emb = policy_net(state)
    action = policy_net.get_action(state, state_repr)
    preds.append(action)

    # Reward is the train-matrix entry of (user, recommended item): 1.0 when
    # the recommended item is a known positive of the user, else 0.0.
    users_np = user.detach().cpu().numpy()
    actions_np = action.detach().cpu().numpy()
    reward = torch.tensor(train_mat[list(users_np), list(actions_np)].toarray()[0])

    # Update the history one sample at a time with plain Python scalars; the
    # helper's list-based row update is only well defined for scalar inputs.
    # NOTE(review): the item appended is the batch's `item`, not the
    # recommended `action` -- confirm this is intended.
    for u, i, r in zip(user, item, reward):
        update_item_representation(memory, int(u), int(i), float(r))

    # NOTE(review): the next batch comes from the random sampler, so
    # `next_state` presumably belongs to a different user than `state`.
    next_batch = next(it)
    user, item, label = next_batch["user"], next_batch["item"], next_batch["label"]
    next_state = state_repr(user, item, memory)

    # Only the first sample of each batch is stored (the loader batch size is 1).
    replay_buffer.push(state.detach().cpu().numpy()[0], action_emb.detach().cpu().numpy()[0],
                       reward, next_state.detach().cpu().numpy()[0])
    if len(replay_buffer) > batch_size:
        # Keep the update's batch size in sync with the configured one.
        ddpg_update(batch_size=batch_size)

    state = next_state

100%|██████████| 1000/1000 [00:07<00:00, 132.29it/s]


## Predictions debug

In [19]:
# Draw one batch of 16 training samples to inspect what the trained policy
# recommends for them.
it2 = iter(
    td.DataLoader(
        train_dataset,
        sampler=SamplerWithReset(train_dataset),
        batch_size=16,
        shuffle=False,
        num_workers=8,
    )
)
batch = next(it2)
user, item, label = (batch[key] for key in ("user", "item", "label"))
state = state_repr(user, item, memory)

Resetting dataset


In [20]:
user

tensor([2640, 3840, 3107, 4841, 5949, 2763,  686, 2592, 5442, 4020, 3723, 1604,
        2914, 3894, 3329, 3386])

In [21]:
# Recommended item for every state of the debug batch.
action = policy_net.get_action(state, state_repr)

In [22]:
action

tensor([2859, 2859, 2859, 2859,  856, 2859, 2859, 2859, 2859,  856,  856, 2859,
        2859, 2859, 2859,  856])

Модель постепенно сходится к одинаковым предсказаниям. 
Это сохраняется при уменьшении `lr` (число итераций до сходимости увеличивается пропорционально уменьшению `lr`).