In [1]:
#Colab setting
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/My Drive/RecommanderSystems')

import os
os.chdir('/content/drive/My Drive/RecommanderSystems')

Mounted at /content/drive


In [3]:
from types import SimpleNamespace

config = SimpleNamespace(
    seed = 1,
    data = "Steam.txt", # Beauty.txt / ml-1m.txt
    dropout = 0.5, # 0.2 for ml-1m
    n = 50, # 300 for ml-1m, 50 for others
    d = 40,
    batch_size = 128,
    num_heads = 1, # default
    test_batch_size = 100,
    weight_decay = 5e-4,
    num_blocks = 2,
    lr = 1e-3,
    epoch = 200,
    patience = 20

)

## Dataset

SASRec의 경우 학습을 시킬 때 user 당 item의 시퀀스가 필요

In [4]:
from collections import defaultdict

user_item_sequence_dict = defaultdict(list)

u_max = -1
i_max = -1

data = open(config.data, 'r')
for line in data:
  u, i = line.strip().split(' ') # (user, item) 조합의 txt 파일 읽기
  u = int(u)
  i = int(i)
  user_item_sequence_dict[u].append(i)
  if u > u_max:
    u_max = u
  if i > i_max:
    i_max = i

user_seq = {}
user_train = {}
user_valid = {}
user_test = {}

for u in user_item_sequence_dict:
  seq = user_item_sequence_dict[u]
  l = len(seq)
  user_valid[u] = []
  user_test[u] = []
  if l < 3:
    padded_seq = [0] * (config.n - l) + seq
    user_train[u] = padded_seq

  elif l < config.n:
    # padding 필요
    padded_seq = [0] * (config.n - l) + seq
    user_train[u] = padded_seq[:-2]
    user_valid[u] = padded_seq[-2] # n - 1번째
    user_test[u] = padded_seq[-1] # n번째
  else:
    # l > config.n
    padded_seq = seq[-config.n:]
    user_train[u] = padded_seq[:-2]
    user_valid[u] = padded_seq[-2]
    user_test[u] = padded_seq[-1]
  user_seq[u] = padded_seq

print("u_max", u_max)
print("i_max", i_max)
print(len(padded_seq))

u_max 334730
i_max 13047
5


# Head

In [1]:
import torch
import torch.nn as nn
import math

class MultiheadAttention(nn.Module):
  def __init__(self, embed_dim, num_heads, kdim = None, vdim = None):
    """

    embed_dim: Dimensionality of the input and output embeddings.
    num_heads: Number of attention heads.

    """
    super().__init__()

    self.embed_dim = embed_dim
    self.num_heads = num_heads

    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim

    # 그냥 Q, K, V 사이즈 통일
    self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
    self.k_proj = nn.Linear(self.kdim, self.embed_dim)
    self.v_proj = nn.Linear(self.vdim, self.embed_dim)
    self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    # nn.init.uniform_(self.q_proj.bias)
    # nn.init.uniform_(self.k_proj.bias)
    nn.init.zeros_(self.q_proj.bias)
    nn.init.zeros_(self.k_proj.bias)
    nn.init.zeros_(self.v_proj.bias)
    nn.init.zeros_(self.out_proj.bias)

  def split_heads(self, x):
    """
    Head 늘리는 방식 중에 embedding 사이즈 줄여서 head수 늘리는 방식

    x : (batch_size, -1, embed_dim).

    Returns:
      (batch_size, num_heads, -1, embed_dim // num_heads).

    """
    n_batch = x.shape[0]
    splited = x.reshape(n_batch, -1, self.num_heads, self.embed_dim // self.num_heads)

    return splited.transpose(1, 2) # (B, num_heads, -1, embed_dim // num_heads)

  def scaled_dot_product_attention(self, wq, wk, wv, pad_mask = None):
    """

    wq, wk, wv: (B, num_heads, n_seq, embed_dim // num_heads).

    Returns:
      Scaled Dot-Product Attention  (B, n_seq, embed_dim).
      Average attention weights across heads (B, n_seq, n_key) -시각화에 사용

    """

    n_batch = wq.shape[0]
    d_k = self.embed_dim // self.num_heads

    """
    wq @ wk.T: (B, num_heads, n_seq, n_key)
    pad_mask: (B, n_key)
    softmax @ v: (B, num_heads, n_seq, embed_dim // num_heads)
    """
    print("wq (B, num_heads, n_seq, embed_dim)", wq.shape)
    wk_t = wk.transpose(2, 3) # (B, num_heads, embed_dim // num_heads, n_key)
    print("wk_t", wk_t.shape, "wq", wq.shape)
    similarity = torch.matmul(wq, wk_t) # (B, num_heads, n_seq, n_key)
    similarity /= math.sqrt(d_k)
    epsilon = -1e9

    print("similarity", similarity.shape)

    if pad_mask is not None:
      similarity = similarity.masked_fill(pad_mask.unsqueeze(1).unsqueeze(2), epsilon) # (B, 1, 1, n_key)
    print("similarity", similarity.shape)
    softmax_sim = torch.softmax(similarity, dim = 3) # key마다 얼마나 비슷 -> softmax (B, num_heads, n_seq, n_key)
    average_att_w = torch.mean(softmax_sim, dim = 1) # head 마다 (B, n_seq, n_key) 평균

    # average_att_w 찍어보면 각 key 마다의 활성화정도 시각화 가능

    attention_value = torch.matmul(softmax_sim, wv) # (B, num_heads, n_seq, embed_dim // num_heads)
    attention_value = attention_value.transpose(1, 2) # (B, n_key, num_heads, embed_dim // num_heads)
    # 사이즈 원상 복구 근데 이게 맞나??? 좀 이상함,,
    attention_value = attention_value.reshape(n_batch, -1, self.embed_dim) # (B, n_key, embed_dim)
    print("attention_value", attention_value.shape)
    return attention_value, average_att_w

  def forward(self, q, k, v, pad_mask = None):
    """
    q: (B, n_seq, embed_dim)
    k: (B, n_key, kdim)
    v: (B, n_key, vdim)
    pad_mask: (B, n_key)

    Returns:
      output (B, -1, embed_dim)
      average_att_w: (B, n_seq, n_key)

    """
    print("Multihead input q", q.shape)
    wq = self.q_proj(q) #(B, n_seq, embed_dim)
    wk = self.k_proj(k) #(B, n_key, kdim)
    wv = self.v_proj(v) # #(B, n_key, embed_dim)

    wk = self.split_heads(wk)
    wq = self.split_heads(wq)
    wv = self.split_heads(wv)

    x, attn = self.scaled_dot_product_attention(wq, wk, wv, pad_mask)
    x = self.out_proj(x) # (B, n_key, embed_dim)
    print("x, attn", x.shape, attn.shape)
    return x, attn

class FeedForward(nn.Module):
  '''
  RELU(SW(1)+b(1))W(2) + b(2)
  W(1) & W(2) : (d, d)

  '''
  def __init__(self, embedding_dim, dropout_rate = 0.5):
    super().__init__()
    self.linear1 = nn.Linear(embedding_dim, embedding_dim) #(d, d)
    self.linear2 = nn.Linear(embedding_dim, embedding_dim)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=dropout_rate)

  def forward(self, x):
    x_ = self.linear1(x_)
    x_ = self.dropout(x_) # 이게 중간에 들어가는지 의문,,
    x_ = self.relu(x_)
    x_ = self.linear2(x_)
    x_ = x + self.dropout(x_)

    return x_


In [39]:
pad_mask = torch.tensor([[False, True], [True, False]])
print(pad_mask.shape)
similarity = torch.tensor([[[[1, 2]], [[3, 4]]], [[[5, 6]], [[7, 8]]]]) # (2, 2, 1, 2)
print(similarity.shape)
print(similarity)
s =similarity.masked_fill(pad_mask.unsqueeze(1).unsqueeze(2), 1e-9) # (B, 1, 1, n_key)
print(s)

torch.Size([2, 2])
torch.Size([2, 2, 1, 2])
tensor([[[[1, 2]],

         [[3, 4]]],


        [[[5, 6]],

         [[7, 8]]]])
tensor([[[[1, 0]],

         [[3, 0]]],


        [[[0, 6]],

         [[0, 8]]]])


# Encoder Block

In [2]:
class EncoderBlock(nn.Module):
  def __init__(self, embedding_dim, num_heads, dropout_rate = 0.5):
    super().__init__()

    self.attention = MultiheadAttention(embed_dim = embedding_dim, num_heads = num_heads)
    self.feedforward = FeedForward(embedding_dim)
    self.layernorm = nn.LayerNorm(embedding_dim, eps = 1e-8)
    self.dropout = nn.Dropout(p=dropout_rate)

  def forward(self, x, mask = None):
    '''
    Residual connection
    Q, K, V로 모두 같은 값 사용

    g(x) = x + Dropout(g(LayerNorm(x)))
    '''
    # Self attention Layer
    print("Encoder block input x", x.shape)
    x_ = self.layernorm(x)
    res1, attn = self.attention.forward(x_, x_, x_, pad_mask = mask)
    print("res1", res1.shape)
    res1 = self.dropout(res1)

    x = x + res1

    # FFN
    x_ = self.layernorm(x)
    res2 = self.feedforward(x_)
    res2 = self.dropout(res2)
    x = x + res2

    return x, attn

# SASRec model

In [5]:
import torch
import torch.nn as nn
import numpy as np
u_max =334730
i_max =13047
class SASRec(nn.Module):
  def __init__(self, config, user_num = u_max, item_num = i_max):
    super(SASRec, self).__init__()

    self.user_num = user_num
    self.item_num = item_num
    self.config = config
    self.num_blocks = config.num_blocks
    self.embedding_dim = self.config.d
    self.num_heads = config.num_heads

    print("user_num:", user_num) # maybe batch size
    print("item_num:", item_num)
    print("embedding_dim", self.embedding_dim)
    print("num_heads:", self.num_heads)
    print("n:", config.n)

    # M : item embedding (N X d) 인데 item index는 1 부터 시작 => + 1해주기
    self.M = torch.nn.Embedding(self.item_num + 1, self.config.d, padding_idx = 0) # 패딩은 0으로

    self.pos_enc = nn.Parameter(torch.randn(self.config.n, self.config.d)) # learnable PE (n, d)
    # nn.Embedding(self.config.n, self.config.d)
    self.droptout = nn.Dropout(self.config.dropout)
    self.encoders = nn.ModuleList()

    for block in range(self.num_blocks):
      self.encoders.append(EncoderBlock(self.embedding_dim, self.num_heads))

    self.last_layernorm = torch.nn.LayerNorm(self.embedding_dim, eps=1e-8)

  def forward(self, x):
    '''
     x: (n_batch, seq_length = config.n - 2)
     valid랑 test용 빠진 애들 들어오고
    '''
    print("x shape", x.shape)
    embedded_seq = self.M(torch.LongTensor(x)) # id는 정수들의 sequence (n_batch, config.n, embedding_dim)
    print("embedded_seq", embedded_seq.shape)
    embedded_seq = embedded_seq + self.pos_enc
    pad_mask = torch.BoolTensor(x == 0)

    # embedded_seq = embedded_seq.masked_fill(pad_mask.unsqueeze(-1))

    # future item 가리기
    attention_mask = ~torch.tril(torch.ones((x.shape[1], x.shape[1]), dtype = torch.bool))

    for encoder in self.encoders:
      embedded_seq = embedded_seq.masked_fill(pad_mask.unsqueeze(-1), 0)
      embedded_seq, attn = encoder(embedded_seq, attention_mask)

    output = embedded_seq # (B, n_key, embed_dim)
    print("output", output.shape)
    # positive sequence : left shifted
    seq = np.zeros([self.config.n], dtype = np.int32)
    pos_seq = np.zeros([self.config.n], dtype = np.int32)
    neg_seq = np.zeros([self.config.n], dtype = np.int32)

    for u in range(x.shape[0]):
      pos_seq[2:] = x[u][:-2] # config.n

    pos_embs = self.M(torch.LongTensor(pos_seq))
    neg_embs = self.M(torch.LongTensor(neg_seq))


In [7]:
embed_dim = 6
num_heads = 3
kdim = 3
vdim = 3

from types import SimpleNamespace

my_attention = MultiheadAttention(
    embed_dim = embed_dim,
    num_heads = num_heads,
    kdim = kdim,
    vdim = vdim )

print(my_attention)

my_config = SimpleNamespace(
    seed = 1,
    data = "Steam.txt", # Beauty.txt / ml-1m.txt
    dropout = 0.5, # 0.2 for ml-1m
    n = 5, # 300 for ml-1m
    d = 3,
    batch_size = 2,
    test_batch_size = 1,
    weight_decay = 5e-4,
    num_blocks = 2,
    lr = 1e-3,
    epoch = 20,
    patience = 20,
    num_heads =3

)
my_sasrec = SASRec(
    my_config
)
us = [1, 2, 3, 4, 5]
my_sasrec.forward(torch.tensor([us]))


MultiheadAttention(
  (q_proj): Linear(in_features=6, out_features=6, bias=True)
  (k_proj): Linear(in_features=3, out_features=6, bias=True)
  (v_proj): Linear(in_features=3, out_features=6, bias=True)
  (out_proj): Linear(in_features=6, out_features=6, bias=True)
)
user_num: 334730
item_num: 13047
embedding_dim 3
num_heads: 3
n: 5
x shape torch.Size([1, 5])
embedded_seq torch.Size([1, 5, 3])
Encoder block input x torch.Size([1, 5, 3])
Multihead input q torch.Size([1, 5, 3])
wq (B, num_heads, n_seq, embed_dim) torch.Size([1, 3, 5, 1])
wk_t torch.Size([1, 3, 1, 5]) wq torch.Size([1, 3, 5, 1])
similarity torch.Size([1, 3, 5, 5])
similarity torch.Size([5, 3, 5, 5])
attention_value torch.Size([1, 25, 3])
x, attn torch.Size([1, 25, 3]) torch.Size([5, 5, 5])
res1 torch.Size([1, 25, 3])


RuntimeError: The size of tensor a (5) must match the size of tensor b (25) at non-singleton dimension 1

AttributeError: 'list' object has no attribute 'shape'

In [21]:
print(user_seq[1])

[2570, 3240, 3634, 3838, 4178, 4688, 4895, 5030, 5185, 5249, 5251, 5762, 6166, 6589, 6775, 6853, 7151, 7203, 7225, 7262, 8326, 8432, 9074, 9594, 9697, 9793, 9971, 9975, 10361, 10367, 10789, 11025, 11102, 11292, 11349, 11407, 11535, 11771, 12047, 12222, 12483, 12891, 12920, 1563, 2832, 4230, 4528, 9584, 10084, 8112]
