# Replacing User Latent Vector With Behavioural Sequence RNN Output For CF Recommendation

##### For the situations that we * don't have fresh user embedding* 

#### Good side of user embedding
The latent space empowers the model to featurize the characters of user, entirely by learning.
#### Bad side of user embedding
Major problems of user embedding is always around the new user:

* We train a model with [latent cf + neural network](3.1.3_recommender_system.ipynb)
* Usually we won't retrain a model within the same day.
* If the new user some with his/her preference record, we have to train it in a way within the consistency of the old user's latent vector
* So we can't really apply the adavanced model to the fresh users.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# %cd /data
# !!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !!unzip ml-latest-small.zip

In [3]:
%ls /data/ml-latest-small/

README.txt   links.csv    movies.csv   ratings.csv  tags.csv


In [42]:
DATA = "/data/ml-latest-small/"
DIM = 100 # dimension for embedding
SEQ_LEN = 19

In [5]:
files = os.listdir(DATA)
files

['links.csv', 'tags.csv', 'ratings.csv', 'README.txt', 'movies.csv']

In [6]:
data = dict()
for f in files:
    if f[-3:]=="csv":
        data[f.split(".")[0]] = pd.read_csv(DATA+f)

### Check the data

In [7]:
from IPython.display import display
list(display(k,v.sample(5)) for k,v in data.items())

'links'

Unnamed: 0,movieId,imdbId,tmdbId
6733,53996,418279,1858.0
3371,4232,227538,10054.0
2038,2542,120735,100.0
2959,3705,99141,1727.0
7732,84187,860906,22843.0


'tags'

Unnamed: 0,userId,movieId,tag,timestamp
915,547,2936,afi,1182393939
434,364,6539,sword fight,1444529953
864,546,48780,based on a book,1301715225
417,364,4973,quirky,1444528852
1193,547,112070,tivo,1476113970


'ratings'

Unnamed: 0,userId,movieId,rating,timestamp
61910,452,2112,3.0,976420133
35925,260,4995,2.5,1207985091
52106,380,89470,4.0,1344290433
94093,624,3596,1.0,1044272089
4743,24,380,4.0,849282414


'movies'

Unnamed: 0,movieId,title,genres
5681,25841,Stage Door (1937),Drama
1243,1562,Batman & Robin (1997),Action|Adventure|Fantasy|Thriller
3479,4397,Cannonball Run II (1984),Action|Comedy
3901,5038,"Flight of Dragons, The (1982)",Adventure|Animation|Children|Drama|Fantasy
2532,3148,"Cider House Rules, The (1999)",Drama


[None, None, None, None]

In [8]:
rate_df = data["ratings"]
len(rate_df)

100004

In [9]:
userId = list(set(data["ratings"]["userId"]))
movieId = list(set(data["ratings"]["movieId"]))
print(len(userId),len(movieId))

671 9066


In [10]:
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

In [11]:
rate_df["movieIdx"] = rate_df.movieId.apply(lambda x:m2i[x]).astype(int)
rate_df["userIdx"] = rate_df.userId.apply(lambda x:u2i[x]).astype(int)
rate_df["rating"] = rate_df["rating"]/5

In [12]:
gb = rate_df.sort_values(by=["userIdx","timestamp"]).groupby("userIdx")

gb.apply(lambda x:x.sample(n = 20, replace = False))

Unnamed: 0_level_0,Unnamed: 1_level_0,userId,movieId,rating,timestamp,movieIdx,userIdx
userIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,6,1,1287,0.4,1260759187,1149,0
0,11,1,1405,0.2,1260759203,1260,0
0,10,1,1371,0.5,1260759135,1226,0
0,18,1,2968,0.2,1260759200,2616,0
0,3,1,1129,0.4,1260759185,1004,0
0,0,1,31,0.5,1260759144,30,0
0,12,1,1953,0.8,1260759191,1697,0
0,2,1,1061,0.6,1260759182,949,0
0,8,1,1339,0.7,1260759125,1195,0
0,17,1,2455,0.5,1260759113,2169,0


In [13]:
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

In [14]:
pick_k = np.array([0]*19 +[1])==1

In [23]:
def sample_split(x):
    sampled = x.sample(n = 20, replace = False)
    seq = sampled.head(19).sort_values(by="timestamp")
    y = sampled[pick_k]
    return pd.concat([seq,y])

class rnn_record(Dataset):
    def __init__(self, gb):
        self.gb = gb
        self.make_seq()
    
    def make_seq(self):
        self.all_seq = self.gb.apply(sample_split)
        
    def __len__(self):
        return len(self.gb)
        
    def __getitem__(self,idx):
        df = self.all_seq.loc[idx]
        seq = df.head(19)[["movieIdx","rating"]].values
        targ = df[pick_k][["movieIdx","rating"]].values
        targ_v, targ_y =targ[:,0], targ[:,1]
        return idx,seq,targ_v,targ_y
ds = rnn_record(gb)

In [16]:
# dl = DataLoader(ds,batch_size=32,shuffle=True)
# gen = iter(dl)

# idx,seq,targ_v,targ_y = next(gen)

# idx.size(),seq.size(),targ_v.size(),targ_y.size()

### Model

In [17]:
import torch
from torch import nn
from torch.nn import functional as F

In [36]:
class mLinkNet(nn.Module):
    def __init__(self, hidden_size,v_size):
        super(mLinkNet,self).__init__()
        self.hidden_size = hidden_size
        self.v_size = v_size
        self.emb = nn.Embedding(v_size,hidden_size)
        
        self.rnn = nn.GRU(input_size = self.hidden_size+1,
                          hidden_size= hidden_size+1,
                          num_layers=1,
                          batch_first = True,
                          dropout=0)
        
        self.mlp = nn.Sequential(*[
            nn.Dropout(.3),
            nn.Linear(hidden_size*2 + 1, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.Linear(256,1,bias=False),
            nn.Sigmoid(),
        ])
    
    def forward(self,seq,targ_v):
        seq_vec = torch.cat([self.emb(seq[:,0].long()),
                             seq[:,1].unsqueeze(-1).float()], dim=2)
        output, hn = self.rnn(seq_vec)
#         print(hn.size(),self.emb(targ_v.long()).size())
        x = torch.cat([hn.squeeze(0),self.emb(targ_v.long()).squeeze(1)],dim=1)
        return self.mlp(x)

In [37]:
mln = mLinkNet(hidden_size = DIM, v_size = len(movieId))

# mln(seq, targ_v)

In [38]:
from torch.optim import Adam
from ray.matchbox import Trainer
opt = Adam(mln.parameters())
loss_func = nn.MSELoss()

In [39]:
trainer = Trainer(ds, batch_size=16, print_on=3)

In [40]:
train_len = len(trainer.train_data)
def action(*args,**kwargs):
    idx,seq,targ_v,y = args[0]
    y = y.float()
    
    opt.zero_grad()
    
    y_ = mln(seq, targ_v)
    loss = loss_func(y_,y)
    
    loss.backward()
    opt.step()
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == train_len - 1:
        trainer.train_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}
trainer.action  = action

In [31]:
trainer.train(50)

⭐[ep_0_i_41]	loss	0.067✨	mae	0.220: 100%|██████████| 42/42 [00:02<00:00, 15.17it/s]
⭐[ep_1_i_41]	loss	0.057✨	mae	0.193: 100%|██████████| 42/42 [00:02<00:00, 14.90it/s]
⭐[ep_2_i_41]	loss	0.043✨	mae	0.168: 100%|██████████| 42/42 [00:02<00:00, 15.38it/s]
⭐[ep_3_i_41]	loss	0.045✨	mae	0.166: 100%|██████████| 42/42 [00:02<00:00, 15.25it/s]
⭐[ep_4_i_41]	loss	0.038✨	mae	0.159: 100%|██████████| 42/42 [00:02<00:00, 15.34it/s]
⭐[ep_5_i_41]	loss	0.037✨	mae	0.151: 100%|██████████| 42/42 [00:02<00:00, 15.63it/s]
⭐[ep_6_i_41]	loss	0.056✨	mae	0.185: 100%|██████████| 42/42 [00:02<00:00, 14.70it/s]
⭐[ep_7_i_41]	loss	0.048✨	mae	0.173: 100%|██████████| 42/42 [00:02<00:00, 15.34it/s]
⭐[ep_8_i_41]	loss	0.046✨	mae	0.165: 100%|██████████| 42/42 [00:02<00:00, 15.16it/s]
⭐[ep_9_i_41]	loss	0.027✨	mae	0.137: 100%|██████████| 42/42 [00:02<00:00, 15.16it/s]
⭐[ep_10_i_41]	loss	0.037✨	mae	0.161: 100%|██████████| 42/42 [00:02<00:00, 15.52it/s]
⭐[ep_11_i_41]	loss	0.032✨	mae	0.147: 100%|██████████| 42/42 [00:02<00:00, 1

In [41]:
trainer.train(50)

⭐[ep_0_i_41]	loss	0.077✨	mae	0.244: 100%|██████████| 42/42 [00:02<00:00, 14.82it/s]
⭐[ep_1_i_41]	loss	0.040✨	mae	0.155: 100%|██████████| 42/42 [00:02<00:00, 15.09it/s]
⭐[ep_2_i_41]	loss	0.051✨	mae	0.171: 100%|██████████| 42/42 [00:02<00:00, 15.44it/s]
⭐[ep_3_i_41]	loss	0.044✨	mae	0.164: 100%|██████████| 42/42 [00:02<00:00, 15.41it/s]
⭐[ep_4_i_41]	loss	0.035✨	mae	0.146: 100%|██████████| 42/42 [00:02<00:00, 15.12it/s]
⭐[ep_5_i_41]	loss	0.050✨	mae	0.187: 100%|██████████| 42/42 [00:02<00:00, 15.47it/s]
⭐[ep_6_i_41]	loss	0.046✨	mae	0.162: 100%|██████████| 42/42 [00:02<00:00, 15.44it/s]
⭐[ep_7_i_41]	loss	0.047✨	mae	0.172: 100%|██████████| 42/42 [00:02<00:00, 15.04it/s]
⭐[ep_8_i_41]	loss	0.044✨	mae	0.174: 100%|██████████| 42/42 [00:02<00:00, 15.35it/s]
⭐[ep_9_i_41]	loss	0.047✨	mae	0.168: 100%|██████████| 42/42 [00:02<00:00, 15.13it/s]
⭐[ep_10_i_41]	loss	0.044✨	mae	0.177: 100%|██████████| 42/42 [00:02<00:00, 15.00it/s]
⭐[ep_11_i_41]	loss	0.046✨	mae	0.177: 100%|██████████| 42/42 [00:02<00:00, 1