## 20Mn data MovieLens Experiment

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

In [2]:
from ray.lprint import lprint
l = lprint("experiment with RNN+CF on movielens 20m data")

[task:experiment with RNN+CF on movielens 20m data>>start]<2018-09-27_04:04:17|0s,0s>	


In [3]:
CUDA = torch.cuda.is_available()
SEQ_LEN = 19
DIM = 100
l.p("has GPU cuda",CUDA)

[has GPU cuda]<2018-09-27_04:04:18|0s,0s>	True


In [4]:
# %ls /data/ml-20m

In [5]:
DATA = "/data/ml-20m/ratings.csv"

In [6]:
l.p("loading csv file", DATA)
rate_df = pd.read_csv(DATA)
l.p("csv file loaded")

[loading csv file]<2018-09-27_04:04:18|0s,0s>	/data/ml-20m/ratings.csv
[csv file loaded]<2018-09-27_04:04:24|6s,6s>	


In [7]:
len(rate_df)

20000263

In [8]:
rate_df.groupby("userId").count()["movieId"].min()
# The minimum number of movies a user have rated

20

In [9]:
userId = list(set(rate_df["userId"]))
movieId = list(set(rate_df["movieId"]))
print("total number of users and movies:\t",len(userId),"\t",len(movieId))

total number of users and movies:	 138493 	 26744


In [10]:
l.p("making dictionary")
u2i = dict((v,k) for k,v in enumerate(userId))
m2i = dict((v,k) for k,v in enumerate(movieId))
i2u = dict((k,v) for k,v in enumerate(userId))
i2m = dict((k,v) for k,v in enumerate(movieId))

[making dictionary]<2018-09-27_04:04:27|3s,9s>	


In [11]:
rate_df["movieIdx"] = rate_df.movieId.apply(lambda x:m2i[x]).astype(int)
rate_df["userIdx"] = rate_df.userId.apply(lambda x:u2i[x]).astype(int)
rate_df["rating"] = rate_df["rating"]/5

In [12]:
l.p("making train/test split")
user_count = len(userId)
valid_split = np.random.rand(user_count)>.9
train_idx = np.array(range(user_count))[~valid_split]
valid_idx = np.array(range(user_count))[valid_split]

train_df = rate_df[rate_df.userId.isin(train_idx)]
valid_df = rate_df[rate_df.userId.isin(valid_idx)]

[making train/test split]<2018-09-27_04:04:41|14s,24s>	


Since user id mapping doesn't matter any more.

It's easier to make a dataset with contineous user_id.

In [None]:
train_u2i = dict((v,k) for k,v in enumerate(set(train_df.userId)))
valid_u2i = dict((v,k) for k,v in enumerate(set(valid_df.userId)))
train_df["userId"] = train_df.userId.apply(lambda x:train_u2i[x])
valid_df["userId"] = valid_df.userId.apply(lambda x:valid_u2i[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [None]:
l.p("generating groubby slice")
def get_user_trail(rate_df):
    return rate_df.sort_values(by=["userId","timestamp"]).groupby("userId")
    #gb.apply(lambda x:x.sample(n = 20, replace = False))
gb = get_user_trail(rate_df)
train_gb = get_user_trail(train_df)
valid_gb = get_user_trail(valid_df)

[generating groubby slice]<2018-09-27_04:04:52|11s,35s>	


In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

In [None]:
pick_k = np.array([0]*SEQ_LEN +[1])==1

In [None]:
def sample_split(x):
    sampled = x.sample(n = 20, replace = False)
    seq = sampled.head(19).sort_values(by="timestamp")
    y = sampled[pick_k]
    return pd.concat([seq,y])

class rnn_record(Dataset):
    def __init__(self, gb):
        self.gb = gb
        self.make_seq()
    
    def make_seq(self):
        self.all_seq = self.gb.apply(sample_split)
        
    def __len__(self):
        return len(self.gb)
        
    def __getitem__(self,idx):
        df = self.all_seq.loc[idx]
        seq = df.head(19)[["movieIdx","rating"]].values
        targ = df[pick_k][["movieIdx","rating"]].values
        targ_v, targ_y =targ[:,0], targ[:,1]
        return idx,seq,targ_v,targ_y

# ds = rnn_record(gb)
l.p("generating dataset","train")
train_ds = rnn_record(train_gb)
l.p("generating dataset","valid")
valid_ds = rnn_record(valid_gb)
l.p("dataset generated")

### Model

In [None]:
class mLinkNet(nn.Module):
    def __init__(self, hidden_size,v_size):
        """
        mLinkNet, short for missing link net
        """
        super(mLinkNet,self).__init__()
        self.hidden_size = hidden_size
        self.v_size = v_size
        self.emb = nn.Embedding(v_size,hidden_size)
        
        self.rnn = nn.GRU(input_size = self.hidden_size+1,
                          hidden_size= hidden_size+1,
                          num_layers=1,
                          batch_first = True,
                          dropout=0)
        
        self.mlp = nn.Sequential(*[
            nn.Dropout(.3),
            nn.Linear(hidden_size*2 + 1, 256, bias=False),
            nn.BatchNorm1d(256),
            nn.Linear(256,1,bias=False),
            nn.Sigmoid(),
        ])
    
    def forward(self,seq,targ_v):
        seq_vec = torch.cat([self.emb(seq[:,0].long()),
                             seq[:,1].unsqueeze(-1).float()], dim=2)
        output, hn = self.rnn(seq_vec)
        x = torch.cat([hn.squeeze(0),self.emb(targ_v.long()).squeeze(1)],dim=1)
        return self.mlp(x)

In [None]:
l.p("creating model")
mln = mLinkNet(hidden_size = DIM, 
               v_size = len(movieId))
if CUDA:
    l.p("loading model to GPU")
    torch.cuda.empty_cache()
    mln.cuda()
    
from ray.matchbox import Trainer
opt = Adam(mln.parameters())
loss_func = nn.MSELoss()
trainer = Trainer(train_ds, val_dataset=valid_ds, batch_size=16, print_on=3)

In [None]:
train_len = len(trainer.train_data)
valid_len = len(trainer.val_data)
l.p("train_len",train_len)
l.p("valid_len",valid_len)
def action(*args,**kwargs):
    # get data from data feeder
    idx,seq,targ_v,y = args[0]
    if CUDA:
        seq,targ_v,y = seq.cuda(),targ_v.cuda(),y.cuda()
    y = y.float()
    
    # Clear the Jacobian Matrix
    opt.zero_grad()
    
    # Predict y hat
    y_ = mln(seq, targ_v)
    # Calculate Loss
    loss = loss_func(y_,y)
    
    # Backward Propagation
    loss.backward()
    opt.step()
    # Mean Absolute Loss as print out metrics
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == train_len - 1: # resample the sequence
        trainer.train_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}

def val_action(*args,**kwargs):
    """
    A validation step
    Exactly the same like train step, but no learning, only forward pass
    """
    idx,seq,targ_v,y = args[0]
    if CUDA:
        seq,targ_v,y = seq.cuda(),targ_v.cuda(),y.cuda()
    y = y.float()
    
    y_ = mln(seq, targ_v)
    
    loss = loss_func(y_,y)
    mae = torch.mean(torch.abs(y_-y))
    if kwargs["ite"] == valid_len - 1:
        trainer.val_data.dataset.make_seq()
    return {"loss":loss.item(),"mae":mae.item()}
trainer.action  = action
trainer.val_action  = val_action

In [None]:
l.p("start training")
trainer.train(2)
l.p("training finished")