## Group No

## Group Member Names:
1. MOHAMMED RASUL. E.S
2. GOPIKA J
3. SAHIL MATTOO
4. APURBA ROY

## Journal used for the implemetation
Journal title: Personalized Transformer for Explainable Recommendation

Authors: Lei Li1 Yongfeng Zhang2 Li Chen1 1Hong Kong Baptist University, Hong Kong, China 2Rutgers University, New Brunswick, USA
1 {csleili,lichen}@comp.hkbu.edu.hk 2 yongfeng.zhang@rutgers.edu


Journal Name: Published as a conference paper at ACL-IJCNLP 2021

Year:2021

# Import Libraries

In [1]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

from collections import Counter

from zipfile import ZipFile
from urllib.request import urlretrieve

import pandas as pd
import numpy as np

import time



# Set Current Working Directory

In [4]:
import os
os.chdir("/content/drive/MyDrive/[00] BITS/SEM2/DNN_Assignment_2/")

# Prepare Data

In [5]:
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
ZipFile("movielens.zip", "r").extractall()

# Datasets
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
)

ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
)

movies = pd.read_csv(
    "ml-1m/movies.dat", sep="::", names=["movie_id", "title", "genres"], encoding='latin-1'
)

  users = pd.read_csv(
  ratings = pd.read_csv(
  movies = pd.read_csv(


In [6]:
!mkdir model_data

mkdir: cannot create directory ‘model_data’: File exists


In [7]:
!ls

assignment_version2.ipynb  configure_model.ipynb  DNN_Assignment_2.ipynb  model_data	 utils
buildtransformer.ipynb	   Data			  ml-1m			  movielens.zip


### Pickle Files to persist the data

In [8]:
users.to_pickle("./model_data/users.pkl")
ratings.to_pickle("./model_data/ratings.pkl")
movies.to_pickle("./model_data/movies.pkl")

In [9]:
users.head()

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Preprocessing of Data

### Preventing ids to be written as integer or float data type

In [12]:

users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}")

movies["movie_id"] = movies["movie_id"].apply(lambda x: f"movie_{x}")

ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}")
ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}")

### Creating Vocabulary

In [14]:
# Genarting a list of unique movie ids
movie_ids = movies.movie_id.unique()

# Counter is used to feed movies to movive_vocab
movie_counter = Counter(movie_ids)

# Genarting vocabulary
movie_vocab = vocab(movie_counter, specials=['<unk>'])

# For indexing input ids
movie_vocab_mapping = movie_vocab.get_stoi()

# Movie to title mapping dictionary
movie_title_dict = dict(zip(movies.movie_id, movies.title))

# Similarly generating a vocabulary for user ids
user_ids = users.user_id.unique()
user_counter = Counter(user_ids)
user_vocab = vocab(user_counter, specials=['<unk>'])
user_vocab_mapping = user_vocab.get_stoi()

### Mappings


*   User Mappings
*   Movie Mappings


    

    

In [28]:
print("User Mapping \n")
count = 1
for key, value in user_vocab_mapping.items():
  if count<5:
    print(key,value)
    count += count

print("\n")

print("Movie Mapping \n")
count = 1
for key, value in movie_vocab_mapping.items():
  if count<5:
    print(key,value)
    count += count





User Mapping 

user_6040 6040
user_6039 6039
user_6038 6038


Movie Mapping 

movie_3951 3882
movie_3949 3880
movie_3947 3878


### Generating Sequences

All interactions of users are first sorted by their interaction timestamp and then divided into sub sequences to train our model.

For Transformer based Recommendation, the input will be considered as **list of movies**.

The Sequence function helps in training the model

In [30]:
movie_vocab_stoi = movie_vocab_mapping
user_vocab_stoi = user_vocab_mapping

In [32]:
# Group ratings by user_id in order of increasing unix_timestamp.
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "movie_ids": list(ratings_group.movie_id.apply(list)),
        "timestamps": list(ratings_group.unix_timestamp.apply(list)),
    }
)

# Sequence length, min history count and window slide size
sequence_length = 4
min_history = 1
step_size = 2

# Creating sequences from lists with sliding window
def create_sequences(values, window_size, step_size, min_history):
  sequences = []
  start_index = 0
  while len(values[start_index:]) > min_history:
    seq = values[start_index : start_index + window_size]
    sequences.append(seq)
    start_index += step_size
  return sequences

ratings_data.movie_ids = ratings_data.movie_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size, min_history)
)


del ratings_data["timestamps"]

# Sub-sequences are exploded.
# Since there might be more than one sequence for each user.
ratings_data_transformed = ratings_data[["user_id", "movie_ids"]].explode(
    "movie_ids", ignore_index=True
)

ratings_data_transformed.rename(
    columns={"movie_ids": "sequence_movie_ids"},
    inplace=True,
)

print(" For every User there is sequence of movies as User Preference \n")
ratings_data_transformed.sample(frac=1).reset_index(drop=True).head()


 For every User there is sequence of movies as User Preference 



Unnamed: 0,user_id,sequence_movie_ids
0,user_531,"[movie_1411, movie_296, movie_593, movie_1393]"
1,user_2188,"[movie_2976, movie_2010, movie_1916, movie_1310]"
2,user_5683,"[movie_261, movie_2353, movie_733, movie_368]"
3,user_310,"[movie_1196, movie_1197, movie_2692, movie_1961]"
4,user_945,"[movie_1357, movie_2671, movie_339, movie_3244]"


### Train Test Split

In [33]:
# Random indexing
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.65

# Split train data
df_train_data = ratings_data_transformed[random_selection]
trainingData= df_train_data[["user_id", "sequence_movie_ids"]].values

# Split test data
df_test_data = ratings_data_transformed[~random_selection]
testingData = df_test_data[["user_id", "sequence_movie_ids"]].values

In [35]:
trainingData[:5, :]

array([['user_1',
        list(['movie_1022', 'movie_1721', 'movie_2340', 'movie_1836'])],
       ['user_1',
        list(['movie_3408', 'movie_1207', 'movie_2804', 'movie_1193'])],
       ['user_1',
        list(['movie_2804', 'movie_1193', 'movie_720', 'movie_260'])],
       ['user_1',
        list(['movie_919', 'movie_608', 'movie_2692', 'movie_1961'])],
       ['user_1',
        list(['movie_2692', 'movie_1961', 'movie_2028', 'movie_3105'])]],
      dtype=object)

In [36]:
testingData[:5, :]

array([['user_1',
        list(['movie_3186', 'movie_1270', 'movie_1022', 'movie_1721'])],
       ['user_1',
        list(['movie_2340', 'movie_1836', 'movie_3408', 'movie_1207'])],
       ['user_1',
        list(['movie_720', 'movie_260', 'movie_919', 'movie_608'])],
       ['user_1',
        list(['movie_2028', 'movie_3105', 'movie_938', 'movie_1035'])],
       ['user_1',
        list(['movie_2018', 'movie_1028', 'movie_1097', 'movie_914'])]],
      dtype=object)

### Creating Batches

In [38]:
# Pytorch Dataset for user interactions
class MovieSeqDataset(Dataset):
    # Initialize dataset
    def __init__(self, data, movie_vocab_stoi, user_vocab_stoi):
        self.data = data

        self.movie_vocab_stoi = movie_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi


    def __len__(self):
        return len(self.data)

    # Fetch data from the dataset
    def __getitem__(self, idx):
        user, movie_sequence = self.data[idx]
        # Directly index into the vocabularies
        movie_data = [self.movie_vocab_stoi[item] for item in movie_sequence]
        user_data = self.user_vocab_stoi[user]
        return torch.tensor(movie_data), torch.tensor(user_data)


# Collate function and padding
def collate_batch(batch):
    movie_list = [item[0] for item in batch]
    user_list = [item[1] for item in batch]
    return pad_sequence(movie_list, padding_value=movie_vocab_stoi['<unk>'], batch_first=True), torch.stack(user_list)




### Process Training Data into DataLoaders object for batchwise iteration

In [39]:
## Taking Raw Data
train_data_raw = trainingData
test_data_raw = testingData


## Converting RawData into DataLoaders

BATCH_SIZE_data = 256
# Create instances of your Dataset for each set
train_dataset = MovieSeqDataset(train_data_raw, movie_vocab_stoi, user_vocab_stoi)
val_dataset = MovieSeqDataset(test_data_raw, movie_vocab_stoi, user_vocab_stoi)
# Create DataLoaders
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE_data,
                        shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=BATCH_SIZE_data,
                      shuffle=False, collate_fn=collate_batch)


# Transformer Model



1.   Self Attention

  a) Multi head Attention - setting up Q,K,V pair

  b) scaled_dot_product_attention
  
2.   Positional Encoding
3.   Encoding Layer



In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy


## Multi Head Attention
class MultiHeadAttentionCustom(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttentionCustom, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output


class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

## Position Encoding
class PositionalEncodingCustom(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncodingCustom, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

## Encode Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttentionCustom(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x



## Building custom Transformer

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)

        # `div_term` is used in the calculation of the sinusoidal values.
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Initializing positional encoding matrix with zeros.
        pe = torch.zeros(max_len, 1, d_model)

        # Calculating the positional encodings.
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, ntoken: int, nuser: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # positional encoder
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Multihead attention mechanism.
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)

        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Embedding layers
        self.movie_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)

        # Defining the size of the input to the model.
        self.d_model = d_model

        # Linear layer to map the output tomovie vocabulary.
        self.linear = nn.Linear(2*d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        # Initializing the weights of the embedding and linear layers.
        initrange = 0.1
        self.movie_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, user: Tensor, src_mask: Tensor = None) -> Tensor:
        # Embedding movie ids and userid
        movie_embed = self.movie_embedding(src) * math.sqrt(self.d_model)
        user_embed = self.user_embedding(user) * math.sqrt(self.d_model)

        # positional encoding
        movie_embed = self.pos_encoder(movie_embed)

        # generating output with final layers
        output = self.transformer_encoder(movie_embed, src_mask)

        # Expand user_embed tensor along the sequence length dimension
        user_embed = user_embed.expand(-1, output.size(1), -1)

        # Concatenate user embeddings with transformer output
        output = torch.cat((output, user_embed), dim=-1)

        output = self.linear(output)
        return output


class TransformerModelCustom(nn.Module):
    def __init__(self, ntoken: int, nuser: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # positional encoder
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Multihead attention mechanism.
        #encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)

        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

        # Embedding layers
        self.movie_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)

        # Defining the size of the input to the model.
        self.d_model = d_model

        # Linear layer to map the output tomovie vocabulary.
        self.linear = nn.Linear(2*d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        # Initializing the weights of the embedding and linear layers.
        initrange = 0.1
        self.movie_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, user: Tensor, src_mask: Tensor = None) -> Tensor:
        # Embedding movie ids and userid
        movie_embed = self.movie_embedding(src) * math.sqrt(self.d_model)
        user_embed = self.user_embedding(user) * math.sqrt(self.d_model)

        # positional encoding
        movie_embed = self.pos_encoder(movie_embed)

        # generating output with final layers
        output = self.transformer_encoder(movie_embed, src_mask)

        # Expand user_embed tensor along the sequence length dimension
        user_embed = user_embed.expand(-1, output.size(1), -1)

        # Concatenate user embeddings with transformer output
        output = torch.cat((output, user_embed), dim=-1)

        output = self.linear(output)
        return output

## Define Model Parameters

In [43]:
no_of_tokens = len(movie_vocab)  # size of vocabulary
d_ntokens = len(movie_vocab)  # size of vocabulary
nusers = len(user_vocab)
emb_size = 128  # embedding dimension
d_hid = 128  # dimension of the feedforward network model
no_of_layers = 2  # number of ``nn.TransformerEncoderLayer``
no_of_heads = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability



### Define Encoding Layer

In [44]:
global encoder_layers
d_ff = 2048 # Dimensionality of the hidden layer
encoder_layers = EncoderLayer(emb_size, no_of_heads, d_ff, dropout)

### Build the Custom Transformer Model

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(no_of_tokens, nusers, emb_size, no_of_heads, d_hid, no_of_layers, dropout).to(device)



In [None]:
criterion = nn.CrossEntropyLoss()
lr = 1.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

**For better performance custom Optimizer can used**

**Different Optimizer like ADAM can be experimented**



# Train & Evaluation

### Train Function

In [47]:
def train(model: nn.Module, train_iter, epoch) -> None:
    # Switch to training mode
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    for i, (movie_data, user_data) in enumerate(train_iter):
        # Load movie sequence and user id
        movie_data, user_data = movie_data.to(device), user_data.to(device)
        user_data = user_data.reshape(-1, 1)

        # Split movie sequence to inputs and targets
        inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
        targets_flat = targets.reshape(-1)

        # Predict movies
        output = model(inputs, user_data)
        output_flat = output.reshape(-1, no_of_tokens)

        # Backpropogation process
        loss = criterion(output_flat, targets_flat)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # Results
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

### Evaluation

In [48]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    # Switch the model to evaluation mode.
    # This is necessary for layers like dropout,
    model.eval()
    total_loss = 0.

    with torch.no_grad():
        for i, (movie_data, user_data) in enumerate(eval_data):
            # Load movie sequence and user id
            movie_data, user_data = movie_data.to(device), user_data.to(device)
            user_data = user_data.reshape(-1, 1)
            # Split movie sequence to inputs and targets
            inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
            targets_flat = targets.reshape(-1)
            # Predict movies
            output = model(inputs, user_data)
            output_flat = output.reshape(-1, no_of_tokens)
            # Calculate loss
            loss = criterion(output_flat, targets_flat)
            total_loss += loss.item()
    return total_loss / (len(eval_data) - 1)

In [None]:
best_val_loss = float('inf')
epochs = 10

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        # Training
        train(model, train_iter, epoch)

        # Evaluation
        val_loss = evaluate(model, val_iter)

        # Compute the perplexity of the validation loss
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time

        # Results
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path)) # load best model states

| epoch   1 lr 0.95 | ms/batch 11.03 | loss  6.89 | ppl   979.49
| epoch   1 lr 0.95 | ms/batch 12.18 | loss  6.79 | ppl   886.12
| epoch   1 lr 0.95 | ms/batch 17.06 | loss  6.71 | ppl   823.77
| epoch   1 lr 0.95 | ms/batch 10.68 | loss  6.65 | ppl   776.06
| epoch   1 lr 0.95 | ms/batch 10.70 | loss  6.61 | ppl   741.48
| epoch   1 lr 0.95 | ms/batch 10.53 | loss  6.57 | ppl   714.58
| epoch   1 lr 0.95 | ms/batch 10.29 | loss  6.54 | ppl   689.59
| epoch   1 lr 0.95 | ms/batch 14.52 | loss  6.50 | ppl   662.63
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 22.28s | valid loss  6.52 | valid ppl   675.50
-----------------------------------------------------------------------------------------
| epoch   2 lr 0.90 | ms/batch 11.11 | loss  6.40 | ppl   600.36
| epoch   2 lr 0.90 | ms/batch 10.55 | loss  6.35 | ppl   570.86
| epoch   2 lr 0.90 | ms/batch 10.69 | loss  6.33 | ppl   562.77
| epoch   2 lr 0.90 | ms/batch 

# Recommendation Model

In [51]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,user_1,movie_1193,5,978300760
1,user_1,movie_661,3,978302109
2,user_1,movie_914,3,978301968
3,user_1,movie_3408,4,978300275
4,user_1,movie_2355,5,978824291


In [None]:
def getPopularMovies(df_ratings):
  # Calculate the number of ratings for each movie
  rating_counts = df_ratings['movie_id'].value_counts().reset_index()
  rating_counts.columns = ['movie_id', 'rating_count']

  # Get the most frequently rated movies
  min_ratings_threshold = rating_counts['rating_count'].quantile(0.95)

  # Filter movies based on the minimum number of ratings
  popular_movies = ratings.merge(rating_counts, on='movie_id')
  popular_movies = popular_movies[popular_movies['rating_count'] >= min_ratings_threshold]


  # Calculate the average rating for each movie
  average_ratings = popular_movies.groupby('movie_id')['rating'].mean().reset_index()
  # Get the top 10 rated movies
  top_10_movies = list(average_ratings.sort_values('rating', ascending=False).head(10).movie_id.values)
  return top_10_movies

top_10_movies = getPopularMovies(ratings)

# Movie id decoder
movie_vocab_itos = movie_vocab.get_itos()

# A placeholders to store results of recommendations
transformer_reco_results = list()
popular_reco_results = list()

# Get top 10 movies
k = 10
# Iterate over the validation data
for i, (movie_data, user_data) in enumerate(val_iter):
    # Feed the input and get the outputs
    movie_data, user_data = movie_data.to(device), user_data.to(device)
    user_data = user_data.reshape(-1, 1)
    inputs, targets = movie_data[:, :-1], movie_data[:, 1:]
    output = model(inputs, user_data)
    output_flat = output.reshape(-1, no_of_tokens)
    targets_flat = targets.reshape(-1)

    # Reshape the output_flat to get top predictions
    outputs = output_flat.reshape(output_flat.shape[0] // inputs.shape[1],
                                  inputs.shape[1],
                                  output_flat.shape[1])[: , -1, :]
    # k + len(inputs) = 13 movies obtained
    # In order to prevent to recommend already watched movies
    values, indices = outputs.topk(k + inputs.shape[1], dim=-1)

    for sub_sequence, sub_indice_org in zip(movie_data, indices):
        sub_indice_org = sub_indice_org.cpu().detach().numpy()
        sub_sequence = sub_sequence.cpu().detach().numpy()

        # Generate mask array to eliminate already watched movies
        mask = np.isin(sub_indice_org, sub_sequence[:-1], invert=True)

        # After masking get top k movies
        sub_indice = sub_indice_org[mask][:k]

        # Generate results array
        transformer_reco_result = np.isin(sub_indice, sub_sequence[-1]).astype(int)

        # Decode movie to search in popular movies
        target_movie_decoded = movie_vocab_itos[sub_sequence[-1]]
        popular_reco_result = np.isin(top_10_movies, target_movie_decoded).astype(int)

        transformer_reco_results.append(transformer_reco_result)
        popular_reco_results.append(popular_reco_result)

def get_movie_recom(user_id, movie_sequence, k=10):
    model.eval()
    input_sequence = movie_sequence[:-1]
    # Tokenize and numerically encode the user id and movie sequence
    user_tensor = torch.tensor(user_vocab_stoi[user_id])
    movie_tensor = torch.tensor([[movie_vocab_stoi[movie_id]] for movie_id in input_sequence])
    # Shape: [1, 1]
    user_tensor = user_tensor.unsqueeze(0).to(device)
    user_tensor = user_tensor.view(user_tensor.shape[0], 1)

    # Shape: [1, seq_length]
    movie_tensor = movie_tensor.unsqueeze(0).to(device)[0]
    movie_tensor = movie_tensor.view(1, movie_tensor.shape[0])

    # Pass the tensors through the model
    with torch.no_grad():
        predictions = model(movie_tensor, user_tensor)

    # The output is a probability distribution over the next movie.
    # Topk to get most probable movies
    values, indices = predictions.topk(k + len(input_sequence), dim=-1)
    # Eliminate already watched movies
    indices = [indice for indice in indices[-1, :][0] if indice not in movie_tensor][:k]
    predicted_movies = [movie_title_dict[movie_vocab.get_itos()[movie]] for movie in indices]
    return predicted_movies

74728

# Predict the Movies

In [None]:
row_iter = test_data_raw[59232]
print("List of Movies as Input Sequence:")
print("-" + "\n-".join([movie_title_dict[ea_movie] for ea_movie in row_iter[1][:-1]]))
print(f"\n Recomendations based on list of mentioned movies:\n")
for count,moviename in enumerate(get_movie_recom(row_iter[0],row_iter[1]), 1):
  print(count, moviename)





List of Movies as Input Sequence:
-Fast Times at Ridgemont High (1982)
-Caddyshack (1980)
-Toxic Avenger, The (1985)

 Recomendations based on list of mentioned movies:

1 Pee-wee's Big Adventure (1985)
2 Back to the Future (1985)
3 Muppets Take Manhattan, The (1984)
4 Ferris Bueller's Day Off (1986)
5 Breakfast Club, The (1985)
6 Ghostbusters (1984)
7 When Harry Met Sally... (1989)
8 Big (1988)
9 Risky Business (1983)
10 Little Mermaid, The (1989)


In [None]:
row_iter = test_data_raw[59233]
print("List of Movies as Input Sequence:")
print("-" + "\n-".join([movie_title_dict[ea_movie] for ea_movie in row_iter[1][:-1]]))
print(f"\n Recomendations based on list of mentioned movies:\n")
for count,moviename in enumerate(get_movie_recom(row_iter[0],row_iter[1]), 1):
  print(count, moviename)



List of Movies as Input Sequence:
-Nightmare on Elm Street, A (1984)
-Karate Kid, The (1984)
-Gods Must Be Crazy, The (1980)
Recomendations based on list of mentioned movies:
-
1 Little Shop of Horrors (1986)
2 Gremlins (1984)
3 Little Mermaid, The (1989)
4 Mr. Mom (1983)
5 Bram Stoker's Dracula (1992)
6 Fly, The (1986)
7 Mummy, The (1999)
8 Spaceballs (1987)
9 Good Morning, Vietnam (1987)
10 Halloween (1978)


# Evaluation Criteria

In [None]:
from sklearn.metrics import ndcg_score

# Since we have already sorted our recommendations
# An array that represent our recommendation scores is used.
representative_array = [[i for i in range(k, 0, -1)]] * len(transformer_reco_results)

for k in [3, 5, 10]:
  transformer_result = ndcg_score(transformer_reco_results,
                                  representative_array, k=k)
  popular_result = ndcg_score(popular_reco_results,
                              representative_array, k=k)

  print(f"Transformer NDCG result at top {k}: {round(transformer_result, 4)}")
  print(f"Popular recommendation NDCG result at top {k}: {round(popular_result, 4)}\n\n")


Transformer NDCG result at top 3: 0.0535
Popular recommendation NDCG result at top 3: 0.0046


Transformer NDCG result at top 5: 0.0685
Popular recommendation NDCG result at top 5: 0.0064


Transformer NDCG result at top 10: 0.0906
Popular recommendation NDCG result at top 10: 0.0095




In [None]:
tot = len(test_data_raw)

In [None]:
test_data_raw[59232]

array(['user_5283',
       list(['movie_3210', 'movie_3552', 'movie_3693', 'movie_2174'])],
      dtype=object)

In [None]:
[movie_title_dict[ea_movie] for ea_movie in row_iter[1][:-1]]

['Fast Times at Ridgemont High (1982)',
 'Caddyshack (1980)',
 'Toxic Avenger, The (1985)']

In [None]:
generate_recommendation(row_iter[0],row_iter[1])

["Pee-wee's Big Adventure (1985)",
 'Back to the Future (1985)',
 'Muppets Take Manhattan, The (1984)',
 "Ferris Bueller's Day Off (1986)",
 'Breakfast Club, The (1985)',
 'Ghostbusters (1984)',
 'When Harry Met Sally... (1989)',
 'Big (1988)',
 'Risky Business (1983)',
 'Little Mermaid, The (1989)']

In [None]:
row_iter = test_data_raw[59232]
print("List of Movies as Input Sequence:")
print("-" + "\n-".join([movie_title_dict[ea_movie] for ea_movie in row_iter[1][:-1]]))
print(f"\n Recomendations based on list of mentioned movies:\n")
for count,moviename in enumerate(generate_recommendation(row_iter[0],row_iter[1]), 1):
  print(count, moviename)

In [None]:
for each in tot:
  row_iter = test_data_raw[each]
