# YouTube Recommender

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict, deque

from livelossplot import PlotLosses
import torch
import torch.nn as nn
import torch.optim as optim

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Candidate Generation Network

**Task 1.** Code the Deep Candidate Generation Network as described in the Covington, Adams, Sargin, Deep Neural Networks for YouTube Recommendations. The network should accept the following parameters in the init method:

- n_items - dictionary size (the number of all items), 
- embedding_dim - item embedding dimension, 
- hidden_dim - last fully connected hidden layer size (user embedding dim), 
- n_hidden_layers - the number of hidden layers, 
- seed - seed for the random number generator.

The input to the network has the following form:

```
user_histories = [
    torch.LongTensor([7, 3, 9]),
    torch.LongTensor([11]),
    torch.LongTensor([0]),
    torch.LongTensor([10, 1, 4, 8])
]
```

It's a batch of item interaction histories for several users. The histories can have different lengths.

The architecture of the network should be as follows:

- In the first step you can loop over elements of the batch.
- The embedding layer should be applied to every user's history.
- An average embedding should be calculated for the entire user's history.
- The entire batch of those averaged embeddings should be fed forward through hidden layers. There should be n_hidden_layers hidden layers. The last hidden layer should have hidden_dim output neurons. Every previous hidden layer should have twice as many output neurons as the next hidden layer. The first hidden layer must have the input dimension compatible with the result of operations described in previous bullets.
- The final layer should have hidden_dim input neurons and one output neuron per item. Finally, softmax should be applied on this layer.
- All layers should not have the bias term.

In [None]:
class DeepCandidateGenerationModel(nn.Module):
    """
    Extreme multi-class classifier network.
    """
    def __init__(self, n_items, embedding_dim, hidden_dim, n_hidden_layers, seed):
        super().__init__()

        self.seed = torch.manual_seed(seed)
        ########################
        # Write your code here #
        ########################


    def forward(self, user_histories):
        ########################
        # Write your code here #
        ########################


        return x

In [None]:
# Test

dcg_network = DeepCandidateGenerationModel(n_items=12, embedding_dim=3, hidden_dim=6, n_hidden_layers=4, seed=6789)

print(dcg_network)

user_histories = [
    torch.LongTensor([7, 3, 9]),
    torch.LongTensor([11]),
    torch.LongTensor([0]),
    torch.LongTensor([10, 1, 4, 8])
]

result = dcg_network(user_histories)
print(result)

assert (np.round(np.array(result.tolist()), 4) == np.round(
    np.array([[0.0814482569694519, 0.08233463764190674, 0.08382046967744827, 0.08316171914339066, 0.08304114639759064, 
               0.08322826772928238, 0.08181853592395782, 0.08334977924823761, 0.08427998423576355, 0.0858381986618042, 
               0.08428454399108887, 0.08339447528123856], 
              [0.08161208778619766, 0.0823715552687645, 0.08370236307382584, 0.0829075276851654, 0.08293760567903519, 
               0.08351287990808487, 0.08229964226484299, 0.08368352055549622, 0.08461923152208328, 0.08492446690797806, 
               0.08417494595050812, 0.08325410634279251], 
              [0.07629935443401337, 0.07855911552906036, 0.08342088758945465, 0.07980967313051224, 0.08325998485088348, 
               0.08542384952306747, 0.07999198138713837, 0.086661696434021, 0.08794154226779938, 0.0872378796339035, 
               0.08619119971990585, 0.08520283550024033], 
              [0.08132115006446838, 0.08194458484649658, 0.08329392224550247, 0.08231865614652634, 0.08351138979196548, 
               0.08392892777919769, 0.08234265446662903, 0.08428220450878143, 0.08438169211149216, 0.0844755545258522, 
               0.08409841358661652, 0.08410079777240753]]), 4)).all()

# Ranking Network

**Task 2.** Code the Deep Ranking Network as described in the Covington, Adams, Sargin, Deep Neural Networks for YouTube Recommendations. The network should accept the following parameters in the init method:

- n_items - dictionary size (the number of all items), 
- embedding_dim - item embedding dimension, 
- hidden_dim - last fully connected hidden layer size (user embedding dim), 
- n_hidden_layers - the number of hidden layers, 
- seed - seed for the random number generator.

The input to the network has the following form:

```
user_history = torch.LongTensor([7, 3, 9])

scored_item = torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
```

It's an item interaction history for a single user and a batch of item ids to be scored.

The architecture of the network should be as follows:

- The embedding layer should be applied to user's history.
- An average embedding should be calculated for user's history.
- The embedding layer should be applied to the batch of scored items.
- The user's history average embedding should be concatenated from the left side to every scored item embedding in the batch.
- The entire batch of those concatenated embeddings should be fed forward through hidden layers. There should be n_hidden_layers hidden layers. The last hidden layer should have hidden_dim output neurons. Every previous hidden layer should have twice as many output neurons as the next hidden layer. The first hidden layer must have the input dimension compatible with the result of operations described in previous bullets. Hidden layers should not have the bias term.
- The final layer should have hidden_dim input neurons and one output neuron. This layer should have the bias term. Finally, sigmoid should be applied on this layer.

In [None]:
class DeepRankingModel(nn.Module):
    """
    Ranking network.
    """
    def __init__(self, n_items, embedding_dim, hidden_dim, n_hidden_layers, seed):
        super().__init__()

        self.seed = torch.manual_seed(seed)
        ########################
        # Write your code here #
        ########################


    def forward(self, user_history, scored_item):
        ########################
        # Write your code here #
        ########################


        return x

In [None]:
# Test

dr_network = DeepRankingModel(n_items=12, embedding_dim=3, hidden_dim=6, n_hidden_layers=4, seed=6789)

print(dr_network)

user_history = torch.LongTensor([7, 3, 9])

scored_item = torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

result = dr_network(user_history, scored_item)
print(result)

assert all(np.round(np.array(result.tolist()), 4) == np.round(
    np.array([[0.4069308638572693], [0.407929003238678], [0.4055391550064087], [0.40804415941238403], [0.4105871021747589], 
              [0.4065965712070465], [0.40611234307289124], [0.40995174646377563], [0.40362346172332764], 
              [0.4062325656414032], [0.4094473421573639], [0.41004639863967896]]), 4))