In [1]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
# import tensorflow as tf

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
# from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Report which Python interpreter this kernel is running.
python_version_banner = "System version: {}".format(sys.version)
print(python_version_banner)
# print("Tensorflow version: {}".format(tf.__version__))

System version: 3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]


In [2]:
import numpy as np

In [3]:
# Which MIND variant to fetch.
# Options: demo, small, large
MIND_type = 'demo'

# Training settings.
batch_size = 32
epochs = 5

# Random seed for the run.
seed = 42

In [4]:
import os

# Data directory, relative to the current working directory.
# Built with os.path.join instead of the literal "recommenders\data": that
# literal contains the invalid escape sequence "\d" and only forms a nested
# path on Windows. On Windows the resulting value is unchanged.
directory = os.path.join("recommenders", "data")

# Specify the current path
current_path = os.getcwd()

# Full path
data_path = os.path.join(current_path, directory)

# Create the directory if needed; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(data_path, exist_ok=True)

print(f"Data path is set to: {data_path}")


Data path is set to: C:\Users\Rija Farooqui\Desktop\news_recommendation\recommenders\recommenders\data


In [5]:
# Sub-directories of data_path that hold each group of files.
_train_dir = os.path.join(data_path, 'train')
_valid_dir = os.path.join(data_path, 'valid')
_utils_dir = os.path.join(data_path, "utils")

# Training and validation splits (news metadata + user behaviours).
train_news_file = os.path.join(_train_dir, 'news.tsv')
train_behaviors_file = os.path.join(_train_dir, 'behaviors.tsv')
valid_news_file = os.path.join(_valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(_valid_dir, 'behaviors.tsv')

# Auxiliary resources: embedding matrix, lookup dictionaries and the model config.
wordEmb_file = os.path.join(_utils_dir, "embedding.npy")
userDict_file = os.path.join(_utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(_utils_dir, "word_dict.pkl")
yaml_file = os.path.join(_utils_dir, 'nrms.yaml')

# Resolve the download URL and archive names for the selected MIND variant.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

In [6]:
# import pickle

# # specify the path to your .pkl file
# file_path = wordEmb_file

# # open and read the pickle file
# with open(file_path, 'rb') as f:
#     data = pickle.load(f)

# # print or inspect the content of the pickle file
# print(data)


In [7]:
# Each entry: (file whose presence means "already downloaded", base URL,
# destination directory, archive name). Entries are processed in order and an
# archive is fetched only when its marker file is missing.
_download_plan = (
    (train_news_file, mind_url, os.path.join(data_path, 'train'), mind_train_dataset),
    (valid_news_file, mind_url, os.path.join(data_path, 'valid'), mind_dev_dataset),
    (
        yaml_file,
        r'https://recodatasets.z20.web.core.windows.net/newsrec/',
        os.path.join(data_path, 'utils'),
        mind_utils,
    ),
)

for _marker_file, _base_url, _target_dir, _archive_name in _download_plan:
    if os.path.exists(_marker_file):
        continue
    download_deeprec_resources(_base_url, _target_dir, _archive_name)

In [8]:
# Values layered on top of the YAML configuration.
# NOTE(review): batch_size is hard-coded to 1 here, so the `batch_size`
# variable from the configuration cell is not used — confirm this is intended.
_hparam_overrides = dict(
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    batch_size=1,
    epochs=epochs,
    show_step=10,
)

hparams = prepare_hparams(yaml_file, **_hparam_overrides)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 1, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': 'C:\\Users\\Rija Farooqui\\Desktop\\news_recommendation\\recommenders\\recommenders\\data\\utils\\embedding.npy', 'wordDict_file': 'C:\\Users\\Rija Farooqui\\Desktop\\news_recommendation\\recommenders\\recommenders\\data\\utils\\word_dict.pkl', 'userDict_file': 'C:\\Users\\Rija Farooqui\\Desktop\\news_recommendation\\recommenders\\recommenders\\data\\utils\\uid2index.pkl'}


In [9]:
# Alias the iterator class (not an instance); it is instantiated with the
# hyper-parameters in the next cell.
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
iterator = MINDIterator

In [10]:
# Build the iterator from the hyper-parameters, using tab-separated input files.
it = iterator(hparams, hparams.npratio, col_spliter="\t")

# Batches are produced from the training news/behaviours pair.
_train_files = (train_news_file, train_behaviors_file)
batches = it.load_data_from_file(*_train_files)

In [11]:
# Peek at one batch to check the shape of the candidate titles.
# The peek draws from a separate load_data_from_file() call instead of
# iterating `batches`: `batches` is later handed to training, and (presumably,
# since MINDIterator yields batches lazily — confirm) iterating it here would
# advance it so that training never sees the first batch.
sample_batch = next(
    iter(it.load_data_from_file(train_news_file, train_behaviors_file)), None
)
if sample_batch is not None:
    print(sample_batch['candidate_title_batch'].shape)

(1, 5, 30)


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class AttLayer2(nn.Module):
    """Additive attention pooling over the sequence axis.

    Scores every position of ``x`` with ``tanh(x @ W + b) @ q``, normalises the
    scores with a softmax over the sequence axis and returns the weighted sum
    of the positions.

    Args:
        dim: size of the last (feature) axis of the inputs.
    """

    def __init__(self, dim=200):
        super(AttLayer2, self).__init__()

        self.dim = dim
        self.W = nn.Parameter(torch.randn(dim, dim))
        self.b = nn.Parameter(torch.randn(dim))
        self.q = nn.Parameter(torch.randn(dim, 1))

    def forward(self, x, mask=None):
        """Pool ``x`` of shape ``(..., seq_len, dim)`` into ``(..., dim)``.

        Args:
            x: input tensor whose last two axes are (sequence, feature).
            mask: optional tensor broadcastable to ``(..., seq_len)``; positions
                where ``mask == 0`` receive zero attention weight.

        Returns:
            Tensor of shape ``(..., dim)``. A sequence whose positions are all
            masked pools to a zero vector.
        """
        e = torch.tanh(torch.matmul(x, self.W) + self.b)
        e = torch.matmul(e, self.q)
        e = torch.squeeze(e, dim=-1)

        masked = None
        if mask is not None:
            masked = mask == 0
            e = e.masked_fill(masked, float('-inf'))
            # A row made only of -inf would turn the softmax (and its
            # gradient) into NaN; give such rows finite scores here and zero
            # their weights after the softmax instead.
            fully_masked = masked.all(dim=-1, keepdim=True)
            e = e.masked_fill(fully_masked, 0.0)

        a = torch.softmax(e, dim=-1)

        if masked is not None:
            a = a.masked_fill(masked, 0.0)

        weighted_input = x * a.unsqueeze(-1)
        # Sum over the sequence axis (second to last) rather than a fixed
        # dim=1 so inputs with zero or several leading batch axes also work;
        # for 3-D input this is the same axis as before.
        return torch.sum(weighted_input, dim=-2)


class SelfAttention(nn.Module):
    """Linear projection followed by multi-head self-attention.

    The wrapped ``nn.MultiheadAttention`` is created with its default layout,
    so inputs are sequence-first: ``(seq_len, batch, embed_dim)``.

    Args:
        embed_dim: feature size of the inputs and outputs; must be divisible
            by ``num_heads`` (enforced by ``nn.MultiheadAttention``).
        num_heads: number of attention heads.
        mask_right: when True, a position may not attend to later positions.
    """

    def __init__(self, embed_dim, num_heads, mask_right=False):
        super(SelfAttention, self).__init__()

        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.mask_right = mask_right

        self.W = nn.Linear(embed_dim, embed_dim)

        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)

    def forward(self, input_seq, input_seq_len=None):
        """Apply self-attention to ``input_seq``.

        Args:
            input_seq: tensor of shape ``(seq_len, batch, embed_dim)``.
            input_seq_len: sequence length used to build the causal mask when
                ``mask_right`` is set. Optional: defaults to the size of the
                first axis of ``input_seq``, which lets the layer be called
                with a single argument (e.g. inside ``nn.Sequential``).

        Returns:
            Attention output with the same shape as ``input_seq``.
        """
        input_seq = self.W(input_seq)

        mask = None
        if self.mask_right:
            if input_seq_len is None:
                input_seq_len = input_seq.size(0)
            # Build the mask directly on the input's device; True entries
            # (strictly above the diagonal) are positions that are blocked.
            mask = torch.triu(
                torch.ones((input_seq_len, input_seq_len), device=input_seq.device),
                diagonal=1,
            ).bool()

        attn_output, _ = self.multihead_attn(input_seq, input_seq, input_seq, attn_mask=mask)

        return attn_output


In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from transformers import AutoModel, AutoTokenizer




class _AttentivePooling(torch.nn.Module):
    """Dropout -> multi-head self-attention -> dropout -> additive pooling.

    Maps a batch-first sequence ``(batch, seq_len, dim)`` to one vector per
    sequence, ``(batch, dim)``.
    """

    def __init__(self, dim, num_heads, dropout=0.0):
        super().__init__()
        self.input_dropout = torch.nn.Dropout(dropout)
        self.self_attention = SelfAttention(dim, num_heads)
        self.output_dropout = torch.nn.Dropout(dropout)
        self.pooling = AttLayer2(dim)

    def forward(self, sequence):
        sequence = self.input_dropout(sequence)
        # SelfAttention wraps nn.MultiheadAttention in its default
        # sequence-first layout, so swap the batch and sequence axes around it.
        seq_first = sequence.transpose(0, 1)
        attended = self.self_attention(seq_first, seq_first.size(0)).transpose(0, 1)
        attended = self.output_dropout(attended)
        return self.pooling(attended)


class NRMSModel(torch.nn.Module):
    """NRMS-style news recommender on top of a pretrained transformer encoder.

    Every title (a row of token ids) is encoded by the pretrained model,
    projected to ``head_num * head_dim`` features and pooled into one news
    vector. The user vector is obtained by pooling the news vectors of the
    clicked titles. Candidates are scored by similarity to the user vector.

    Args:
        hparams: object exposing ``dropout``, ``head_num`` and ``head_dim``.
        embed_model_name: name or path passed to ``AutoModel.from_pretrained``.
        freeze_embeddings: when True, the pretrained encoder is not trained.

    NOTE(review): token ids must be valid for the vocabulary of
    ``embed_model_name``. The notebook feeds batches from MINDIterator, which
    presumably indexes words with ``word_dict.pkl`` rather than with the
    pretrained tokenizer — confirm before training.
    """

    def __init__(self, hparams, embed_model_name, freeze_embeddings=False):
        super().__init__()
        self.hparams = hparams
        self.embedding = AutoModel.from_pretrained(embed_model_name)

        if freeze_embeddings:  # Freeze the pretrained encoder if specified
            for param in self.embedding.parameters():
                param.requires_grad = False

        # Width of the attention stack. head_num * head_dim is always divisible
        # by head_num, as nn.MultiheadAttention requires; the encoder's hidden
        # size is mapped onto it by a linear projection.
        self.attention_dim = self.hparams.head_num * self.hparams.head_dim
        self.embedding_projection = torch.nn.Linear(
            self.embedding.config.hidden_size, self.attention_dim
        )

        self.news_encoder = self._build_newsencoder()
        self.user_encoder = self._build_userencoder()

        # NOTE(review): despite its name this is a cosine similarity, so raw
        # scores are bounded to [-1, 1] — confirm a plain dot product was not
        # intended.
        self.dot = torch.nn.CosineSimilarity(dim=-1)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.sigmoid = torch.nn.Sigmoid()

    def _build_newsencoder(self):
        """Pooling stack that turns projected token states into a news vector."""
        return _AttentivePooling(
            self.attention_dim, self.hparams.head_num, dropout=self.hparams.dropout
        )

    def _build_userencoder(self):
        """Pooling stack that turns clicked-news vectors into a user vector."""
        return _AttentivePooling(self.attention_dim, self.hparams.head_num)

    def _encode_news(self, title_tokens):
        """Encode titles ``(..., title_len)`` of token ids into ``(..., attention_dim)``."""
        leading_shape = title_tokens.shape[:-1]
        # The pretrained encoder expects 2-D (batch, seq) integer ids, so fold
        # all leading axes into the batch axis and restore them afterwards.
        flat_tokens = title_tokens.reshape(-1, title_tokens.shape[-1]).long()

        # Assumes id 0 marks padding — TODO confirm for the tokenizer/iterator
        # in use. The first position is always kept so that an all-padding
        # title still has something to attend to.
        attention_mask = (flat_tokens != 0).long()
        attention_mask[:, 0] = 1

        hidden = self.embedding(
            input_ids=flat_tokens, attention_mask=attention_mask
        ).last_hidden_state
        news_vectors = self.news_encoder(self.embedding_projection(hidden))
        return news_vectors.reshape(*leading_shape, -1)

    def _encode_user(self, clicked_title_batch):
        """Encode click histories ``(batch, his_size, title_len)`` into ``(batch, attention_dim)``."""
        clicked_news_vectors = self._encode_news(clicked_title_batch)
        return self.user_encoder(clicked_news_vectors)

    def forward(self, inputs, return_logits=False):
        """Score every candidate of every impression.

        Args:
            inputs: pair ``(clicked_title_batch, candidate_title_batch)`` of
                integer tensors shaped ``(batch, his_size, title_len)`` and
                ``(batch, n_candidates, title_len)``.
            return_logits: when True, return the raw similarity scores instead
                of the softmax over candidates (use this with losses that
                normalise internally, such as ``nn.CrossEntropyLoss``).

        Returns:
            Tensor of shape ``(batch, n_candidates)``.
        """
        clicked_title_batch, candidate_title_batch = inputs

        user_present = self._encode_user(clicked_title_batch)
        news_present = self._encode_news(candidate_title_batch)

        logits = self.dot(news_present, user_present.unsqueeze(1))
        if return_logits:
            return logits
        return self.softmax(logits)

    def score(self, clicked_title_batch, candidate_title_batch_one):
        """Score a single candidate per user; returns sigmoid similarities of shape ``(batch,)``."""
        user_present = self._encode_user(clicked_title_batch)
        news_present_one = self._encode_news(candidate_title_batch_one.squeeze(1))

        pred_one = self.sigmoid(self.dot(news_present_one, user_present))

        return pred_one

    def _get_input_label_from_iter(self, batch_data):
        """Split a batch dict into the model inputs and the labels."""
        input_feat = [
            batch_data["clicked_title_batch"],
            batch_data["candidate_title_batch"],
        ]
        input_label = batch_data["labels"]
        return input_feat, input_label

    def _get_user_feature_from_iter(self, batch_data):
        """Return the clicked-title part of a batch dict."""
        return batch_data["clicked_title_batch"]

    def _get_news_feature_from_iter(self, batch_data):
        """Return the candidate-title part of a batch dict."""
        return batch_data["candidate_title_batch"]

    def fit_model(self, dataloader, criterion, optimizer, num_epochs=10):
        """Train the model.

        Args:
            dataloader: iterable of batch dicts, or a zero-argument callable
                returning a fresh iterable. Pass a callable when the batches
                come from a generator, which can only be consumed once.
            criterion: loss called as ``criterion(scores, labels)``; it
                receives the raw (un-normalised) scores.
            optimizer: optimizer over this model's parameters.
            num_epochs: maximum number of passes over the data.
        """
        import warnings

        # Follow the device the model lives on instead of relying on a
        # notebook-level `device` variable.
        device = next(self.parameters()).device
        make_batches = dataloader if callable(dataloader) else (lambda: dataloader)

        self.train()  # set the model in train mode

        for epoch in range(num_epochs):
            running_loss = 0.0
            num_batches = 0  # counted by hand: generators have no len()

            for batch_data in make_batches():
                # Get the inputs and labels
                inputs, labels = self._get_input_label_from_iter(batch_data)
                inputs = [torch.as_tensor(inp).to(device) for inp in inputs]
                labels = torch.as_tensor(labels).float().to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass on raw scores; a softmax here followed by a
                # cross-entropy loss would normalise twice.
                outputs = self(inputs, return_logits=True)

                # Calculate loss
                loss = criterion(outputs, labels)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                num_batches += 1

            if num_batches == 0:
                warnings.warn(
                    "fit_model received no batches in epoch {}; a one-shot "
                    "iterator is exhausted after the first pass. Pass a "
                    "re-iterable or a callable returning a fresh "
                    "iterable.".format(epoch + 1)
                )
                break

            # Print statistics
            epoch_loss = running_loss / num_batches
            print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}')

        print('Finished Training')
# Seed PyTorch before the model is built so the randomly initialised layers
# are reproducible; the configured `seed` was otherwise never applied.
torch.manual_seed(seed)
model = NRMSModel(hparams, 'bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
import torch.optim as optim
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Adam over all model parameters, with the learning rate from the hparams.
optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
criterion = nn.CrossEntropyLoss()  # or any other suitable loss function

In [24]:
# Train for the configured number of epochs instead of fit_model's default of 10.
# NOTE(review): `batches` presumably is a one-shot generator — confirm it can
# be iterated once per epoch.
model.fit_model(batches, criterion, optimizer, num_epochs=hparams.epochs)

In [15]:
# Training loop with a validation pass after every epoch.
# Changes from the earlier draft of this cell:
# - the epoch count comes from hparams.epochs (hparams has no `num_epochs`),
# - batches are produced by the MIND iterators instead of the undefined
#   `train_loader` / `valid_loader`,
# - batch arrays are converted to tensors before being moved to the device,
# - the model receives its two inputs as one list, as its forward() expects.

# A separate iterator instance is used for validation: MINDIterator presumably
# keeps the first files it loads cached on the instance — confirm.
valid_iterator = iterator(hparams, hparams.npratio, col_spliter="\t")

for epoch in range(hparams.epochs):
    model.train()  # Switch to training mode
    train_loss = 0.0
    train_batches = 0  # counted by hand: the iterator output has no len()
    for batch in it.load_data_from_file(train_news_file, train_behaviors_file):
        optimizer.zero_grad()  # Clear gradients

        clicked_title_batch = torch.as_tensor(batch['clicked_title_batch']).to(device)
        candidate_title_batch = torch.as_tensor(batch['candidate_title_batch']).to(device)
        labels = torch.as_tensor(batch['labels']).float().to(device)

        outputs = model([clicked_title_batch, candidate_title_batch])  # Forward pass

        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        train_loss += loss.item()
        train_batches += 1

    # Print training statistics
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss / max(train_batches, 1)))

    # Validate the model
    model.eval()  # Switch to evaluation mode
    valid_loss = 0.0
    valid_batches = 0
    with torch.no_grad():
        for batch in valid_iterator.load_data_from_file(valid_news_file, valid_behaviors_file):
            clicked_title_batch = torch.as_tensor(batch['clicked_title_batch']).to(device)
            candidate_title_batch = torch.as_tensor(batch['candidate_title_batch']).to(device)
            labels = torch.as_tensor(batch['labels']).float().to(device)

            outputs = model([clicked_title_batch, candidate_title_batch])  # Forward pass

            loss = criterion(outputs, labels)  # Compute loss
            valid_loss += loss.item()
            valid_batches += 1

    # Print validation statistics
    print('Epoch: {} \tValidation Loss: {:.6f}'.format(epoch, valid_loss / max(valid_batches, 1)))

In [None]:
# NOTE(review): this call passes (hparams, iterator, seed=seed), but the
# NRMSModel class defined in this notebook takes
# (hparams, embed_model_name, freeze_embeddings=False) and has no `seed`
# parameter. It presumably targets the recommenders NRMSModel whose import is
# commented out in the first cell — confirm which class is intended.
model = NRMSModel(hparams, iterator, seed=seed)

In [None]:
# NOTE(review): the NRMSModel class defined in this notebook has no `run_eval`
# method; this presumably targets the recommenders NRMSModel — confirm.
print(model.run_eval(valid_news_file, valid_behaviors_file))

In [None]:
%%time
# NOTE(review): the NRMSModel class defined in this notebook exposes
# `fit_model`, not `fit`; this presumably targets the recommenders NRMSModel —
# confirm.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

In [None]:
%%time
# Evaluate on the validation split and print the result.
# NOTE(review): the NRMSModel class defined in this notebook has no `run_eval`
# method; this presumably targets the recommenders NRMSModel — confirm.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)