In [1]:
import pandas as pd
from transformers import (
    DPRContextEncoder,
    DPRQuestionEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoderTokenizer,
)
import torch.nn.functional as F
import torch
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

In [3]:
device

device(type='cpu')

# Train

In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [5]:
class CustomDataset(Dataset):

    def __init__(self, data, question_tokenizer, context_tokenizer):
        self.data = data
        self.question_tokenizer = question_tokenizer
        self.context_tokenizer = context_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        job_description = self.data["job_description"].iloc[index]
        resume = self.data["resume"].iloc[index]

        tokenized_question = self.question_tokenizer(job_description, return_tensors="pt", padding="max_length", max_length=512, truncation=True)
        tokenized_context = self.context_tokenizer(resume, return_tensors="pt", padding="max_length", max_length=512, truncation=True)

        question_input_ids = torch.flatten(tokenized_question["input_ids"])
        question_attention_mask = torch.flatten(tokenized_question["attention_mask"])

        context_input_ids = torch.flatten(tokenized_context["input_ids"])
        context_attention_mask = torch.flatten(tokenized_context["attention_mask"])
        return (
            question_input_ids,
            question_attention_mask,
            context_input_ids,
            context_attention_mask,
        )

In [6]:
class Model(nn.Module):
    def __init__(self, question_encoder, context_encoder):
        super().__init__()
        self.question_encoder = question_encoder
        self.context_encoder = context_encoder

    def forward(self, question_ids, question_att_mask, context_ids, context_att_mask):
        question_output = self.question_encoder(
            question_ids.to(device), question_att_mask.to(device)
        ).pooler_output
        context_output = self.context_encoder(
            context_ids.to(device), context_att_mask.to(device)
        ).pooler_output
        return question_output, context_output

In [7]:
def train(
    model,
    model_path,
    optimizer,
    criterion,
    train_loader,
    n_epochs,
):
    # number of epochs to train the model
    n_epochs = n_epochs
    best_f1 = 0.0

    for epoch in range(n_epochs):
        # monitor training loss
        train_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()  # prep model for training
        for question_ids, question_att_mask, context_ids, context_att_mask in train_loader:
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            # inputs = [question_ids, question_att_mask, context_ids, context_att_mask]
            question_output, context_output = model(
                question_ids, question_att_mask, context_ids, context_att_mask
            )
            # calculate the loss
            loss = criterion(question_output, context_output)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update running training loss
            train_loss += loss.item()

        print(f"Epoch {epoch + 1}")
        print("----------------------------")
        print(f"Training Loss: {train_loss / len(train_loader.dataset)}")
        # accuracy, precision, recall, f1 = eval_fn(
        #     device, model, eval_loader, x_eval, eval_writer
        # )
        # if f1 > best_f1:
        #     best_f1 = f1
        #     torch.save(model.state_dict(), model_path)
        # print(
        #     f"Evaluation Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}"
        # )

In [8]:
def loss(question_output, context_output):
    S = torch.matmul(question_output, context_output.T)
    softmax = F.softmax(S, dim=1)
    diagonal = torch.diagonal(softmax)
    log_diagonal = -torch.log(diagonal)
    loss = torch.mean(log_diagonal)
    return loss

In [9]:
df_train = pd.read_json("./data_trial.json")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)
train_dataset = CustomDataset(df_train, question_tokenizer, context_tokenizer)
train_dataloader = DataLoader(
    train_dataset, batch_size=32, shuffle=True,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [10]:
question_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
).to(device)

context_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
).to(device)

model = Model(question_encoder, context_encoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = loss
train(model, "model.pth", optimizer, criterion, train_dataloader, 10)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

Epoch 1
----------------------------
Training Loss: nan


KeyboardInterrupt: 

In [None]:
# 512 token max length
# 14 characters -> 3 tokens
# num characters: 510 * 14/3 = 2380
# num words: 366 to 476

# Test

In [None]:
# def test(question_encoder, context_encoder):
#     with open("test_data.json", "r") as f:
#         test_data = json.load(f)
#     question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
#         "facebook/dpr-question_encoder-single-nq-base"
#     )
#     context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
#         "facebook/dpr-ctx_encoder-single-nq-base"
#     )
#     for data in test_data:
#         label = next(iter(data))
#         pos = data[label]["pos"]
#         neg = data[label]["neg"]
#         all = pos + neg
#         # Tokenize the question and the context
#         tokenized_question = question_tokenizer(
#             label, return_tensors="pt", padding="max_length", max_length=512, truncation=True
#         )
#         question_input_ids = tokenized_question["input_ids"]
#         question_attention_mask = tokenized_question["attention_mask"]

#         tokenized_context = context_tokenizer(
#             all, return_tensors="pt", padding="max_length", max_length=512, truncation=True
#         )
#         context_input_ids = tokenized_context["input_ids"]
#         context_attention_mask = tokenized_context["attention_mask"]

#         # Encode the question and the context
#         question_output = question_encoder(question_input_ids.to(device), question_attention_mask.to(device)).pooler_output
#         context_output = context_encoder(context_input_ids.to(device), context_attention_mask.to(device)).pooler_output
#         scores = F.cosine_similarity(question_output, context_output)
#         _, indices = torch.topk(scores, 5)
#         relevant_passages = np.array(all)[indices.cpu().numpy()]
#         num_correct = 0
#         for p in relevant_passages:
#             if p in pos:
#                 num_correct += 1
#         print(f"Accuracy ({label}): {num_correct}/{len(relevant_passages)}")

In [None]:
# question_encoder = DPRQuestionEncoder.from_pretrained(
#     "facebook/dpr-question_encoder-single-nq-base"
# ).to(device)
# context_encoder = DPRContextEncoder.from_pretrained(
#     "facebook/dpr-ctx_encoder-single-nq-base"
# ).to(device)
# test(question_encoder, context_encoder)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

Python 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
Type 'copyright', 'credits' or 'license' for more information
IPython 8.20.0 -- An enhanced Interactive Python. Type '?' for help.

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 question_inputs_ids

NameError: name 'question_inputs_ids' is not defined

Out[2]: 
tensor([[ 101, 9262, 1035, 9722,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0