<a href="https://colab.research.google.com/github/sujaygarlanka/544-final-project/blob/master/DPR_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from transformers import (
    DPRContextEncoder,
    DPRQuestionEncoder,
    DPRContextEncoderTokenizer,
    DPRQuestionEncoderTokenizer,
)
import torch.nn.functional as F
import torch
import numpy as np
import json
import gc
from tqdm import tqdm
# import torch_xla.core.xla_model as xm

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
def memory_stats():
    print(torch.cuda.memory_allocated()/1024**2)
    print(torch.cuda.memory_cached()/1024**2)

In [None]:
def clear_memory():
  gc.collect()
  torch.cuda.empty_cache()
  memory_stats()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
# device = xm.xla_device()

In [None]:
device

device(type='cuda')

In [None]:
DIR = "./544-final-project"

# Train

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
class CustomDataset(Dataset):

    def __init__(self, data, question_tokenizer, context_tokenizer):
        self.data = data
        self.question_tokenizer = question_tokenizer
        self.context_tokenizer = context_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        job_description = self.data["job_description"].iloc[index]
        resume = self.data["resume"].iloc[index]

        tokenized_question = self.question_tokenizer(job_description, return_tensors="pt", padding="max_length", max_length=512, truncation=True)
        tokenized_context = self.context_tokenizer(resume, return_tensors="pt", padding="max_length", max_length=512, truncation=True)

        question_input_ids = torch.flatten(tokenized_question["input_ids"])
        question_attention_mask = torch.flatten(tokenized_question["attention_mask"])

        context_input_ids = torch.flatten(tokenized_context["input_ids"])
        context_attention_mask = torch.flatten(tokenized_context["attention_mask"])
        return (
            question_input_ids,
            question_attention_mask,
            context_input_ids,
            context_attention_mask,
        )

In [None]:
class Model(nn.Module):
    def __init__(self, question_encoder, context_encoder):
        super().__init__()
        self.question_encoder = question_encoder
        self.context_encoder = context_encoder

    def forward(self, question_ids, question_att_mask, context_ids, context_att_mask):
        question_output = self.question_encoder(
            question_ids.to(device), question_att_mask.to(device)
        ).pooler_output
        context_output = self.context_encoder(
            context_ids.to(device), context_att_mask.to(device)
        ).pooler_output
        return question_output, context_output

    def save(self, path):
        self.question_encoder.save_pretrained(path + "/finetune_question_encoder")
        self.context_encoder.save_pretrained(path + "/finetune_context_encoder")

In [None]:
def train(
    model,
    model_path,
    optimizer,
    criterion,
    train_loader,
    n_epochs,
    eval_fn
):
    # number of epochs to train the model
    n_epochs = n_epochs
    best_accuracy = -1.0

    for epoch in range(n_epochs):
        # monitor training loss
        train_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()  # prep model for training
        with tqdm(train_loader, unit="batch", position=0, leave=True) as tepoch:
          for question_ids, question_att_mask, context_ids, context_att_mask in tqdm(tepoch, position=0, leave=True):
              tepoch.set_description(f"Epoch {epoch + 1}")
              # clear the gradients of all optimized variables
              optimizer.zero_grad()
              # forward pass: compute predicted outputs by passing inputs to the model
              # inputs = [question_ids, question_att_mask, context_ids, context_att_mask]
              question_output, context_output = model(
                  question_ids, question_att_mask, context_ids, context_att_mask
              )
              # calculate the loss
              loss = criterion(question_output, context_output)
              # backward pass: compute gradient of the loss with respect to model parameters
              loss.backward()
              # perform a single optimization step (parameter update)
              optimizer.step()
              # update running training loss
              train_loss += loss.item() * question_ids.size(0)
              tepoch.set_postfix(loss=train_loss)

        accuracy = eval_fn(model)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            model.save(model_path)

In [None]:
def loss(question_output, context_output):
    S = torch.matmul(question_output, context_output.T)
    softmax = F.softmax(S, dim=1)
    diagonal = torch.diagonal(softmax)
    diagonal = torch.add(diagonal, torch.tensor([0.0000000001]).to(device))
    log_diagonal = -torch.log(diagonal)
    loss = torch.mean(log_diagonal)
    return loss

In [None]:
def evaluate(model):
    model.eval()
    question_encoder = model.question_encoder
    context_encoder = model.context_encoder
    with open(f"{DIR}/data/valid_data.json", "r") as f:
        test_data = json.load(f)
    question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base"
    )
    context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
        "facebook/dpr-ctx_encoder-single-nq-base"
    )
    total_num_correct = 0
    total_num_pos = 0
    for data in test_data:
        label = data["label"]
        job_description = data["description"]
        pos = data["pos"]
        neg = data["neg"]
        all = pos + neg
        # Tokenize the question and the context
        tokenized_question = question_tokenizer(
            job_description,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        )
        question_input_ids = tokenized_question["input_ids"]
        question_attention_mask = tokenized_question["attention_mask"]

        tokenized_context = context_tokenizer(
            all,
            return_tensors="pt",
            padding="max_length",
            max_length=512,
            truncation=True,
        )
        context_input_ids = tokenized_context["input_ids"]
        context_attention_mask = tokenized_context["attention_mask"]

        # Encode the question and the context
        question_output = question_encoder(
            question_input_ids.to(device), question_attention_mask.to(device)
        ).pooler_output
        context_output = context_encoder(
            context_input_ids.to(device), context_attention_mask.to(device)
        ).pooler_output
        scores = F.cosine_similarity(question_output, context_output)
        _, indices = torch.topk(scores, 5)
        relevant_passages = np.array(all)[indices.cpu().numpy()]
        num_correct = 0
        for p in relevant_passages:
            if p in pos:
                num_correct += 1
        total_num_correct += num_correct
        total_num_pos += 5
        print(f"Accuracy ({label}): {num_correct}/{len(relevant_passages)}")
    print(f"Total accuracy: {total_num_correct}/{total_num_pos}")
    return total_num_correct / total_num_pos

In [None]:
df_train = pd.read_json(f"{DIR}/data/data.json")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)
train_dataset = CustomDataset(df_train, question_tokenizer, context_tokenizer)
train_dataloader = DataLoader(
    train_dataset, batch_size=10, shuffle=True,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [None]:
question_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
).to(device)

context_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
).to(device)

model = Model(question_encoder, context_encoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = loss
eval_fn = evaluate

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

In [None]:
train(model, f"{DIR}/models", optimizer, criterion, train_dataloader, 10, eval_fn)

Epoch 1: 100%|██████████| 99/99 [01:17<00:00,  1.28batch/s, loss=6.53e+3]
100%|██████████| 99/99 [01:17<00:00,  1.28it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 1/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 2/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 2/5
Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 0/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 1/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 1/5
Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 0/5
Accuracy (Database_Administrator): 1/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 3/

Epoch 2: 100%|██████████| 99/99 [01:17<00:00,  1.28batch/s, loss=4.34e+3]
100%|██████████| 99/99 [01:17<00:00,  1.28it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 1/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 5/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 5/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 0/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 5/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 2/

Epoch 3: 100%|██████████| 99/99 [01:17<00:00,  1.28batch/s, loss=2.77e+3]
100%|██████████| 99/99 [01:17<00:00,  1.28it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 1/5
Accuracy (Software_Developer): 1/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 2/5
Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 1/5
Accuracy (Database_Administrator): 1/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 2/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 4/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 1/5
Accuracy (Database_Administrator): 1/5
Accuracy (Software_Developer): 2/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 2/

Epoch 4: 100%|██████████| 99/99 [01:17<00:00,  1.28batch/s, loss=2.44e+3]
100%|██████████| 99/99 [01:17<00:00,  1.28it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 0/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 1/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 5/5
Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 1/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 5/5
Accuracy (Python_Developer): 5/5
Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 0/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 5/

Epoch 5: 100%|██████████| 99/99 [01:16<00:00,  1.29batch/s, loss=2.35e+3]
100%|██████████| 99/99 [01:16<00:00,  1.29it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 0/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 1/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 1/5
Accuracy (Software_Developer): 2/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 5/5
Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 1/

Epoch 6: 100%|██████████| 99/99 [01:16<00:00,  1.29batch/s, loss=2.33e+3]
100%|██████████| 99/99 [01:16<00:00,  1.29it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 2/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 1/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 0/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 2/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 0/5
Accuracy (Project_manager): 0/5
Accuracy (Database_Administrator): 2/5
Accuracy (Software_Developer): 2/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 2/5
Accuracy (Network_Administrator): 2/

Epoch 7: 100%|██████████| 99/99 [01:16<00:00,  1.30batch/s, loss=2.32e+3]
100%|██████████| 99/99 [01:16<00:00,  1.30it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 5/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 4/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 1/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 5/5
Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 2/

Epoch 8: 100%|██████████| 99/99 [01:16<00:00,  1.29batch/s, loss=2.32e+3]
100%|██████████| 99/99 [01:16<00:00,  1.29it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 1/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 5/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 5/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 5/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 2/5
Accuracy (Web_Developer): 3/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 2/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 2/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 1/5
Accuracy (Front_End_Developer): 1/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 1/

Epoch 9: 100%|██████████| 99/99 [01:16<00:00,  1.29batch/s, loss=2.31e+3]
100%|██████████| 99/99 [01:16<00:00,  1.29it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 1/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 3/5
Accuracy (Python_Developer): 4/5
Accuracy (Security_Analyst): 1/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 5/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 4/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 5/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 3/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 4/5
Accuracy (Java_Developer): 5/5
Accuracy (Network_Administrator): 4/

Epoch 10: 100%|██████████| 99/99 [01:17<00:00,  1.29batch/s, loss=2.31e+3]
100%|██████████| 99/99 [01:17<00:00,  1.29it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


Accuracy (Security_Analyst): 4/5
Accuracy (Systems_Administrator): 3/5
Accuracy (Project_manager): 5/5
Accuracy (Database_Administrator): 4/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 4/5
Accuracy (Web_Developer): 2/5
Accuracy (Java_Developer): 4/5
Accuracy (Network_Administrator): 4/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 3/5
Accuracy (Systems_Administrator): 5/5
Accuracy (Project_manager): 3/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 3/5
Accuracy (Web_Developer): 5/5
Accuracy (Java_Developer): 3/5
Accuracy (Network_Administrator): 5/5
Accuracy (Python_Developer): 3/5
Accuracy (Security_Analyst): 4/5
Accuracy (Systems_Administrator): 4/5
Accuracy (Project_manager): 5/5
Accuracy (Database_Administrator): 3/5
Accuracy (Software_Developer): 5/5
Accuracy (Front_End_Developer): 5/5
Accuracy (Web_Developer): 1/5
Accuracy (Java_Developer): 1/5
Accuracy (Network_Administrator): 2/

In [None]:
# Clear gpu
# del df_train
# del train_dataset
# del train_dataloader
# del optimizer
# del criterion
# del model
# del question_tokenizer
# del context_tokenizer
# del question_encoder
# del context_encoder
# clear_memory()


16.25
32.0




In [None]:
# 512 token max length
# 14 characters -> 3 tokens
# num characters: 510 * 14/3 = 2380
# num words: 366 to 476

# Test

In [None]:
# question_encoder = DPRQuestionEncoder.from_pretrained(
#     "facebook/dpr-question_encoder-single-nq-base"
# ).to(device)
# context_encoder = DPRContextEncoder.from_pretrained(
#     "facebook/dpr-ctx_encoder-single-nq-base"
# ).to(device)
# test(question_encoder, context_encoder)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

Python 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
Type 'copyright', 'credits' or 'license' for more information
IPython 8.20.0 -- An enhanced Interactive Python. Type '?' for help.

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 question_inputs_ids

NameError: name 'question_inputs_ids' is not defined

Out[2]: 
tensor([[ 101, 9262, 1035, 9722,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0