***Installo i componenti che mi servono dopo prendendo le specifiche da setup.py***

In [1]:
! pip install hydra-core>=1.0.0

In [2]:
! pip install torch



In [3]:
! pip install regex
! pip install numpy
! pip install filelock



In [6]:
! pip install wget
! pip install jsonlines
! pip install editdistance



In [5]:
! pip install attrs



In [8]:
! pip install transformers>=4.3
! pip install tqdm>=4.27

In [9]:
! pip install spacy>=2.1.8
! pip install omegaconf>=2.0.1 

In [10]:
!  python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
! pip install MarkupSafe>=2.0.0

In [12]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


***Test generate embedding***

In [7]:
"""Generate dense embeddings. """
from hydra.errors import HydraException
import os 
import math 
import hydra
import pickle 
import pathlib
import logging 

import torch 
import torch.nn as nn 

from typing import List, Tuple 
from omegaconf import DictConfig, OmegaConf
from dpr.options import set_cfg_params_from_state, setup_cfg_gpu, setup_logger
from dpr.utils.data_utils import Tensorizer
from dpr.utils.model_utils import (
    setup_for_distributed_mode,
    get_model_obj,
    load_states_from_checkpoint,
    move_to_device,
)
from dpr.models import init_biencoder_components
from dpr.data.biencoder_data import BiEncoderTable
from dpr.data.table_data import prepare_table_ctx_inputs_batch


logger = logging.getLogger()
setup_logger(logger)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:

def get_table_ctx_vectors(
    cfg: DictConfig, 
    ctx_rows: List[Tuple[object, BiEncoderTable]], 
    model: nn.Module, 
    tensorizer: Tensorizer, 
    insert_title: bool = True, 
): 
    """Encode table with context encoder under global/rowcol/auxemb settings."""
    n = len(ctx_rows) 
    bsz = cfg.batch_size 
    total = 0 
    results = [] 
    for j, batch_start in enumerate(range(0, n, bsz)): 
        batch = ctx_rows[batch_start: batch_start + bsz] 
        input_tensors = prepare_table_ctx_inputs_batch(
            batch, 
            tensorizer.tokenizer, 
            cfg.structure_option, 
            insert_title, 
            cfg.max_sequence_length, 
        )
        ctx_ids_batch = move_to_device(input_tensors['token_ids'], cfg.device)
        ctx_seg_batch = move_to_device(torch.zeros_like(ctx_ids_batch).long(), cfg.device)
        
        if cfg.structure_option == "rowcol": 
            ctx_attn_mask = move_to_device(input_tensors['attn_mask'], cfg.device) 
        else: 
            ctx_attn_mask = move_to_device(tensorizer.get_attn_mask(ctx_ids_batch), cfg.device)
        
        if cfg.structure_option == 'auxemb': 
            ctx_row_batch = move_to_device(input_tensors['row_ids'], cfg.device) 
            ctx_col_batch = move_to_device(input_tensors['column_ids'], cfg.device)
            with torch.no_grad(): 
                _, out, _ = model(
                    input_ids=ctx_ids_batch, 
                    token_type_ids=ctx_seg_batch, 
                    attention_mask=ctx_attn_mask, 
                    row_ids=ctx_row_batch, 
                    column_ids=ctx_col_batch, 
                )
        elif cfg.structure_option == "biased": 
            ctx_attn_mask = move_to_device(tensorizer.get_attn_mask(ctx_ids_batch), cfg.device)
            ctx_bias_mask_id = move_to_device(input_tensors['row_ids'], cfg.device) 
            ctx_col_batch = move_to_device(input_tensors['column_ids'], cfg.device)
            with torch.no_grad(): 
                _, out, _ = model(
                    input_ids=ctx_ids_batch, 
                    token_type_ids=ctx_seg_batch, 
                    attention_mask=ctx_attn_mask, 
                    row_ids=ctx_bias_mask_id, 
                    column_ids=ctx_col_batch, 
                )
        else: 
            with torch.no_grad(): 
                _, out, _ = model(
                    input_ids=ctx_ids_batch, 
                    token_type_ids=ctx_seg_batch, 
                    attention_mask=ctx_attn_mask, 
                )
        out = out.cpu() 
        
        ctx_ids= [r[0] for r in batch] 
        extra_info = [] 
        if len(batch[0]) > 3: extra_info = [r[3:] for r in batch]
        assert len(ctx_ids) == out.size(0)
        total += len(ctx_ids)
        
        if extra_info:
            results.extend([(ctx_ids[i], out[i].view(-1).numpy(), *extra_info[i]) for i in range(out.size(0))])
        else:
            results.extend([(ctx_ids[i], out[i].view(-1).numpy()) for i in range(out.size(0))])

        if total % 10 == 0: logger.info("Encoded passages %d", total)
            
    return results 
        


In [11]:
@hydra.main(config_path="conf", config_name="gen_embs")
def main(cfg: DictConfig):

    assert cfg.model_file, "Please specify encoder checkpoint as model_file param" #nel file gen_embs.yaml va valorizzato questo attributo con un trained bi-encoder checkpoint file per inizializzare il modello
    assert cfg.ctx_src, "Please specify passages source as ctx_src param" #di default è inizalizzato a nq_table

    #Configura parametri per CUDA,GPU e il training distribuito
    cfg = setup_cfg_gpu(cfg)

    #queste due funzioni caricano il checkpoint e settano i parametri che vengono utilizzati successivamente
    saved_state = load_states_from_checkpoint(cfg.model_file)
    set_cfg_params_from_state(saved_state.encoder_params, cfg)

    logger.info("CFG:")
    logger.info("%s", OmegaConf.to_yaml(cfg))
    
    #Valorizzo variabili e preparo modello bi-encoder
    tensorizer, encoder, _ = init_biencoder_components(cfg.encoder.encoder_model_type, cfg, inference_only=True)

    encoder = encoder.ctx_model if cfg.encoder_type == "ctx" else encoder.question_model

    #Configurazione modello pytorch
    encoder, _ = setup_for_distributed_mode(
        encoder, None, cfg.device, cfg.n_gpu,
        cfg.local_rank, cfg.fp16, cfg.fp16_opt_level,
    )
    encoder.eval()

    # load weights from the model file
    model_to_load = get_model_obj(encoder)
    logger.info("Loading saved model state ...")
    logger.debug("saved model keys =%s", saved_state.model_dict.keys())

    prefix_len = len("ctx_model.")
    ctx_state = {
        key[prefix_len:]: value for (key, value) in saved_state.model_dict.items() if key.startswith("ctx_model.")
    }
    model_to_load.load_state_dict(ctx_state, strict=False)

    # load from table data sources 
    logger.info("reading data source: %s", cfg.ctx_src)

    ctx_src = hydra.utils.instantiate(cfg.ctx_sources[cfg.ctx_src])
    all_passages_dict = {}
    ctx_src.load_data_to(all_passages_dict, cfg)
    all_passages = [(k, v) for k, v in all_passages_dict.items()]

    shard_size = math.ceil(len(all_passages) / cfg.num_shards)
    start_idx = cfg.shard_id * shard_size
    end_idx = start_idx + shard_size

    logger.info(
        "Producing encodings for passages range: %d to %d (out of total %d)",
        start_idx, end_idx, len(all_passages),
    )
    shard_passages = all_passages[start_idx:end_idx]
    
    data = get_table_ctx_vectors(cfg, shard_passages, encoder, tensorizer, insert_title=True)

    file = cfg.out_file + "_" + str(cfg.shard_id)
    pathlib.Path(os.path.dirname(file)).mkdir(parents=True, exist_ok=True)
    logger.info("Writing results to %s" % file)
    with open(file, mode="wb") as f:
        pickle.dump(data, f)

    logger.info("Total passages processed %d. Written to %s", len(data), file)



The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="conf", config_name="gen_embs")


**Main pezzo per pezzo**

In [55]:
from omegaconf import OmegaConf
cfg = OmegaConf.load('conf/gen_embs.yaml')

In [56]:
cfg = setup_cfg_gpu(cfg)

[140617460925696] 2024-07-23 16:45:07,562 [INFO] root: CFG's local_rank=-1
[140617460925696] 2024-07-23 16:45:07,565 [INFO] root: Env WORLD_SIZE=None
[140617460925696] 2024-07-23 16:45:07,567 [INFO] root: Initialized host lggpu as d.rank -1 on device=cuda, n_gpu=1, world size=1
[140617460925696] 2024-07-23 16:45:07,568 [INFO] root: 16-bits training: False 


In [57]:
print(cfg.model_file)

/home/angelo/nqt-retrieval/nqt-retrieval/bert-base-encoder.cp


In [58]:
saved_state = load_states_from_checkpoint(cfg.model_file)


[140617460925696] 2024-07-23 16:45:10,250 [INFO] root: Reading saved model from /home/angelo/nqt-retrieval/nqt-retrieval/bert-base-encoder.cp
[140617460925696] 2024-07-23 16:45:10,819 [INFO] root: model_state_dict keys odict_keys(['model_dict', 'optimizer_dict', 'scheduler_dict', 'offset', 'epoch', 'encoder_params'])


In [59]:
set_cfg_params_from_state(saved_state.encoder_params, cfg)

In [60]:
logger.info("CFG:")
logger.info("%s", OmegaConf.to_yaml(cfg))

[140617460925696] 2024-07-23 16:45:13,399 [INFO] root: CFG:
[140617460925696] 2024-07-23 16:45:13,406 [INFO] root: defaults:
- encoder: hf_bert
- ctx_sources: table_sources
model_file: /home/angelo/nqt-retrieval/nqt-retrieval/bert-base-encoder.cp
ctx_src: nq_table
encoder_type: ctx
out_file: null
do_lower_case: true
shard_id: 0
num_shards: 1
batch_size: 32
tables_as_passages: false
special_tokens: null
tables_chunk_sz: 100
tables_split_type: type1
local_rank: -1
device: cuda
distributed_world_size: 1
distributed_port: null
no_cuda: false
n_gpu: 1
fp16: false
fp16_opt_level: O1
row_selection: none
max_cell_num: None
max_words: 120
max_words_per_header: 12
max_words_per_cell: 8
max_cell_num_per_row: 64
header_delimiter: '|'
cell_delimiter: '|'
row_delimiter: .
max_sequence_length: 256
structure_option: global



In [61]:
print(cfg.ctx_src)

nq_table


In [65]:
cfg.encoder = OmegaConf.load('conf/encoder/hf_bert.yaml')

In [66]:
print(cfg.encoder.encoder_model_type)

hf_bert


In [67]:
tensorizer, encoder, _ = init_biencoder_components(cfg.encoder.encoder_model_type, cfg, inference_only=True)

[140617460925696] 2024-07-23 16:45:55,592 [INFO] dpr.models.hf_models: Initializing HF BERT Encoder. cfg_name=bert-base-uncased
[140617460925696] 2024-07-23 16:46:34,165 [INFO] dpr.models.hf_models: Initializing HF BERT Encoder. cfg_name=bert-base-uncased


In [68]:
encoder = encoder.ctx_model if cfg.encoder_type == "ctx" else encoder.question_model

In [69]:
encoder, _ = setup_for_distributed_mode(
        encoder, None, cfg.device, cfg.n_gpu,
        cfg.local_rank, cfg.fp16, cfg.fp16_opt_level,
)
encoder.eval()

HFBertEncoder(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=F

In [70]:
# load weights from the model file
model_to_load = get_model_obj(encoder)
logger.info("Loading saved model state ...")
logger.debug("saved model keys =%s", saved_state.model_dict.keys())

prefix_len = len("ctx_model.")
ctx_state = {
    key[prefix_len:]: value for (key, value) in saved_state.model_dict.items() if key.startswith("ctx_model.")
}
model_to_load.load_state_dict(ctx_state, strict=False)

[140617460925696] 2024-07-23 16:51:21,402 [INFO] root: Loading saved model state ...


<All keys matched successfully>

In [72]:
logger.info("reading data source: %s", cfg.ctx_src)

[140617460925696] 2024-07-23 16:52:23,633 [INFO] root: reading data source: nq_table


In [None]:
ctx_src = hydra.utils.instantiate(cfg.ctx_sources[cfg.ctx_src])

In [None]:

all_passages_dict = {}
ctx_src.load_data_to(all_passages_dict, cfg)
all_passages = [(k, v) for k, v in all_passages_dict.items()]

shard_size = math.ceil(len(all_passages) / cfg.num_shards)
start_idx = cfg.shard_id * shard_size
end_idx = start_idx + shard_size