In [1]:
import torch

# Report CUDA availability and the name of every visible GPU.
# The devices are enumerated instead of hard-coding indices 0 and 1, so the
# cell also runs on machines with zero, one, or more than two GPUs.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
for gpu_index in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(gpu_index))

True
2
NVIDIA GeForce RTX 3080
NVIDIA GeForce RTX 3060


In [3]:
from utils import *

def analyze_json_responses(my_path):
    """Summarise answer accuracy for a JSON results file.

    Each entry is split on its ``ans_in_documents`` flag (answer present in
    the context or not) and counted as correct when ``ans_match_after_norm``
    is truthy. Missing flags are treated as ``False``.

    Args:
        my_path: Path handed to ``read_json``; the loaded value is iterated
            as a sequence of dict-like entries.

    Returns:
        dict with the total number of examples, the per-group example and
        correct counts, the per-group accuracy, and the overall accuracy.
        A ratio whose denominator is zero is reported as 0.
    """
    records = read_json(my_path)
    n_total = len(records)

    # Indexed by "is the answer in the context?" (True / False).
    seen = {True: 0, False: 0}
    hits = {True: 0, False: 0}

    for record in records:
        in_context = bool(record.get("ans_in_documents", False))
        seen[in_context] += 1
        if record.get("ans_match_after_norm", False):
            hits[in_context] += 1

    def _ratio(numerator, denominator):
        # Guard against empty groups / empty files.
        return numerator / denominator if denominator > 0 else 0

    # Every correct answer falls in exactly one of the two groups.
    n_correct = hits[True] + hits[False]

    summary = {}
    summary["total_examples"] = n_total
    summary["examples_with_answer_in_context"] = seen[True]
    summary["examples_without_answer_in_context"] = seen[False]
    summary["correct_with_context"] = hits[True]
    summary["correct_without_context"] = hits[False]
    summary["average_correct_with_context"] = _ratio(hits[True], seen[True])
    summary["average_correct_without_context"] = _ratio(hits[False], seen[False])
    summary["overall_accuracy"] = _ratio(n_correct, n_total)
    return summary


# Usage example: summarise one generation-results file and print the metrics.
# NOTE: the path is an absolute, machine-specific Windows location.
path = r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_res_example_llm\nq\gemma-2-2b-it\test\retrieved\contriever\1_doc\numdoc1_retr1_template_info_all_extended.json'
result = analyze_json_responses(path)
print(result)

{'total_examples': 2889, 'examples_with_answer_in_context': 721, 'examples_without_answer_in_context': 2168, 'correct_with_context': 539, 'correct_without_context': 87, 'average_correct_with_context': 0.7475728155339806, 'average_correct_without_context': 0.04012915129151291, 'overall_accuracy': 0.2166839736933195}


In [6]:
import os
import argparse
import warnings
import pandas as pd
import re
from tqdm import tqdm
from typing import Tuple, Dict, Optional

import torch
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizer

from utils import *
from bgm import BGM
from default_prompts import *
from prompt_dataset import PromptDataset

# Sets TOKENIZERS_PARALLELISM to "false" (presumably to disable the
# tokenizers library's own parallelism — confirm against its docs).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# First CUDA device when CUDA is available, otherwise the CPU.
device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE: suppresses every Python warning from here on.
warnings.filterwarnings('ignore')
# Seed handed to seed_everything() right before main() is called.
SEED=10

# Input files per dataset and split; read as info[args.dataset][args.split][...].
# Paths are absolute and machine-specific.
info = {
    "nq_bgm": {
        "train": {
            "data_path": r'C:\Users\franc\Documents\Bridge_the_GAP\data\10k_train_dataset.json',
            "contriever_search_results_path": r"C:\Users\franc\Documents\Bridge_the_GAP\data\processed\contriever_search_results_at150.pkl",
        }
    },
}

def save_dataloader_to_json(dataloader, output_file, num_examples=15):
    """Dump the first batches yielded by ``dataloader`` to a JSON file.

    Args:
        dataloader: Iterable yielding dict-like batches (``.items()`` is used).
        output_file: Destination path; written as UTF-8, indented JSON.
        num_examples: Maximum number of *batches* (iteration steps) to save.

    Tensor values are converted with ``.tolist()``; every other value is
    written as-is and must therefore already be JSON-serialisable.
    """
    # Imported locally: this cell never imports ``json`` itself and would
    # otherwise depend on ``from utils import *`` happening to re-export it.
    import json

    all_batches = []

    print("Saving DataLoader contents to JSON...")
    for idx, batch in enumerate(dataloader):
        if idx >= num_examples:  # Stop after the requested number of batches
            break

        # Convert tensors to plain lists so json can serialise them.
        all_batches.append({
            key: value.tolist() if isinstance(value, torch.Tensor) else value
            for key, value in batch.items()
        })

    # Save the entire list of dictionaries to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_batches, f, ensure_ascii=False, indent=4)

    print(f"DataLoader contents saved to {output_file}")

class DotDict:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance namespace.
        vars(self).update(kwargs)

def parse_arguments(custom_args=None):
    """Build the run configuration without going through argparse.

    Args:
        custom_args: Optional dict whose entries override (or extend) the
            defaults below; intended for notebook use.

    Returns:
        A ``DotDict`` exposing every setting as an attribute.

    Raises:
        ValueError: if ``num_retrieved_documents`` is ``None`` or not
            positive, or if ``gold_position`` is set but lies outside
            ``[0, num_retrieved_documents)``.
    """
    settings = dict(
        output_dir=r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_id_document_bgm',
        llm_id='google/flan-t5-large',
        dataset='nq_bgm',
        model_max_length=4096,
        quantization_bits=4,
        use_model_chat_template=False,
        gold_position=None,
        num_retrieved_documents=5,
        use_test=False,
        max_new_tokens=50,
        use_task_with_proof=False,
        batch_size=None,
        save_every=250,
    )

    # Caller-supplied values win over the defaults.
    if custom_args:
        settings.update(custom_args)

    # Validation
    n_docs = settings['num_retrieved_documents']
    if n_docs is None:
        raise ValueError("'num_retrieved_documents' must be specified.")
    if n_docs <= 0:
        raise ValueError("'num_retrieved_documents' must be a positive integer.")

    gold = settings['gold_position']
    if gold is not None and (gold < 0 or gold >= n_docs):
        raise ValueError("'gold_position' must be within the range of 'num_retrieved_documents'.")

    return DotDict(**settings)


def load_corpus(
    args: argparse.Namespace
) -> Tuple[List[Dict], Optional[Dict[int, int]]]:
    """Return the corpus and index map produced by ``read_corpus_with_contriever()``.

    ``args`` is accepted for signature symmetry with the other loaders but is
    not read here.
    """
    
    # Corpus with documents from Contriever
    corpus, full_to_subset_idx_map = read_corpus_with_contriever()

    return corpus, full_to_subset_idx_map

def load_search_results(args: argparse.Namespace) -> List[Tuple[List[int], List[float]]]:
    """Load the retriever search results for ``args.dataset`` / ``args.split``.

    The file location comes from the module-level ``info`` table and is read
    with ``read_pickle``. ``args.split`` is not one of the parse_arguments()
    defaults; main() sets it before calling this function.
    """

    search_results_path = info[args.dataset][args.split]['contriever_search_results_path']
    # NOTE(review): pickle files should only be loaded from trusted sources.
    retriever_search_results = read_pickle(search_results_path)

    return retriever_search_results


def get_prompt_template(args: argparse.Namespace):
    """Select the prompt template for the configured dataset.

    Without the model chat template, the entry of ``task_templates`` keyed by
    ``args.dataset`` builds the template. Otherwise the chat template string
    registered for ``args.llm_id`` is combined with the dataset's task
    instruction through ``apply_chat_task_template``.
    """
    config_key = args.dataset

    if not args.use_model_chat_template:
        return task_templates[config_key].create_prompt_template()

    chat_template = chat_task_templates[args.llm_id]['template']
    instruction = task_instructions[config_key]
    return apply_chat_task_template(chat_template, instruction)


def initialize_dataset_and_loader(
    args: argparse.Namespace, 
    corpus: List[Dict], 
    full_to_subset_idx_map: Optional[Dict[int, int]], 
    retriever_search_results: List[Tuple[List[int], List[float]]], 
    tokenizer: PreTrainedTokenizer
) -> DataLoader:
    """Build the ``PromptDataset`` for the selected split and wrap it in a DataLoader.

    The dataset reads its examples from ``info[args.dataset][args.split]['data_path']``
    and is limited to ``args.model_max_length - 2`` tokens per prompt.
    Only the DataLoader is returned; the dataset is reachable through it.
    """
    template = get_prompt_template(args)

    dataset_options = dict(
        corpus=corpus,
        data_path=info[args.dataset][args.split]['data_path'],
        tokenizer=tokenizer,
        max_tokenized_length=args.model_max_length - 2,
        search_results=retriever_search_results,
        prompt_template=template,
        full_to_subset_idx_map=full_to_subset_idx_map,
        do_normalize_query=True,
        num_documents_in_context=args.num_retrieved_documents,
        gold_position=args.gold_position,  # None in these experiments
    )
    prompt_ds = PromptDataset(**dataset_options)

    # Fixed order (no shuffling), 8 worker processes, pinned host memory.
    return DataLoader(
        prompt_ds,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )


def print_info(args: argparse.Namespace):
    """Print the run configuration, one ``LABEL: value`` line per setting."""
    print("INFO:")

    data_path = info[args.dataset][args.split]['data_path']
    print(f"DATA: {data_path}")

    # (label, attribute) pairs in display order; attributes are read lazily
    # so lines are emitted up to the first missing attribute.
    labelled_attributes = (
        ("USE TEST", "use_test"),
        ("MODEL", "llm_id"),
        ("MODEL MAX LENGTH", "model_max_length"),
        ("MAX NEW TOKENS", "max_new_tokens"),
        ("USE MODEL CHAT TEMPLATE", "use_model_chat_template"),
        ("TASK WITH PROOF", "use_task_with_proof"),
        ("GOLD POSITION", "gold_position"),
        ("NUM DOCUMENTS IN CONTEXT", "num_retrieved_documents"),
        ("BATCH SIZE", "batch_size"),
        ("SAVE EVERY", "save_every"),
    )
    for label, attribute in labelled_attributes:
        print(f"{label}: {getattr(args, attribute)}")


def extract_generate_answers(
    args: argparse.Namespace, 
    generated_output: List[str]
) -> List[str]:
    """Strip everything up to the first answer prefix from each generated text.

    Args:
        args: Needs ``use_model_chat_template``; when it is true, the prefix
            registered for ``args.llm_id`` in ``chat_task_templates`` is used
            (escaped, so it is matched literally) instead of ``"Answer:"``.
        generated_output: Raw decoded model outputs.

    Returns:
        One stripped answer per output. An output that does not contain the
        prefix at all is returned whole (stripped) instead of raising
        ``IndexError``, which is what indexing the empty match list did.
    """
    answer_prefix = "Answer:"
    if args.use_model_chat_template:
        answer_prefix = re.escape(chat_task_templates[args.llm_id]['answer_prefix'])
    prefix_pattern = re.compile(answer_prefix)

    generated_answers = []
    for output in generated_output:
        # Only the first occurrence matters (the original always used index 0).
        first_match = prefix_pattern.search(output)
        answer_start = first_match.end() if first_match else 0
        generated_answers.append(output[answer_start:].strip())

    return generated_answers


def BGMTraining(
    args: argparse.Namespace, 
    prompt_ds: PromptDataset,
    llm: BGM, 
    prompt_dataloader: DataLoader
):
    """Generate an answer for every prompt batch and collect the results.

    Args:
        args: Run configuration (``llm_id``, ``num_retrieved_documents``,
            ``output_dir``, ``dataset``, ``split``, ``max_new_tokens``, ...).
        prompt_ds: Dataset behind ``prompt_dataloader``. Currently unused; it
            was only referenced by the unfinished candidate-document loop
            (see the TODO below).
        llm: Model wrapper exposing ``generate(prompts, max_new_tokens=...)``.
        prompt_dataloader: Yields dict batches containing at least ``'prompt'``.

    Returns:
        List of the batch dicts, each extended with ``'generated_answer'``.
        (The original returned nothing and discarded the results.)
    """
    # Info from arguments
    llm_id = args.llm_id
    num_doc = args.num_retrieved_documents
    retriever_str = "contriever"
    prompt_type = "retrieved_proof" if args.use_task_with_proof else "retrieved"

    # Create the saving directory
    llm_folder = llm_id.split("/")[1] if '/' in llm_id else llm_id
    saving_dir = f"{args.output_dir}/{args.dataset}/{llm_folder}/{args.split}/{prompt_type}/{retriever_str}/{num_doc}_doc"
    os.makedirs(saving_dir, exist_ok=True)

    all_info = []
    for prompt_batch in tqdm(prompt_dataloader):
        prompts = prompt_batch['prompt']

        # TODO: the original had an unfinished loop here that accumulated
        # `document_indices` into `candidate_docs`, called
        # `prompt_ds._get_documents_from_indices(candidate_docs)` and appended
        # '\nAnswer:' to `candidate_prompt`. Neither name was ever defined, so
        # the loop raised NameError on its first iteration, and its results
        # were not used by the generation below. Re-introduce it once the
        # candidate-prompt construction is specified.

        generated_output = llm.generate(
            prompts, 
            max_new_tokens=args.max_new_tokens
        )

        prompt_batch['generated_answer'] = extract_generate_answers(args, generated_output)
        all_info.append(prompt_batch)

        # Periodic checkpointing was disabled in the original (kept inside a
        # string literal). Preserved here for reference:
        #   save_every = args.save_every
        #   chat_template_str = "_template" if args.use_model_chat_template else ""
        #   if (idx + 1) % save_every == 0 or (idx + 1) == len(prompt_dataloader):
        #       print(f"Saving at {idx + 1}...")
        #       file_name = f"{saving_dir}/numdoc{num_doc}_retr{args.num_retrieved_documents}{chat_template_str}_info_{idx+1}.pkl"
        #       write_pickle(all_info, file_name)
        #       all_info = []

    return all_info


def main():
    """Load the model, corpus and search results, build the prompts, and run generation."""
    args = parse_arguments()

    # `split` is not a parse_arguments() default; it is derived from `use_test`.
    args.split = "test" if args.use_test else "train"

    print("Loading LLM...")
    llm_id = args.llm_id
    bgm = BGM(
        llm_id, device,  
        model_max_length=args.model_max_length
    )
    tokenizer = bgm.tokenizer
    print("LLM loaded")


    print("Loading corpus and search results...")
    corpus, full_to_subset_idx_map = load_corpus(args)
    retriever_search_results = load_search_results(args)
    print("Corpus and search results loaded")


    print("Loading prompt dataset...")
    # initialize_dataset_and_loader() returns only the DataLoader. Unpacking it
    # into two names would iterate the loader and fail, so the dataset is
    # taken from the loader's `dataset` attribute instead.
    prompt_dataloader = initialize_dataset_and_loader(
        args, corpus, full_to_subset_idx_map, 
        retriever_search_results, tokenizer
    )
    prompt_ds = prompt_dataloader.dataset
    print("Prompt dataset loaded")

    print_info(args)

    #output_json_path = r'C:\Users\franc\Documents\Bridge_the_GAP\data\dataloader_contents.json'
    #save_dataloader_to_json(prompt_dataloader, output_json_path, num_examples=15)
        
    BGMTraining(args, prompt_ds, bgm, prompt_dataloader)



# Entry point: seed with SEED via seed_everything(), then run the pipeline.
if __name__ == "__main__":
    seed_everything(SEED)
    main()

Loading LLM...
LLM loaded
Loading corpus and search results...
Corpus and search results loaded
Loading prompt dataset...
Prompt dataset loaded
INFO:
DATA: C:\Users\franc\Documents\Bridge_the_GAP\data\10k_train_dataset.json
USE TEST: False
MODEL: google/flan-t5-large
MODEL MAX LENGTH: 4096
MAX NEW TOKENS: 50
USE MODEL CHAT TEMPLATE: False
TASK WITH PROOF: False
GOLD POSITION: None
NUM DOCUMENTS IN CONTEXT: 3
BATCH SIZE: None
SAVE EVERY: 250
Saving DataLoader contents to JSON...
DataLoader contents saved to C:\Users\franc\Documents\Bridge_the_GAP\data\dataloader_contents.json


In [None]:
import json

def match_example_ids(file1_path, file2_path, output_path):
    """Copy ``example_id`` from file 2 into file 1 wherever query and question match.

    Every entry of file 1 whose ``query`` equals the ``question`` of an entry
    in file 2 receives that entry's ``example_id``; other entries are left
    unchanged. The result is written to ``output_path``.

    All files are opened as UTF-8 (as the sibling
    ``update_queries_with_document_indices`` does) instead of the platform
    default encoding, which on Windows is not UTF-8.

    Errors are reported on stdout rather than raised.

    Args:
        file1_path (str): Path of the JSON file to update.
        file2_path (str): Path of the JSON file providing ``question`` / ``example_id``.
        output_path (str): Path of the updated JSON file to write.
    """
    try:
        # Load both JSON files
        with open(file1_path, 'r', encoding='utf-8') as f1:
            file1 = json.load(f1)

        with open(file2_path, 'r', encoding='utf-8') as f2:
            file2 = json.load(f2)

        # Map each question to its example_id (the last duplicate wins)
        question_to_example_id = {item['question']: item['example_id'] for item in file2}

        # Update the first file in memory
        for entry in file1:
            query = entry.get('query')
            if query in question_to_example_id:
                entry['example_id'] = question_to_example_id[query]

        # Write the updated file
        with open(output_path, 'w', encoding='utf-8') as f1_updated:
            json.dump(file1, f1_updated, indent=4)

        print(f"File aggiornato salvato in: {output_path}")
    except FileNotFoundError as e:
        print(f"Errore: {e}")
    except json.JSONDecodeError as e:
        print(f"Errore nel parsing del file JSON: {e}")
    except Exception as e:
        print(f"Errore imprevisto: {e}")

def update_queries_with_document_indices(file1_path, file2_path, output_path):
    """Copy ``document_indices`` from file 2 into the matching entries of file 1.

    Entries are matched on their ``query`` value. A file-2 entry without
    ``document_indices`` contributes an empty list. File-1 entries whose query
    does not occur in file 2 are left untouched. The result is written to
    ``output_path`` as UTF-8, indented JSON without ASCII escaping.
    """
    # Load both inputs
    with open(file1_path, 'r', encoding='utf-8') as targets_fh:
        with open(file2_path, 'r', encoding='utf-8') as source_fh:
            targets = json.load(targets_fh)
            sources = json.load(source_fh)

    # query -> document_indices taken from file 2 (later duplicates override)
    indices_by_query = {}
    for source_entry in sources:
        indices_by_query[source_entry['query']] = source_entry.get('document_indices', [])

    # Attach the indices to the matching file-1 entries
    for target_entry in targets:
        key = target_entry['query']
        if key in indices_by_query:
            target_entry['document_indices'] = indices_by_query[key]

    # Write the merged result
    with open(output_path, 'w', encoding='utf-8') as out_fh:
        json.dump(targets, out_fh, indent=4, ensure_ascii=False)


# Driver for the two helpers above. All paths are absolute and machine-specific.
# path_output: file written by match_example_ids.
path_output=r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_template_info_all_extended_updated.json'
# file_da_modificare: file to update; file_di_confronto: file supplying question/example_id.
file_da_modificare = r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_template_info_all_extended.json'
file_di_confronto = r'C:\Users\franc\Documents\Bridge_the_GAP\data\10k_train_dataset.json'

match_example_ids(file_da_modificare, file_di_confronto, path_output)

# NOTE(review): the first two arguments below are the same file, so each entry
# can only receive the document_indices of an entry with the same query from
# that very file. Confirm which file was meant to supply the indices.
update_queries_with_document_indices(r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_template_info_all_extended_updated.json', r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_template_info_all_extended_updated.json', r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_template_info_all_extended_updated_last.json')

File aggiornato salvato in: C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_template_info_all_extended_updated.json


In [11]:
import json
import random

# Share of the total number of examples allotted to each case; process_data
# turns each share into a per-case cap with int(total_examples * share).
percentages = {
    "case_1_single_doc": 0.1,
    "case_2_multiple_docs": 0.2,
    "case_3_no_docs": 0.1,
    "case_4_less_docs": 0.4,
    "case_5_reranking": 0.2,
}

# Task instruction prepended to every query
task_instruction = "Output only the document IDs relevant to the query. Use this format: [ID1, ID2, ...]."

def process_data(input_file, output_file):
    """Build the training dataset from the annotated examples and save it as JSON.

    The examples are shuffled, then each one with a truthy ``are_answer`` is
    assigned to the first case (in the order below) that matches its number
    of ``selected_documents`` and still has room under its cap
    ``int(total_examples * percentages[case])``:

      1. exactly one selected doc      -> input: retrieved docs, output: selected
      2. more than one selected doc    -> input: retrieved docs, output: selected
      3. no selected doc               -> input: [],             output: []
      4. more than two selected docs   -> input: selected,       output: selected
      5. more than two selected docs   -> input: selected in a different order,
                                          output: selected

    Changes with respect to the previous version:
      * case 5 now guarantees that the input order differs from the output
        order whenever a different order exists (``random.sample`` alone can
        return the original order);
      * the output file is written with an explicit UTF-8 encoding.

    Args:
        input_file: JSON file with a list of examples providing ``query``,
            ``document_indices``, ``selected_documents`` and ``are_answer``.
        output_file: Destination JSON file.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        examples = json.load(f)

    dataset = []

    # Shuffle examples to ensure random sampling
    random.shuffle(examples)

    # Per-case caps derived from the total number of examples
    total_examples = len(examples)
    case_limits = {case: int(total_examples * perc) for case, perc in percentages.items()}
    case_counters = {case: 0 for case in percentages}

    def has_room(case):
        return case_counters[case] < case_limits[case]

    def emit(case, query, input_docs, output_docs):
        dataset.append({
            "input": {
                "query": query,
                "retrieved_docs": input_docs,
            },
            "output": output_docs,
        })
        case_counters[case] += 1

    def reordered(docs):
        # A different order only exists if at least two entries differ.
        if all(doc == docs[0] for doc in docs):
            return list(docs)
        shuffled = random.sample(docs, len(docs))
        while shuffled == docs:
            shuffled = random.sample(docs, len(docs))
        return shuffled

    for example in examples:
        if all(count >= case_limits[case] for case, count in case_counters.items()):
            break  # Stop if all case limits are met

        query = f"Task Instruction: {task_instruction}\nQuestion:{example['query']}"  # Prepend the task instruction
        retrieved_docs = example["document_indices"]
        selected_docs = example["selected_documents"]

        if not example["are_answer"]:
            continue
        n_selected = len(selected_docs)

        if n_selected == 1 and has_room("case_1_single_doc"):
            emit("case_1_single_doc", query, retrieved_docs, selected_docs)
        elif n_selected > 1 and has_room("case_2_multiple_docs"):
            emit("case_2_multiple_docs", query, retrieved_docs, selected_docs)
        elif n_selected == 0 and has_room("case_3_no_docs"):
            emit("case_3_no_docs", query, [], [])
        elif n_selected > 2 and has_room("case_4_less_docs"):
            emit("case_4_less_docs", query, selected_docs, selected_docs)
        elif n_selected > 2 and has_room("case_5_reranking"):
            emit("case_5_reranking", query, reordered(selected_docs), selected_docs)

    # Save the dataset to a file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

# Path to input and output files (absolute, machine-specific locations)
input_file = r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_info_all_extended_training_set.json'
output_file = r'C:\Users\franc\Documents\Bridge_the_GAP\data\training_dataset.json'

# Build the training dataset and write it to output_file.
process_data(input_file, output_file)

In [33]:
import json
import random

# Share allotted to each case. In this cell's process_data each share is
# applied to the size of the case's own group (0, 1, or more than 1 selected
# documents), not to the total number of examples.
# NOTE(review): cases 2, 4 and 5 all draw from the ">1 selected documents"
# group and their shares add up to 1.25, so the case checked last
# (case_5_reranking) cannot reach its planned count.
percentages = {
    "case_1_single_doc": 0.07,
    "case_2_multiple_docs": 0.4,
    "case_3_no_docs": 0.1,
    "case_4_multi_doc_unchanged": 0.35,
    "case_5_reranking": 0.5,
    "case_6_single_doc_unchanged": 0.05,
}

# Task instruction prepended to every query
task_instruction = "Output only the document IDs relevant to the query. Use this format: [ID1, ID2, ...]."

def process_data(input_file, output_file):
    """Build the training dataset from the examples with ``are_answer is True``.

    Valid examples are grouped by their number of ``selected_documents``
    (0, 1, more than 1). Each case gets a cap of
    ``int(len(group) * percentages[case])`` and every example goes to the
    first case, in the order below, that matches and still has room:

      1. one selected doc        -> input: retrieved docs, output: selected
      2. several selected docs   -> input: retrieved docs, output: selected
      3. no selected doc         -> input: [],             output: []
      4. several selected docs   -> input: selected,       output: selected
      6. one selected doc        -> input: selected,       output: selected
      5. several selected docs   -> input: selected in a different order,
                                    output: selected

    Planned and actual counts are printed, and the dataset is written to
    ``output_file`` as UTF-8 JSON.

    Fix with respect to the previous version: the reranking case no longer
    loops forever when all selected ids are equal (no different order exists);
    such an example is emitted with its order unchanged.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        examples = json.load(f)

    # Keep only the examples with are_answer = true
    valid_examples = [ex for ex in examples if ex["are_answer"] is True]

    print(f"Totale esempi nel file di input: {len(examples)}")
    print(f"Esempi con 'are_answer=True': {len(valid_examples)}")

    # Group by number of selected_documents
    grouped_examples = {
        "len_0": [ex for ex in valid_examples if len(ex["selected_documents"]) == 0],
        "len_1": [ex for ex in valid_examples if len(ex["selected_documents"]) == 1],
        "len_gt_1": [ex for ex in valid_examples if len(ex["selected_documents"]) > 1],
    }

    print(f"Esempi con 'selected_documents == 0': {len(grouped_examples['len_0'])}")
    print(f"Esempi con 'selected_documents == 1': {len(grouped_examples['len_1'])}")
    print(f"Esempi con 'selected_documents > 1': {len(grouped_examples['len_gt_1'])}")

    # Cap of each case = share of the group the case draws from
    case_to_group = {
        "case_1_single_doc": "len_1",
        "case_2_multiple_docs": "len_gt_1",
        "case_3_no_docs": "len_0",
        "case_4_multi_doc_unchanged": "len_gt_1",
        "case_5_reranking": "len_gt_1",
        "case_6_single_doc_unchanged": "len_1",
    }
    group_case_limits = {
        case: int(len(grouped_examples[group]) * percentages[case])
        for case, group in case_to_group.items()
    }

    print("Distribuzione pianificata degli esempi nel dataset creato:")
    for case, limit in group_case_limits.items():
        print(f"{case}: {limit}")

    dataset = []
    case_counters = {case: 0 for case in group_case_limits}

    def has_room(case):
        return case_counters[case] < group_case_limits[case]

    def emit(case, query, input_docs, output_docs):
        dataset.append({
            "input": {
                "query": query,
                "retrieved_docs": input_docs,
            },
            "output": output_docs,
        })
        case_counters[case] += 1

    # Process the examples
    for example in valid_examples:
        query = f"Task Instruction: {task_instruction}\nQuestion:{example['query']}"
        retrieved_docs = example["document_indices"]
        selected_docs = example["selected_documents"]
        n_selected = len(selected_docs)

        if n_selected == 1 and has_room("case_1_single_doc"):
            emit("case_1_single_doc", query, retrieved_docs, selected_docs)
        elif n_selected > 1 and has_room("case_2_multiple_docs"):
            emit("case_2_multiple_docs", query, retrieved_docs, selected_docs)
        elif n_selected == 0 and has_room("case_3_no_docs"):
            emit("case_3_no_docs", query, [], [])
        elif n_selected > 1 and has_room("case_4_multi_doc_unchanged"):
            emit("case_4_multi_doc_unchanged", query, selected_docs, selected_docs)
        elif n_selected == 1 and has_room("case_6_single_doc_unchanged"):
            emit("case_6_single_doc_unchanged", query, selected_docs, selected_docs)
        elif n_selected > 1 and has_room("case_5_reranking"):
            reranked_docs = selected_docs[:]
            # Reshuffle until the order differs, but only when a different
            # order can exist (at least two distinct ids).
            if any(doc != selected_docs[0] for doc in selected_docs):
                while reranked_docs == selected_docs:
                    reranked_docs = random.sample(selected_docs, len(selected_docs))
            emit("case_5_reranking", query, reranked_docs, selected_docs)

    print("Esempi effettivamente inclusi nel dataset creato:")
    tot=0
    for case, count in case_counters.items():
        tot += count
        print(f"{case}: {count}")

    print(f"Totale degli Esempi inclusi nel training dataset creato: {tot}")

    # Save the dataset to a file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

# Path to input and output files (absolute, machine-specific locations)
input_file = r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_ids_document_training_set_bgm\nq_training\gemma-2-2b-it\train\retrieved\contriever\5_doc\numdoc5_retr5_info_all_extended_training_set.json'
output_file = r'C:\Users\franc\Documents\Bridge_the_GAP\data\training_dataset.json'

# Build the training dataset and write it to output_file.
process_data(input_file, output_file)

Totale esempi nel file di input: 3000
Esempi con 'are_answer=True': 1233
Esempi con 'selected_documents == 0': 366
Esempi con 'selected_documents == 1': 736
Esempi con 'selected_documents > 1': 131
Distribuzione pianificata degli esempi nel dataset creato:
case_1_single_doc: 51
case_2_multiple_docs: 52
case_3_no_docs: 36
case_4_multi_doc_unchanged: 45
case_5_reranking: 65
case_6_single_doc_unchanged: 36
Esempi effettivamente inclusi nel dataset creato:
case_1_single_doc: 51
case_2_multiple_docs: 52
case_3_no_docs: 36
case_4_multi_doc_unchanged: 45
case_5_reranking: 34
case_6_single_doc_unchanged: 36
Totale degli Esempi inclusi nel training dataset creato: 254


In [None]:
import os
import re 
import argparse
import warnings
from tqdm import tqdm
from typing import Tuple, Dict, Optional

import torch
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizer

from utils import *
from bgm import BGM
from default_prompts import *
from prompt_dataset import PromptDataset

# Sets TOKENIZERS_PARALLELISM to "false" (presumably to disable the
# tokenizers library's own parallelism — confirm against its docs).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# First CUDA device when CUDA is available, otherwise the CPU.
device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE: suppresses every Python warning from here on.
warnings.filterwarnings('ignore')
SEED=10

# Input files per dataset and split; read as info[args.dataset][args.split][...].
# Paths are absolute and machine-specific.
info = {
    "nq": {
        "test": {
            "data_path": r'C:\Users\franc\Documents\Bridge_the_GAP\data\test_dataset.json',
            "contriever_search_results_path": r"C:\Users\franc\Documents\Bridge_the_GAP\data\processed\contriever_test_search_results_at150.pkl",
        }
    },
}

class DotDict:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance namespace.
        vars(self).update(kwargs)

def parse_arguments(custom_args=None):
    """Build the run configuration without going through argparse.

    Args:
        custom_args: Optional dict whose entries override (or extend) the
            defaults below; intended for notebook use.

    Returns:
        A ``DotDict`` exposing every setting as an attribute.

    Raises:
        ValueError: if ``num_retrieved_documents`` is ``None`` or not
            positive, or if ``gold_position`` is set but lies outside
            ``[0, num_retrieved_documents)``.
    """
    settings = dict(
        output_dir=r'C:\Users\franc\Documents\Bridge_the_GAP\data\gen_id_res_example_bgm',
        llm_id='google-t5/t5-base',
        dataset='nq',
        model_max_length=4096,
        quantization_bits=4,
        gold_position=None,
        use_model_chat_template=False,
        num_retrieved_documents=5,
        use_test=True,
        padding_strategy='longest',
        max_new_tokens=15,
        use_task_with_proof=False,
        batch_size=None,
        save_every=250,
    )

    # Caller-supplied values win over the defaults.
    if custom_args:
        settings.update(custom_args)

    # Validation
    n_docs = settings['num_retrieved_documents']
    if n_docs is None:
        raise ValueError("'num_retrieved_documents' must be specified.")
    if n_docs <= 0:
        raise ValueError("'num_retrieved_documents' must be a positive integer.")

    gold = settings['gold_position']
    if gold is not None and (gold < 0 or gold >= n_docs):
        raise ValueError("'gold_position' must be within the range of 'num_retrieved_documents'.")

    return DotDict(**settings)


def load_corpus(
    args: argparse.Namespace
) -> Tuple[List[Dict], Optional[Dict[int, int]]]:
    """Return the corpus and index map produced by
    ``read_test_corpus_with_random_and_contriever()``.

    ``args`` is accepted for signature symmetry with the other loaders but is
    not read here.
    """
    
    # Corpus with documents from Contriever
    corpus, full_to_subset_idx_map = read_test_corpus_with_random_and_contriever()

    return corpus, full_to_subset_idx_map

def load_search_results(args: argparse.Namespace) -> List[Tuple[List[int], List[float]]]:
    """Load the retriever search results for ``args.dataset`` / ``args.split``.

    The file location comes from the module-level ``info`` table and is read
    with ``read_pickle``. ``args.split`` is not one of the parse_arguments()
    defaults, so the caller must set it beforehand.
    """

    search_results_path = info[args.dataset][args.split]['contriever_search_results_path']
    # NOTE(review): pickle files should only be loaded from trusted sources.
    retriever_search_results = read_pickle(search_results_path)

    return retriever_search_results


def get_prompt_template(args: argparse.Namespace):
    """Select the prompt template for the configured dataset.

    Without the model chat template, the entry of ``task_templates`` keyed by
    ``args.dataset`` builds the template. Otherwise the chat template string
    registered for ``args.llm_id`` is combined with the dataset's task
    instruction through ``apply_chat_task_template``.
    """
    config_key = args.dataset

    if not args.use_model_chat_template:
        return task_templates[config_key].create_prompt_template()

    chat_template = chat_task_templates[args.llm_id]['template']
    instruction = task_instructions[config_key]
    return apply_chat_task_template(chat_template, instruction)


def initialize_dataset_and_loader(
    args: argparse.Namespace, 
    corpus: List[Dict], 
    full_to_subset_idx_map: Optional[Dict[int, int]], 
    retriever_search_results: List[Tuple[List[int], List[float]]], 
    tokenizer: PreTrainedTokenizer
) -> DataLoader:
    """Create the ``PromptDataset`` for the run and wrap it in a ``DataLoader``.

    Args:
        args: Run configuration. Reads ``dataset``, ``split``,
            ``model_max_length``, ``num_retrieved_documents``,
            ``gold_position`` and ``batch_size`` (plus whatever
            ``get_prompt_template`` needs).
        corpus: Documents passed through to ``PromptDataset``.
        full_to_subset_idx_map: Index map passed through to ``PromptDataset``.
        retriever_search_results: Search results passed through to
            ``PromptDataset``.
        tokenizer: Tokenizer passed through to ``PromptDataset``.

    Returns:
        A non-shuffling ``DataLoader`` over the prompt dataset, using 8 worker
        processes and pinned memory.
    """
    template = get_prompt_template(args)
    data_path = info[args.dataset][args.split]['data_path']

    dataset_kwargs = dict(
        corpus=corpus,
        data_path=data_path,
        tokenizer=tokenizer,
        # Two tokens fewer than the model's maximum length.
        max_tokenized_length=args.model_max_length - 2,
        search_results=retriever_search_results,
        prompt_template=template,
        full_to_subset_idx_map=full_to_subset_idx_map,
        do_normalize_query=True,
        num_documents_in_context=args.num_retrieved_documents,
        gold_position=args.gold_position,  # None in these experiments
    )
    dataset = PromptDataset(**dataset_kwargs)

    return DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )


from typing import List, Tuple, Optional, Union

from typing import List, Tuple, Optional, Union
import re

def check_document_has_answer(
    generated_id: Optional[Union[str, int, List[Union[str, int]]]],
    answers: List[str],
    prompt: str
) -> Tuple[Union[bool, str], Optional[str], Optional[str]]:
    """
    Check whether the document(s) selected by the generated ID contain one of the answers.

    Documents are extracted from ``prompt`` by looking for segments of the form
    ``Document [<id>](<anything>) <text>``; each text runs until the next
    ``Document [<id>]`` marker or the end of the prompt and is stripped of
    surrounding whitespace. Answer matching is a case-insensitive substring test.

    Args:
        generated_id: The ID produced by the model. Supported forms:
            - ``None``, an empty string or a whitespace-only string: no ID was
              produced, so every document in the prompt is searched;
            - a list of IDs (ints or numeric strings): each ID is checked in
              order, entries that cannot be converted to ``int`` are skipped;
            - a single int or numeric string: only that document is checked.
        answers: Candidate answers to look for.
        prompt: Full prompt text containing the documents.

    Returns:
        A 3-tuple ``(status, answer, document_text)``:
            - ``status`` is ``True`` when an answer was found, otherwise a
              descriptive message (the messages are in Italian and are kept
              unchanged because they are stored in the results file);
            - ``answer`` is the first matching answer, or ``None``;
            - ``document_text`` is the text of the inspected document when a
              single existing document was checked or a match was found,
              otherwise ``None``.
    """
    # Extract (id, text) pairs for every document embedded in the prompt.
    documents = re.findall(r"Document \[(\d+)\]\(.*?\):?\s(.*?)(?=Document \[\d+\]|$)", prompt, re.DOTALL)
    documents_dict = {int(doc_id): text.strip() for doc_id, text in documents}

    def find_answer(document_text):
        """Return the first answer contained in ``document_text`` (case-insensitive), else None."""
        lowered_text = document_text.lower()  # lower-case once per document, not once per answer
        for answer in answers:
            if answer.lower() in lowered_text:
                return answer
        return None

    def to_doc_id(raw_id):
        """Convert a raw ID to int; return None when it is not a valid integer."""
        try:
            return int(raw_id)
        except (TypeError, ValueError):
            # TypeError covers None / unsupported types, ValueError covers text such as 'Id_5'.
            return None

    def check_single_id(id_doc):
        """Check one document ID and build the corresponding result tuple."""
        if id_doc not in documents_dict:
            return "Documento con l'ID specificato non trovato.", None, None
        document_text = documents_dict[id_doc]
        answer = find_answer(document_text)
        if answer is not None:
            return True, answer, document_text
        return "Nessuna risposta trovata nel documento specificato.", None, document_text

    # Case 1: no usable ID (None, empty or whitespace-only string) -> search every document.
    if generated_id is None or (isinstance(generated_id, str) and not generated_id.strip()):
        for document_text in documents_dict.values():
            answer = find_answer(document_text)
            if answer is not None:
                return True, answer, document_text
        return "Nessun documento contiene le risposte.", None, None

    # Case 2: a list of IDs -> return the first document that contains an answer.
    if isinstance(generated_id, list):
        for id_item in generated_id:
            id_doc = to_doc_id(id_item)
            if id_doc is None:
                # A malformed entry must not abort the whole check; try the next ID.
                continue
            result = check_single_id(id_doc)
            if result[0] is True:
                return result
        return "Nessun documento contiene le risposte per gli ID nella lista.", None, None

    # Case 3: a single ID given as a string or an integer.
    id_doc = to_doc_id(generated_id)
    if id_doc is None:
        return "L'ID generato non è valido.", None, None
    return check_single_id(id_doc)
    

def print_info(args: argparse.Namespace):
    """Print a summary of the run configuration.

    The data path is reported for the split actually in use (``args.split``),
    which is the same key the dataset loader uses. If ``args.split`` is not
    set, the ``'test'`` split is reported.

    Args:
        args: Run configuration. Reads ``dataset``, ``split`` (optional),
            ``use_test``, ``llm_id``, ``model_max_length``, ``max_new_tokens``,
            ``use_model_chat_template``, ``use_task_with_proof``,
            ``gold_position``, ``num_retrieved_documents``, ``batch_size`` and
            ``save_every``.
    """
    # `split` is attached to args by the caller; tolerate containers that raise
    # either AttributeError or KeyError for a missing entry.
    try:
        split = args.split
    except (AttributeError, KeyError):
        split = None
    split = split or "test"

    print("INFO:")
    print(f"DATA: {info[args.dataset][split]['data_path']}")
    print(f"USE TEST: {args.use_test}")
    print(f"MODEL: {args.llm_id}")
    print(f"MODEL MAX LENGTH: {args.model_max_length}")
    print(f'MAX NEW TOKENS: {args.max_new_tokens}')
    print(f"USE MODEL CHAT TEMPLATE: {args.use_model_chat_template}")
    print(f"TASK WITH PROOF: {args.use_task_with_proof}")
    print(f"GOLD POSITION: {args.gold_position}")
    print(f"NUM DOCUMENTS IN CONTEXT: {args.num_retrieved_documents}")
    print(f"BATCH SIZE: {args.batch_size}")
    print(f"SAVE EVERY: {args.save_every}")


def generate_and_save(
    args: argparse.Namespace, 
    bgm: BGM, 
    prompt_dataloader: DataLoader,
    max_examples: Optional[int] = 100,
):
    """Generate document IDs for each prompt and save the results to a JSON file.

    For every item yielded by ``prompt_dataloader`` the prompt is trimmed so
    that it starts at ``"Question:"``, passed to ``bgm.generate``, and the
    output is checked with ``check_document_has_answer`` against the target
    answers. All per-example records are written once, at the end, to
    ``generated_results_weights_epoch_25.json`` inside a directory derived
    from ``args``.

    Note: ``prompt_batch['prompt']`` is passed directly to ``re.search``, so
    each loader item must carry a single prompt string (presumably the loader
    is built with ``batch_size=None`` -- confirm against the caller).

    Args:
        args: Run configuration. Reads ``llm_id``, ``num_retrieved_documents``,
            ``use_task_with_proof``, ``output_dir``, ``dataset``, ``split``,
            ``padding_strategy`` and ``max_new_tokens``.
        bgm: Model wrapper exposing ``generate(prompt, padding_strategy=...,
            max_new_tokens=...)``.
        prompt_dataloader: Iterable of dict-like items with the keys
            ``'prompt'``, ``'answers'`` and ``'document_indices'``.
        max_examples: Maximum number of items to process. Defaults to 100,
            the limit that was previously hard-coded; pass ``None`` to process
            the whole loader.
    """
    # Info from arguments
    llm_id = args.llm_id
    num_doc = args.num_retrieved_documents
    retriever_str = "contriever"
    prompt_type = "retrieved_proof" if args.use_task_with_proof else "retrieved"

    # Create the saving directory (the model folder is the part after the
    # slash in the model ID, e.g. "org/name" -> "name")
    llm_folder = llm_id.split("/")[1] if '/' in llm_id else llm_id
    saving_dir = f"{args.output_dir}/{args.dataset}/{llm_folder}/{args.split}/{prompt_type}/{retriever_str}/{num_doc}_doc"
    os.makedirs(saving_dir, exist_ok=True)

    # Path of the output .json file
    json_file_path = os.path.join(saving_dir, "generated_results_weights_epoch_25.json")

    all_info = []
    for idx, prompt_batch in enumerate(tqdm(prompt_dataloader)):
        # Stop once the requested number of examples has been processed.
        if max_examples is not None and idx >= max_examples:
            break

        prompts = prompt_batch['prompt']
        answers = prompt_batch['answers']
        document_indices = prompt_batch['document_indices']

        # Keep only the part of the prompt starting at "Question"
        match = re.search(r"Question:.*", prompts, re.DOTALL)
        if match:
            prompts = match.group()
        else:
            # No "Question:" marker: the untrimmed prompt is used as-is.
            print("Nessuna corrispondenza trovata.")

        generated_output = bgm.generate(
            prompts, 
            padding_strategy=args.padding_strategy,
            max_new_tokens=args.max_new_tokens
        )

        # The trimmed prompt is what gets searched for documents and answers.
        has_answer, answer_found, document_found = check_document_has_answer(generated_output, answers, prompts)

        # Collect the record for this example. Note that "id_document" holds
        # the text of the matched document (third value returned above).
        all_info.append({
            "prompt": prompts,
            "all_document_indices": document_indices,
            "generated_indices": generated_output,
            "id_document": document_found,
            "generated_id_document_has_answer": has_answer,
            "answer_in_the_document": answer_found,
            "answers_target": answers
        })

    # Write all results to the JSON file
    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(all_info, json_file, indent=4, ensure_ascii=False)

    print(f"Risultati salvati in: {json_file_path}")


def main():
    """Run the full pipeline: load the model, load the data, generate, save.

    Steps, in order: parse the arguments and derive ``args.split`` from
    ``args.use_test``; build the ``BGM`` wrapper with the LoRA weights from a
    fixed checkpoint path; load the corpus and the retriever search results;
    build the prompt data loader; print the configuration; run
    ``generate_and_save``.
    """
    args = parse_arguments()

    # The split name is derived from the boolean flag and attached to args.
    if args.use_test:
        args.split = "test"
    else:
        args.split = "train"

    print("Loading LLM...")
    # NOTE(review): absolute, machine-specific checkpoint location.
    lora_checkpoint_path = r"C:\Users\franc\Documents\Bridge_the_GAP\data\lora_training_bgm\google-t5-base\lora-checkpoint\epochs\epoch_25"
    model_id = args.llm_id
    bgm = BGM(
        model_id,
        device,
        quantization_bits=args.quantization_bits,
        model_max_length=args.model_max_length,
        lora_weights_path=lora_checkpoint_path,
    )
    tokenizer = bgm.tokenizer
    print("LLM loaded")

    print("Loading corpus and search results...")
    corpus, full_to_subset_idx_map = load_corpus(args)
    search_results = load_search_results(args)
    print("Corpus and search results loaded")

    print("Loading prompt dataset...")
    loader = initialize_dataset_and_loader(
        args,
        corpus,
        full_to_subset_idx_map,
        search_results,
        tokenizer,
    )
    print("Prompt dataset loaded")

    print_info(args)

    generate_and_save(args, bgm, loader)



# Entry point: call seed_everything(SEED) first, then run the pipeline.
# seed_everything and SEED are not defined in this part of the file;
# presumably they fix the random seeds for reproducibility -- confirm.
if __name__ == "__main__":
    seed_everything(SEED)
    main()

Loading LLM...
LoRA weights loaded from: C:\Users\franc\Documents\Bridge_the_GAP\data\lora_training_bgm\lora-checkpoint\epochs\epoch_25
LLM loaded
Loading corpus and search results...
Corpus and search results loaded
Loading prompt dataset...
Prompt dataset loaded
INFO:
DATA: C:\Users\franc\Documents\Bridge_the_GAP\data\test_dataset.json
USE TEST: True
MODEL: google-t5/t5-base
MODEL MAX LENGTH: 4096
MAX NEW TOKENS: 15
USE MODEL CHAT TEMPLATE: False
TASK WITH PROOF: False
GOLD POSITION: None
NUM DOCUMENTS IN CONTEXT: 5
BATCH SIZE: None
SAVE EVERY: 250


  0%|          | 6/2889 [00:54<2:49:06,  3.52s/it] 

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Nessuna risposta trovata in nessun documento.
L'ID generato 'Id_5' non è valido.


  0%|          | 8/2889 [00:54<1:38:11,  2.04s/it]

L'ID generato 'Version 9' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID titles. In 2014 the Little Princess Ballet Academy (LPBA) performed the entire Swan Lake in Second Life. The adaption follows the original, but some parts like the pas de deux were not possible to perform in Second Life and has been changed. All parts are played by individual avatars. Audio Video Swan Lake Swan Lake ( ""), Op. 20, is a ballet composed by Pyotr Ilyich Tchaikovsky in 1875–76. Despite its initial failure, it is now one of the most popular of all ballets. The scenario, initially in two acts, was fashioned from Russian and/or German folk tales and tells.


  0%|          | 12/2889 [00:54<42:09,  1.14it/s]  

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Nessuna risposta trovata in nessun documento.
L'ID generato 'Barry Parker' non è valido.


  0%|          | 13/2889 [00:54<35:23,  1.35it/s]

L'ID generato 'the PAX6' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID additional land grants, this time to former soldiers, which included parts of Oak Island. It wasn't until July 6, 1818 that the original lot owners' names were mapped for the Nova Scotia Crown Lands office. Over the next 140 years or so, the island was owned by various treasure hunters who sought a legendary treasure buried somewhere on Oak Island (See section below). The hunt for treasure got so extensive that in 1965 a causeway was built from the western end of the island to Crandall's Point on the mainland, two hundred metres away in order to bring heavy machinery.


  1%|          | 16/2889 [00:55<20:52,  2.29it/s]

L'ID generato 'Id_3' non è valido.
L'ID generato 'olivine' non è valido.


  1%|          | 17/2889 [00:55<17:59,  2.66it/s]

L'ID generato 'Id_3' non è valido.


  1%|          | 20/2889 [00:55<11:44,  4.07it/s]

L'ID generato 'the posterior' non è valido.


  1%|          | 22/2889 [00:55<07:42,  6.20it/s]

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID as cardiothoracic surgeon Erica Hahn, whose storylines include the rivalry with Preston Burke, her arrival to perform surgery of George O'Malley's dying father, and Richard Webber's decision to hire her in the hospital. Chyler Leigh portrayed Meredith's half-sister, Lexie Grey, who is accepted into the hospital's internship program after her mother's sudden death. Kate Burton appeared as Meredith Grey's mother, Ellis Grey, a renowned surgeon suffering from Alzheimer's disease, who ultimately dies following a heart attack. Veterinary physician Finn Dandrige was portrayed by Chris O'Donnell and appeared in the first four episodes of the season to resume the storyline of.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID As a result, in 1603, King James I approved an Act of Parliament banning the practice by which "the Su

  1%|          | 26/2889 [00:56<06:55,  6.90it/s]

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID had the highest quality pearls. The boundaries of South Asia vary based on how the region is defined. South Asia's northern, eastern, and western boundaries vary based on definitions used, while the Indian Ocean is the southern periphery. Most of this region rests on the Indian Plate and is isolated from the rest of Asia by mountain barriers. Much of the region consists of a peninsula in south-central Asia, rather resembling a diamond which is delineated by the Himalayas on the north, the Hindu Kush in the west, and the Arakanese in the east, and which extends southward into the
Answer:.


  1%|          | 28/2889 [00:56<07:19,  6.51it/s]

L'ID generato 'City of Manchester' non è valido.
L'ID generato 'Id_1' non è valido.


  1%|          | 31/2889 [00:57<07:04,  6.73it/s]

L'ID generato 'One' non è valido.
L'ID generato 'Id_2' non è valido.


  1%|          | 33/2889 [00:57<06:56,  6.85it/s]

L'ID generato 'Id_2' non è valido.
L'ID generato '28%' non è valido.


  1%|          | 34/2889 [00:57<09:49,  4.84it/s]

L'ID generato 'I am not in a box of any description' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID Louis Rams. That year, the tease was followed by the show open produced by Los Angeles-based The Syndicate called "Transformation". It features computer-generated imagery showing a city being transformed into a football stadium and passers-by on the street turning into players, coaches, fans and officials set to an updated orchestral treatment of the "Heavy Action" theme song. The sequence began every week with a different celebrity walking down the street, picking up a glowing football helmet with the ESPN logo on the side and saying, "I'm ready for some football! Are you?", thus beginning the transformation process. Celebrities for 2006.


  1%|▏         | 39/2889 [00:58<07:22,  6.44it/s]

L'ID generato 'ITV' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Nessuna risposta trovata in nessun documento.
L'ID generato 'Ivan' non è valido.


  1%|▏         | 43/2889 [00:59<06:50,  6.94it/s]

L'ID generato 'the Parliament' non è valido.


  2%|▏         | 45/2889 [00:59<05:52,  8.08it/s]

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID layer, known as the fibrous tunic, is composed of the cornea and sclera. The middle layer, known as the vascular tunic or uvea, consists of the choroid, ciliary body, pigmented epithelium and iris. The innermost is the retina, which gets its oxygenation from the blood vessels of the choroid (posteriorly) as well as the retinal vessels (anteriorly). The spaces of the eye are filled with the aqueous humour anteriorly, between the cornea and lens, and the vitreous body, a jelly-like substance, behind the lens, filling the entire posterior cavity. The aqueous humour is a clear watery fluid that is contained
Answer:.


  2%|▏         | 47/2889 [00:59<07:11,  6.58it/s]

L'ID generato 'the "Four Year War"' non è valido.
L'ID generato 'Bernard Tomic' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID 2017, Brazil returned to the No. 1 spot for the first time since just prior to the 2010 World Cup, but Germany regained the top spot in July after winning the Confederations Cup. In August 2018, France became the leader in the FIFA rankings again after nearly 16 years, having won the 2018 FIFA World Cup, and this is also the first time FIFA adopted the Elo rating system to the ranking system. One month later, for the first time two teams were tied at the top spot as Belgium returned to the number one spot with the same ranking
Answer:.


  2%|▏         | 49/2889 [00:59<06:05,  7.78it/s]

L'ID generato 'the 16th' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID The Glory of Love (song) "The Glory of Love" is a song written by Billy Hill, recorded by Benny Goodman in 1936, whose version was a number one pop hit. Subsequently, the song has been recorded by a vast number of artists, ranging from Dean Martin to Jimmy Durante to Paul McCartney. Bette Midler included the song in her film "Beaches" (1988) and it appears in the soundtrack recording. In 1951, R&B vocal group, The Five Keys, had their biggest R&B hit with their version of the song, hitting number one on the R&B chart for four non-consecutive weeks. Although
Answer:.


  2%|▏         | 51/2889 [01:00<05:46,  8.19it/s]

L'ID generato 'Buffon' non è valido.


  2%|▏         | 53/2889 [01:00<06:50,  6.92it/s]

L'ID generato 'Eiichi Ohtaki' non è valido.
L'ID generato 'IOC' non è valido.


  2%|▏         | 55/2889 [01:00<07:11,  6.56it/s]

L'ID generato 'January 8, 2018' non è valido.
L'ID generato 'Id_5' non è valido.


  2%|▏         | 57/2889 [01:01<06:37,  7.13it/s]

L'ID generato 'George Berkeley' non è valido.
L'ID generato 'Uralic' non è valido.


  2%|▏         | 59/2889 [01:01<05:53,  8.00it/s]

L'ID generato 'larger' non è valido.


  2%|▏         | 63/2889 [01:01<05:19,  8.86it/s]

L'ID generato 'Phelps' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID "OED" mentions of its meaning "a liquid for drinking" occurred in the 14th century. Its use as a term for "an intoxicating alcoholic drink" appeared in the 16th century. The term "spirit" in reference to alcohol stems from Middle Eastern alchemy. These alchemists were more concerned with medical elixirs than with transmuting lead into gold. The vapor given off and collected during an alchemical process (as with distillation of alcohol) was called a spirit of the original material. Early evidence of distillation comes from Akkadian tablets dated "circa" 1200 BC describing perfumery operations, providing textual evidence that an early primitive
Answer:.
L'ID generato 'John Brown' non è valido.


  2%|▏         | 65/2889 [01:02<06:11,  7.60it/s]

L'ID generato 'April 22, 1998' non è valido.


  2%|▏         | 68/2889 [01:02<05:22,  8.75it/s]

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID of the same row also becomes calcified, and transverse bars of calcified substance stretch across from one calcareous column to another. Thus there are longitudinal groups of the cartilage cells enclosed in oblong cavities, the walls of which are formed of calcified matrix which cuts off all nutrition from the cells; the cells, in consequence, atrophy, leaving spaces called the primary areolæ. There are two types of ossification centers – primary and secondary. A primary ossification center is the first area of a bone to start ossifying. It usually appears during prenatal development in the central part of each developing
Answer:.


  2%|▏         | 70/2889 [01:02<06:05,  7.72it/s]

L'ID generato 'March 8, 2018' non è valido.


  2%|▏         | 71/2889 [01:02<06:36,  7.11it/s]

L'ID generato 'Id_2' non è valido.


  2%|▏         | 72/2889 [01:03<07:43,  6.08it/s]

L'ID generato 'Hasse Olsson' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID edge has two vertices (which may coincide) as endpoints. That is, we allow multiple edges (edges with the same pair of endpoints) and loops (edges whose two endpoints are the same vertex). A subgraph of a graph is the graph formed by any subsets of its vertices and edges such that each edge in the edge subset has both endpoints in the vertex subset. A connected component of an undirected graph is the subgraph consisting of the vertices and edges that can be reached by following edges from a single given starting vertex. A graph is connected if every vertex.


  3%|▎         | 75/2889 [01:03<06:46,  6.92it/s]

L'ID generato 'EPISODE 108' non è valido.
L'ID generato 'IGN' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Nessuna risposta trovata in nessun documento.


  3%|▎         | 77/2889 [01:03<05:03,  9.26it/s]

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID Supporting Actress (Beatrice Straight), and Best Screenplay (Paddy Chayefsky). Thanks to a stellar cast, experienced director, and a poignant story, "Network" became one of the largest critical successes of 1976. Another film, "Rocky", about a clubhouse boxer (played by Sylvester Stallone) who is granted a world championship title fight won the Best Picture Academy Award that year. The film also became a major commercial success and spawned four sequels through the rest of the 1970s and 1980s. Throughout the 1970s, the horror film developed into a lucrative genre of film. It began in 1973 with the terrifying "The Exorcist", directed.


  3%|▎         | 82/2889 [01:04<07:59,  5.85it/s]

L'ID generato 'XXXIX' non è valido.
L'ID generato 'the Lord's sign' non è valido.


  3%|▎         | 86/2889 [01:05<08:07,  5.75it/s]

L'ID generato '"Sacrifice"' non è valido.
L'ID generato '16th' non è valido.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Nessuna risposta trovata in nessun documento.


  3%|▎         | 89/2889 [01:05<06:30,  7.18it/s]

L'ID generato 'IOC' non è valido.


  3%|▎         | 92/2889 [01:06<10:06,  4.61it/s]

L'ID generato 'I Didn't Leave the Democrats. They Left me' non è valido.
L'ID generato 'Dexter Wansel' non è valido.


  3%|▎         | 93/2889 [01:06<10:12,  4.56it/s]

L'ID generato 'Id_2' non è valido.


  3%|▎         | 96/2889 [01:07<08:39,  5.38it/s]

L'ID generato 'March 20, 1852' non è valido.
L'ID generato 'Armstrong' non è valido.


  3%|▎         | 98/2889 [01:07<06:13,  7.46it/s]

L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID chain as a source of energy. The overall process of creating energy in this fashion is termed oxidative phosphorylation. The same process takes place in the mitochondria, where ATP synthase is located in the inner mitochondrial membrane and the F-part projects into mitochondrial matrix. The consumption of ATP by ATP-synthase pumps proton cations into the matrix. The evolution of ATP synthase is thought to have been modular whereby two functionally independent subunits became associated and gained new functionality. This association appears to have occurred early in evolutionary history, because essentially the same structure and activity of ATP synthase enzymes are.
L'ID generato è vuoto o contiene solo spazi. Controllo tutti i documenti...
Risposta trovata nel documento ID What a Friend We Have in Jesus "What a Friend We Have in Jesus" is a Christian hymn originally written by

  3%|▎         | 100/2889 [01:09<32:10,  1.44it/s]


Risultati salvati in: C:\Users\franc\Documents\Bridge_the_GAP\data\gen_id_res_example_bgm/nq/t5-base/test/retrieved/contriever/5_doc\generated_results_weights_epoch_25.json
