In [None]:
# Install Hugging Face Transformers library
!pip install transformers

# Install FAISS for efficient similarity search
!pip install faiss-cpu

# Install Torch if it's not already available
!pip install torch

# Install Tokenizers for efficient tokenization (optional, already included with Transformers)
!pip install tokenizers

# Install jsonlines for working with JSON data (optional)
!pip install jsonlines


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


import json
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch

# Load the Corpus.json file
with open("/content/corpus.json", "r") as f:  # Specify the file path to Corpus.json
    corpus_json = json.load(f)

# Load pre-trained DPR Context Encoder
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Preprocess corpus documents
corpus_embeddings = []
for doc in corpus_json:
    text = doc["body"]  # Extract article body
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    embeddings = context_encoder(**inputs).pooler_output
    corpus_embeddings.append(embeddings)
print(corpus_embeddings)


In [None]:
import json
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch
import logging

# Set up logging
logging.basicConfig(level=logging.ERROR, filename='error.log')

try:
    # Load the Corpus.json file
    with open("/content/corpus.json", "r",encoding='utf-8') as f:  # Specify the file path to Corpus.json
        corpus_json = json.load(f)

    # Load pre-trained DPR Context Encoder
    context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

    # Preprocess corpus documents
    corpus_embeddings = []
    for doc in corpus_json:
        try:
            # Check if 'body' exists in the document
            if "body" not in doc:
                raise KeyError(f"'body' key not found in document: {doc}")

            text = doc["body"]  # Extract article body

            # Validate the text input
            if not isinstance(text, str) or len(text) == 0:
                raise ValueError("Article body must be a non-empty string.")

            # Tokenize the input text
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

            # Generate embeddings
            embeddings = context_encoder(**inputs).pooler_output
            corpus_embeddings.append(embeddings)

        except Exception as e:
            logging.error(f"Error processing document {doc.get('title', 'Unknown')}: {e}")
            continue  # Skip to the next document if an error occurs

    print(f"Processed {len(corpus_embeddings)} documents successfully.")

except Exception as e:
    logging.error(f"An error occurred during preprocessing: {e}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [5]:
import json
import torch
import logging
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import gc  # To manually free up memory

# Set up logging to catch errors
logging.basicConfig(level=logging.ERROR, filename='error.log')

# Load pre-trained DPR Context Encoder and Tokenizer
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Adjust the batch size according to memory capacity (start small)
batch_size = 50

corpus_embeddings = []

# Batch processing function with memory management
def process_batch(batch):
    embeddings_batch = []
    for doc in batch:
        try:
            # Extract text from the document body
            text = doc.get("body", "")
            if not text:
                raise ValueError("Empty body")

            # Tokenize and encode the document
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():  # Avoid storing computation graphs
                embeddings = context_encoder(**inputs).pooler_output
            embeddings_batch.append(embeddings.cpu())  # Move to CPU and store
        except Exception as e:
            logging.error(f"Error processing document {doc.get('title', 'Unknown')}: {e}")
            continue
    return embeddings_batch

# Function to process the corpus from a large multiline JSON object
def process_corpus(file_path):
    with open(file_path, "r") as f:
        try:
            # Load the entire content of the file as a JSON object
            corpus_json = json.load(f)
        except Exception as e:
            logging.error(f"Error reading JSON file: {e}")
            return

    # Ensure the corpus is a list of documents
    if isinstance(corpus_json, list):
        num_docs = len(corpus_json)

        for i in range(0, num_docs, batch_size):
            # Process batch and release memory
            batch = corpus_json[i: i + batch_size]
            corpus_embeddings.extend(process_batch(batch))
            print(f"Processed {i + len(batch)} documents successfully.")
            del batch  # Free memory
            gc.collect()  # Garbage collection
    else:
        logging.error("The JSON file does not contain a list of documents.")

# Process the corpus
process_corpus("/content/corpus.json")

print(f"Total documents processed: {len(corpus_embeddings)}")


# Save the embeddings to a file
torch.save(corpus_embeddings, "corpus_embeddings.pt")
print(f"Total documents processed: {len(corpus_embeddings)}")



Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

KeyboardInterrupt: 

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Load pre-trained DPR Context Encoder and Tokenizer
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Function to generate query embeddings
def generate_query_embedding(query_string):
    # Tokenize and encode the query
    inputs = tokenizer(query_string, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():  # Avoid storing computation graphs
        embeddings = context_encoder(**inputs).pooler_output

    return embeddings.cpu()  # Move to CPU and return

# Example usage
query = "Meta is to offer an ad-free subscription version of Facebook and Instagram in the European Union, EEA (European Economic Area) and Switzerland, confirming the core of a report in the WSJ earlier this month.when will it be avaliable?"
query_embedding = generate_query_embedding(query)
print("Query Embedding Shape:", query_embedding.shape)


def find_similar_documents(query_embedding, corpus_embeddings):
    similarities = cosine_similarity(query_embedding.cpu().numpy(), corpus_embeddings)
    # Get the index of the most similar document
    most_similar_idx = similarities.argmax()
    return most_similar_idx, similarities[most_similar_idx]


NameError: name '_C' is not defined

In [3]:
# Uninstall and Reinstall torch
!pip uninstall -y torch
!pip install torch torchvision torchaudio

Found existing installation: torch 2.4.1+cu121
Uninstalling torch-2.4.1+cu121:
  Successfully uninstalled torch-2.4.1+cu121
Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
import json
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, pipeline

# Load pre-trained DPR Context Encoder and Tokenizer
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Load the saved embeddings
corpus_embeddings = torch.load("corpus_embeddings.pt")

# Load the corpus from the JSON file
with open("/content/corpus.json", "r", encoding='utf-8') as f:
    corpus_json = json.load(f)

# Function to generate query embeddings
def generate_query_embedding(query_string):
    inputs = tokenizer(query_string, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():  # Avoid storing computation graphs
        embeddings = context_encoder(**inputs).pooler_output
    return embeddings.cpu()  # Move to CPU and return

# Function to find similar documents
def find_similar_documents(query_string, corpus_embeddings, top_k=5):
    query_embedding = generate_query_embedding(query_string)
    similarities = cosine_similarity(query_embedding.cpu().numpy(), np.vstack(corpus_embeddings))
    top_indices = similarities[0].argsort()[-top_k:][::-1]
    similar_docs = [(idx, similarities[0][idx]) for idx in top_indices]
    return similar_docs


def generate_answer(query_string, article_body):
    # Load the question-answering pipeline with RoBERTa model fine-tuned on SQuAD2.0
    qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

    # Use the pipeline to get the answer
    result = qa_pipeline(question=query_string, context=article_body)

    # Return the answer and score
    return result['answer'], result['score']


# Function to extract facts from document bodies
def extract_fact_from_body(question, context):
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Function to format the results
def format_results(query_string, similar_documents, corpus_json):
    response = {
        "question": query_string,
        "answer": None,
        "question_type": "inference_query",
        "evidence_list": []
    }

    # Variable to hold the answer from the most similar document
    most_similar_answer = None
    highest_similarity_score = -1
    most_similar_document = None

    # Extract facts and populate evidence_list
    for idx, score in similar_documents:
        doc = corpus_json[idx]  # Get the document using its index
        body = doc.get("body", "")  # Extract body for fact extraction
        fact = extract_fact_from_body(query_string, body)  # Extract fact from the body

        evidence = {
            "title": doc.get("title", "No Title"),
            "author": doc.get("author", "Unknown Author"),
            "url": doc.get("url", "No URL"),
            "source": doc.get("source", "No Source"),
            "category": doc.get("category", "Uncategorized"),
            "published_at": doc.get("published_at", "No Date"),
            "fact": fact  # Use the extracted fact
        }

        response["evidence_list"].append(evidence)

    # Set the answer based on the most similar document
    if score > highest_similarity_score:
            most_similar_document = doc
            highest_similarity_score = score

    # Use the most similar document for the answer
    if most_similar_document:
        response["answer"] = most_similar_document.get("answer",generate_answer(query_string, most_similar_document.get("body","") ) )


    return response

# Example usage
query = "Meta is to offer an ad-free subscription version of Facebook and Instagram in the European Union, EEA (European Economic Area) and Switzerland, confirming the core of a report in the WSJ earlier this month. When will it be available?"
top_k = 5  # Number of top similar documents to retrieve
similar_documents = find_similar_documents(query, corpus_embeddings, top_k)

# Format the results
formatted_response = format_results(query, similar_documents, corpus_json)

# Print the structured response
print(json.dumps(formatted_response, indent=2))


Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

{
  "question": "Meta is to offer an ad-free subscription version of Facebook and Instagram in the European Union, EEA (European Economic Area) and Switzerland, confirming the core of a report in the WSJ earlier this month. When will it be available?",
  "answer": [
    "the AI Alliance",
    0.0030094829853624105
  ],
  "question_type": "inference_query",
  "evidence_list": [
    {
      "title": "Meta to offer ad-free subscription in Europe in bid to keep tracking other users",
      "author": "Natasha Lomas",
      "url": "https://techcrunch.com/2023/10/30/meta-ad-free-sub-eu/",
      "source": "TechCrunch",
      "category": "technology",
      "published_at": "2023-10-30T13:52:00+00:00",
      "fact": "November 2023"
    },
    {
      "title": "European consumer groups band together to fight Meta\u2019s self-serving ad-free sub \u2014 branding it \u2018unfair\u2019 and \u2018illegal\u2019",
      "author": "Natasha Lomas",
      "url": "https://techcrunch.com/2023/11/29/beuc-cpc-