In [1]:
pip install transformers datasets faiss-cpu torch sentence-transformers pymupdf


Defaulting to user installation because normal site-packages is not writeable
^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
print("GPU is available" if use_gpu else "GPU is not available, using CPU")

GPU is available


In [3]:
import os
import re

import faiss
import fitz  # PyMuPDF
import pdfplumber
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

In [4]:
def extract_text_and_elements(pdf_path):
    """Extract text, equation matches and tables from one PDF.

    Page text is read with PyMuPDF and passed through
    ``extract_latex_equations``, which swaps every match for an
    ``[EQUATION_n]`` placeholder. Tables are read with pdfplumber; each one
    adds a ``[TABLE_n]`` placeholder to the text after the page it was found on.
    Both counters start at 0 for every call, so they are per-document.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(text, equations, tables)`` tuple: the concatenated page text with
        placeholders, the list of extracted equation strings, and the list of
        tables as returned by ``page.extract_tables()``.
    """
    text_parts = []
    equations = []
    tables = []

    document = fitz.open(pdf_path)
    try:
        # Open the pdfplumber handle once for the whole file instead of once per page.
        with pdfplumber.open(pdf_path) as pdf:
            for page_num in range(len(document)):
                page_text = document.load_page(page_num).get_text()

                # Placeholders are numbered continuously across pages.
                page_text, page_equations = extract_latex_equations(page_text, len(equations))
                equations.extend(page_equations)
                text_parts.append(page_text)

                for table in pdf.pages[page_num].extract_tables():
                    text_parts.append(f" [TABLE_{len(tables)}] ")
                    tables.append(table)
    finally:
        # Release the PyMuPDF handle even when extraction fails part-way.
        document.close()

    return "".join(text_parts), equations, tables

def extract_latex_equations(text, start_index):
    """Replace ``$...$`` and ``[...]`` spans in ``text`` with numbered placeholders.

    Args:
        text: The text to scan (matches do not span line breaks).
        start_index: Number used for the first placeholder; later matches
            count up from it.

    Returns:
        A ``(text, equations)`` tuple: the text with each match replaced by
        `` [EQUATION_n] `` (note the surrounding spaces), and the matched
        strings in order of appearance.
    """
    # NOTE(review): the second alternative matches any bracketed span, not only
    # LaTeX display math - confirm that is intended.
    pattern = re.compile(r'(\$.*?\$|\[.*?\])')
    equations = []

    def _to_placeholder(match):
        # Number by how many equations were collected so far, then record this one.
        placeholder = f" [EQUATION_{start_index + len(equations)}] "
        equations.append(match.group(0))
        return placeholder

    # One substitution pass replaces each match at its own position, so a match
    # that happens to look like an earlier placeholder cannot clobber it.
    text = pattern.sub(_to_placeholder, text)
    return text, equations

# Source PDFs to process, in order.
pdf_paths = [
    "/home/rdksuper/2005.11401v4.pdf",
    "/home/rdksuper/2106.09685v2.pdf",
    "/home/rdksuper/2305.14314v1.pdf",
]

# One (text, equations, tables) tuple per PDF, in the same order as pdf_paths.
research_papers = []
for pdf_path in pdf_paths:
    research_papers.append(extract_text_and_elements(pdf_path))

In [5]:
# Split the per-paper (text, equations, tables) tuples into three parallel columns.
data = {"documents": [], "equations": [], "tables": []}
for doc_text, doc_equations, doc_tables in research_papers:
    data["documents"].append(doc_text)
    data["equations"].append(doc_equations)
    data["tables"].append(doc_tables)

# Wrap the columns in a Hugging Face Dataset.
dataset = Dataset.from_dict(data)

In [6]:
# Embed every document with the all-mpnet-base-v2 sentence encoder on `device`.
# NOTE(review): each entry is the full text of a paper; presumably the encoder
# truncates long inputs - confirm whether the documents should be chunked first.
model = SentenceTransformer('all-mpnet-base-v2').to(device)
embeddings = model.encode(
    data['documents'],
    device=device,
    convert_to_tensor=True,
)



In [7]:
index_file = "faiss_index.bin"

# Reuse the cached index when it exists; otherwise build it from `embeddings`
# so the notebook also runs from a fresh kernel without a pre-existing file.
if os.path.exists(index_file):
    loaded_index = faiss.read_index(index_file)
    print("FAISS index loaded from disk.")
else:
    # NOTE(review): the original index-creation code is not part of this notebook;
    # a flat L2 index over the document embeddings is an assumption - confirm.
    vectors = embeddings.detach().cpu().contiguous().numpy().astype("float32")
    loaded_index = faiss.IndexFlatL2(vectors.shape[1])
    loaded_index.add(vectors)
    print("FAISS index built from document embeddings.")

# Move the CPU index onto GPU 0, as the following cells expect `gpu_index`.
gpu_index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, loaded_index)

FAISS index loaded from disk.


In [8]:

# Copy the GPU index back to a CPU index and write it to `index_file`.
cpu_index = faiss.index_gpu_to_cpu(gpu_index)
faiss.write_index(cpu_index, index_file)
print("FAISS index created and saved to disk.")

FAISS index created and saved to disk.


In [11]:
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [12]:
# Load the RAG tokenizer, retriever and generator from the "facebook/rag-token-nq" checkpoint.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
# Read the FAISS index stored in `index_file` back from disk.
cpu_index = faiss.read_index(index_file)
# NOTE(review): in the recorded run this cell failed with
# "ImportError: RagRetriever requires the faiss library"; the message advises
# restarting the runtime after installing faiss.
# NOTE(review): `index=` and `passages=` are presumably not arguments that
# RagRetriever.from_pretrained acts on (its documentation describes `indexed_dataset`) -
# confirm that the custom index and documents are actually used for retrieval.
retriever = RagRetriever.from_pretrained(
        "facebook/rag-token-nq",
        index=cpu_index,
        passages=dataset["documents"]
        )
# Rebinds `model` (earlier bound to the SentenceTransformer encoder) to the RAG generator.
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
model = model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

ImportError: 
RagRetriever requires the faiss library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Ask the RAG model one question and decode the generated answer.
input_text = "What are the key findings of the research?"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

with torch.no_grad():
    generated = model.generate(**inputs, num_beams=5, num_return_sequences=1)
output = tokenizer.batch_decode(generated, skip_special_tokens=True)

# Replace placeholders with actual equations and tables.
# NOTE(review): placeholder numbers restart at 0 for every paper, so a placeholder
# in the answer is resolved against the first paper that has that number - confirm
# this is acceptable or make the placeholders unique per paper.
for _, paper_equations, paper_tables in research_papers:
    for j, eq in enumerate(paper_equations):
        output = [text.replace(f"[EQUATION_{j}]", eq) for text in output]
    for k, table in enumerate(paper_tables):
        # pdfplumber reports empty table cells as None; render them as empty
        # strings so the join cannot raise TypeError.
        table_str = "\n".join(
            "\t".join("" if cell is None else str(cell) for cell in row)
            for row in table
        )
        output = [text.replace(f"[TABLE_{k}]", table_str) for text in output]

print(output)