In [1]:
import pymupdf
from sentence_transformers import SentenceTransformer
import faiss
import nltk
from dotenv import load_dotenv
import cohere
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load settings via python-dotenv before the API key is read from the
# environment on the next line.
load_dotenv()
# Shared Cohere client used by the functions below. The subscript lookup
# raises KeyError if COHERE_API_KEY is not set, so a missing key fails here
# rather than at the first API call.
co = cohere.Client(os.environ["COHERE_API_KEY"])

In [3]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``pymupdf.open``.

    Returns:
        str: The text of all pages joined without a separator. An empty
        string when the document has no pages.
    """
    # The context manager closes the document even if a page fails to load,
    # so the underlying file handle is never leaked.
    with pymupdf.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )

In [4]:
def split_text_into_sentences(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: The string to split.

    Returns:
        Whatever ``nltk.sent_tokenize`` returns for ``text``.
    """
    return nltk.sent_tokenize(text)

In [5]:
# Loaded SentenceTransformer models keyed by model name, so repeated calls to
# embed_text reuse an already-constructed model instead of rebuilding it.
_EMBED_MODEL_CACHE = {}


# Function to embed text using sentence-transformers
def embed_text(text, model_name='all-MiniLM-L6-v2'):
    """Split ``text`` into sentences and embed each sentence.

    Args:
        text: The raw text to embed.
        model_name: Name handed to ``SentenceTransformer``. Each distinct
            name is instantiated once and then reused on later calls.

    Returns:
        tuple: ``(sentences, embeddings)`` where ``sentences`` is the output
        of ``split_text_into_sentences(text)`` and ``embeddings`` is the
        result of ``model.encode(sentences)``.
    """
    model = _EMBED_MODEL_CACHE.get(model_name)
    if model is None:
        # First use of this model name: construct it once and remember it.
        model = SentenceTransformer(model_name)
        _EMBED_MODEL_CACHE[model_name] = model
    sentences = split_text_into_sentences(text)
    embeddings = model.encode(sentences)
    return sentences, embeddings

In [6]:
# Function to build FAISS index
def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and add every embedding to it.

    Args:
        embeddings: Object exposing ``shape``; ``shape[1]`` is used as the
            vector dimension. Presumably the 2-D float array returned by
            ``model.encode`` - confirm against the caller.

    Returns:
        The ``faiss.IndexFlatL2`` instance after ``embeddings`` was added.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

In [7]:
# Function to query FAISS index
def query_faiss_index(index, query_text, model, sentences, top_k=5):
    """Return the sentences whose embeddings are nearest to ``query_text``.

    Args:
        index: Object with a ``search(query_embedding, k)`` method returning
            ``(distances, indices)``; the first row of ``indices`` is used.
        query_text: The query string to embed and look up.
        model: Object with an ``encode`` method; called with ``[query_text]``.
        sentences: Sequence of sentences, addressed by the positions that
            ``index.search`` returns.
        top_k: Maximum number of sentences to return.

    Returns:
        list: Up to ``top_k`` sentences in the order returned by the index.
        Empty when ``top_k`` is not positive or ``sentences`` is empty.
    """
    if top_k <= 0 or len(sentences) == 0:
        return []
    query_embedding = model.encode([query_text])
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors. Indexing sentences[-1] would silently return the last sentence
    # as a bogus hit, so keep only positions that are valid for `sentences`.
    return [sentences[i] for i in indices[0] if 0 <= i < len(sentences)]

In [8]:
# Main function to demonstrate the process
def retrieval(pdf_path, queries):
    """Retrieve the PDF sentences most similar to each query.

    The embedding model is constructed once and used for both the document
    sentences and the queries, so both share one embedding space and the
    model is not built twice per call.

    Args:
        pdf_path: Path of the PDF to search.
        queries: Iterable of query strings.

    Returns:
        list[dict]: One ``{"snippet": sentence}`` entry per distinct sentence
        retrieved, in first-seen order across the queries. Empty when the PDF
        yields no sentences or ``queries`` is empty.
    """
    # Step 1: Extract text from PDF and split it into sentences
    text = extract_text_from_pdf(pdf_path)
    sentences = split_text_into_sentences(text)
    if not sentences or not queries:
        # Nothing to index or nothing to look up; also avoids building an
        # index from an empty embedding array.
        return []

    # Step 2: Embed sentences into vectors
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(sentences)

    # Step 3: Build FAISS index
    index = build_faiss_index(embeddings)

    # Step 4: Query the index, keeping each retrieved sentence only once
    docs = []
    seen = set()
    for query in queries:
        for snippet in query_faiss_index(index, query, model, sentences):
            if snippet not in seen:
                seen.add(snippet)
                docs.append({"snippet": snippet})

    return docs

In [9]:
def generate_search_queries(query):
    """Ask Cohere which search queries should be run for ``query``.

    Calls ``co.chat`` with ``search_queries_only=True`` and extracts the
    ``text`` of every entry in the response's ``search_queries``.

    Args:
        query: The user's question.

    Returns:
        list: The text of each generated search query. Empty when the
        response carries no search queries.
    """
    response = co.chat(
        message=query,
        search_queries_only=True,
    )
    # `search_queries` is optional on the response; treat a missing value as
    # "no queries" instead of failing while iterating over None.
    generated = response.search_queries or []
    return [search_query.text for search_query in generated]

In [10]:
def respond(query, pdf_path):
    """Answer ``query`` with Cohere, grounded on snippets retrieved from a PDF.

    Search queries are generated from ``query``, matching snippets are
    retrieved from the PDF at ``pdf_path``, and both the question and the
    snippets are sent to the ``command-r`` chat model.

    Args:
        query: The user's question.
        pdf_path: Path of the PDF to retrieve supporting snippets from.

    Returns:
        The object returned by ``co.chat``.
    """
    generated_queries = generate_search_queries(query)
    supporting_docs = retrieval(pdf_path, generated_queries)
    return co.chat(model="command-r", message=query, documents=supporting_docs)

In [15]:
# Example run: ask one question against a single PDF.
# The path is relative, so it is resolved against the current working directory.
pdf_path = "Glyphosate.pdf"
query_text = "What was the overall conclusion with respect to glyphosate and risk of NHL"
# Keep the full response object; the next cell prints its text.
response = respond(query_text, pdf_path)



In [16]:
# Show only the generated answer text from the response object.
print(response.text)

Multiple studies have examined the relationship between exposure to glyphosate, a common pesticide, and the risk of developing Non-Hodgkin's Lymphoma (NHL). Some studies suggest that there is a significant elevated risk of NHL associated with glyphosate use, especially with longer exposure durations. For instance, one study reported an elevated risk of NHL for ever-use of glyphosate (OR: 2.02) and especially for use longer than 10 days per year (OR: 2.36). Other studies, however, have not found a relationship between ever-use of glyphosate and NHL. 

Overall, there is some evidence suggesting a positive association between glyphosate exposure and the risk of NHL, especially for more frequent or longer-term use.
