In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
import openai
import os
# from utils.vector_store import search_similar_chunks
import numpy as np

openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
from unstructured.partition.auto import partition
import logging
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings

In [6]:
def extract_text(file_path: str) -> str:
    try:
        doc = fitz.open(file_path)  # Open the PDF
        all_text = ""
        for page in doc:
            all_text += page.get_text("text") + "\n"
        doc.close()
        return all_text
    except Exception as e:
        logging.error(f"Error extracting text from {file_path}: {e}")
        raise HTTPException(status_code=500, detail=f"Error extracting text from PDF: {str(e)}")

In [7]:
filename = "../uploads/PBM_2.pdf"
extract_text(filename)

'”Neuronal Oscillations on Evolving Networks: Dynamics, Damage, Degradation,\nDecline, Dementia and Death”: A review and extension of Goreily et al. (2020)\nShayan Shafquat,1 Zongyuan Cai,1 Youssef Hafid,1 and Zakaria Taghi1\n1School of Psychology, University of Nottingham, Nottingham, NG7 2RD, UK\n(Dated: May 6, 2024)\nThe paper ”Neuronal Oscillations on Evolving Networks: Dynamics, Damage, Degradation, De-\ncline, Dementia, and Death” by Goreily et al. (2020) delves into the effects of neurodegenerative\ndiseases on brain neural networks [1]. The degradation caused by these diseases affects not only the\nnodes themselves but the strength of the connections between these nodes. The resulting connec-\ntome evolves as the disease progresses, with damage caused by the disease affecting the spreading of\nthe disease, although minutely. The progression of the disease is modelled through both toxic pro-\ntein accumulation and rest-state activity. In this paper, we attempt to recreate the re

In [40]:
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

In [41]:
chunks = chunk_text(extract_text(filename))

In [33]:
len(chunks)

67

In [70]:
EMBEDDING_MODEL = "intfloat/e5-large-v2"

In [71]:
embeddings = HuggingFaceEmbeddings(
    model_name = EMBEDDING_MODEL,
)

INDEX_FILE = "faiss_index.bin"

def create_or_load_index(dimension: int):
    if os.path.exists(INDEX_FILE):
        index = faiss.read_index(INDEX_FILE)
    else:
        index = faiss.IndexFlatL2(dimension)
    return index

def save_index(index):
    faiss.write_index(index, INDEX_FILE)

def store_chunks(chunks: List[str], metadata: dict):
    vectors = embeddings.embed_documents(chunks)
    
    index = create_or_load_index(len(vectors[0]))
    
    index.add(np.array(vectors, dtype=np.float32))
    
    save_index(index)

  from tqdm.autonotebook import tqdm, trange


In [11]:
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings()

INDEX_FILE = "faiss_index.bin"

def create_or_load_faiss_index(chunks: List[str]):
    if os.path.exists(INDEX_FILE):
        index = FAISS.load_local(INDEX_FILE, embeddings)
    else:
        vectors = embeddings.embed_documents(chunks)
        index = FAISS.from_texts(chunks, embeddings)
        index.save_local(INDEX_FILE)
    return index

def store_chunks(chunks: List[str], metadata: dict):
    index = create_or_load_faiss_index(chunks)
    index.save_local(INDEX_FILE)

def search_similar_chunks(query: str, k: int = 5):
    # Load the FAISS index
    index = create_or_load_faiss_index([])
    similar_chunks = index.similarity_search(query, k=k)
    return similar_chunks

In [12]:
store_chunks(chunks[:2], filename)

: 

In [None]:
vectors = embeddings.embed_documents(chunks[1])
print(vectors)

: 

In [42]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [43]:
chunks[:1]

['”Neuronal Oscillations on Evolving Networks: Dynamics, Damage, Degradation,\nDecline, Dementia and Death”: A review and extension of Goreily et al. (2020)\nShayan Shafquat,1 Zongyuan Cai,1 Youssef Hafid,1 and Zakaria Taghi1\n1School of Psychology, University of Nottingham, Nottingham, NG7 2RD, UK\n(Dated: May 6, 2024)\nThe paper ”Neuronal Oscillations on Evolving Networks: Dynamics, Damage, Degradation, De-\ncline, Dementia, and Death” by Goreily et al. (2020) delves into the effects of neurodegenerative\ndiseases on brain neural networks [1]. The degradation caused by these diseases affects not only the\nnodes themselves but the strength of the connections between these nodes. The resulting connec-\ntome evolves as the disease progresses, with damage caused by the disease affecting the spreading of\nthe disease, although minutely. The progression of the disease is modelled through both toxic pro-']

In [44]:
embeddings = OpenAIEmbeddings(api_key=openai.api_key)
# store in vector db
db = FAISS.from_texts(chunks[:10], embeddings)

                    api_key was transferred to model_kwargs.
                    Please confirm that api_key is what you intended.


In [46]:
hash_name = f"{filename.replace(' ', '-')}"
db.save_local(f'faiss_{hash_name}_index')
index_file = f'faiss_{hash_name}_index'

In [47]:
def search_similar_chunks(query: str, k: int = 5):
    index = FAISS.load_local(index_file, embeddings)
    similar_chunks = index.similarity_search(query, k=k)
    return similar_chunks

In [49]:
indices = search_similar_chunks("What is the main idea of the paper?", 5)

In [52]:
indices[0].page_content

'model of protein conversion[10]. We also aim to utilize\nthe model to investigate disease progression variation\nbetween hemispheres.\nC.\nAims\n• To replicate the neural mass and the resting-state\ndynamics model and extend the findings of Goriely'

In [68]:
def answer_question(query: str) -> str:
    indices = search_similar_chunks(query)
    # Assuming you have a way to map indices back to document chunks
    context = " ".join([idx.page_content for idx in indices])
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}\nAnswer:"}
        ],
        max_tokens=500
    )
    return response.choices[0]['message']['content']

In [69]:
answer_question("What is the main idea of the paper?")

The main idea of the paper is to explore the effects of neurodegenerative diseases on brain neural networks, specifically focusing on Alzheimer's disease. The paper describes a model that replicates neural mass dynamics and resting-state dynamics while studying disease progression variation between hemispheres. The model combines a protein spread model with a neural biomarker model, aiming to understand how toxic protein accumulation and connectivity changes impact disease progression. The study aims to replicate and extend previous research findings in order to provide insights into potential early indicators of Alzheimer's disease and better predict cognitive decline over time.


"The main idea of the paper is to explore the effects of neurodegenerative diseases on brain neural networks, specifically focusing on Alzheimer's disease. The paper describes a model that replicates neural mass dynamics and resting-state dynamics while studying disease progression variation between hemispheres. The model combines a protein spread model with a neural biomarker model, aiming to understand how toxic protein accumulation and connectivity changes impact disease progression. The study aims to replicate and extend previous research findings in order to provide insights into potential early indicators of Alzheimer's disease and better predict cognitive decline over time."