In [1]:
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
import os
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer # type: ignore
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# from langchain.globals import set_llm_cache
from langchain_community.cache import InMemoryCache
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# Load settings from a local .env file so the os.environ lookups in later cells can see them.
load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

In [2]:
# Azure OpenAI chat model. Every setting is read from the environment; the
# version, deployment and endpoint fall back to the literals below when the
# variable is unset, while the API key has no fallback (None when missing).
azure_settings = {
    "openai_api_version": os.environ.get("AZURE_OPENAI_VERSION", "2023-07-01-preview"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt4chat"),
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT", "https://gpt-4-trails.openai.azure.com/"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
}
llm = AzureChatOpenAI(**azure_settings)

In [3]:
# Path of the source PDF, relative to the notebook's working directory.
file_path = "biomed.pdf"
loader = PyPDFLoader(file_path)
# `pages` is the list returned by the loader; later cells read `.page_content` from each item.
pages = loader.load()

In [4]:
# Sanity check: how many items the loader returned.
len(pages)

12

In [5]:
# Peek at the first loaded item to check that the text was extracted.
print(pages[0])

page_content='Citation: Pullakhandam, S.; McRoy, S.\nClassification and Explanation of Iron\nDeficiency Anemia from Complete\nBlood Count Data Using Machine\nLearning. BioMedInformatics 2024 ,4,\n661–673. https://doi.org/10.3390/\nbiomedinformatics4010036\nAcademic Editors: Pentti Nieminen\nand Jörn Lötsch\nReceived: 12 January 2024\nRevised: 15 February 2024\nAccepted: 22 February 2024\nPublished: 1 March 2024\nCopyright: ©2024 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license (https://\ncreativecommons.org/licenses/by/\n4.0/).\nArticle\nClassification and Explanation of Iron Deficiency Anemia from\nComplete Blood Count Data Using Machine Learning\nSiddartha Pullakhandam and Susan McRoy *\nDepartment of Computer Science, University of Wisconsin-Milwaukee, Milwaukee, WI 53211, USA;\npullakh2@uwm.edu\n*Correspondence: mcroy@uwm.edu; Tel.: +1-414-229

In [14]:
# Text splitter used to chunk the PDF text before embedding.
# Sizes are measured with `len`, i.e. in characters of the input strings.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,  # upper bound on the length of each chunk
    chunk_overlap=50,  # characters shared between neighbouring chunks
    length_function=len,
)

In [15]:
# The loader returned document objects, but the splitter call in the next cell
# is fed plain strings, so keep only the text of each page.
pages_text = [page.page_content for page in pages]

In [16]:
documents = splitter.create_documents(pages_text)
# Split the page texts into chunks of at most 700 characters with a 50-character overlap,
# matching the chunk_size / chunk_overlap configured on `splitter`.
# NOTE(review): only the text is passed in, so per-page metadata held on `pages`
# is not carried over to these chunks — confirm that this is intended.

In [17]:
# Check what kind of object the splitter produced for each chunk.
type(documents[0])

langchain_core.documents.base.Document

In [19]:
# Inspect the second chunk to eyeball the chunk size and the overlap with its neighbour.
print(documents[1])

page_content='creativecommons.org/licenses/by/\n4.0/).\nArticle\nClassification and Explanation of Iron Deficiency Anemia from\nComplete Blood Count Data Using Machine Learning\nSiddartha Pullakhandam and Susan McRoy *\nDepartment of Computer Science, University of Wisconsin-Milwaukee, Milwaukee, WI 53211, USA;\npullakh2@uwm.edu\n*Correspondence: mcroy@uwm.edu; Tel.: +1-414-229-6695\nAbstract: Background: Currently, discriminating Iron Deficiency Anemia (IDA) from other anemia\nrequires an expensive test (serum ferritin). Complete Blood Count (CBC) tests are less costly and more\nwidely available. Machine learning models have not yet been applied to discriminating IDA but do'


In [20]:
# Embedding model handle, pinned to an explicit model name.
# NOTE(review): the retriever cell calls get_vectorstore(documents) without passing
# this object, so the index is built with the OpenAIEmbeddings instance that function
# creates itself — confirm whether this variable is still needed.
# NOTE(review): this is the OpenAI (not Azure) embeddings class, so it presumably
# needs its own OpenAI credentials in the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [124]:
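# Optional: cache LLM responses in memory. Currently disabled; to enable it, also
# restore the `set_llm_cache` import that is commented out in the import cell.
# set_llm_cache(InMemoryCache())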

In [21]:
def get_vectorstore(text_chunks, index_path="faiss_index", embeddings=None):
    """Return a FAISS vector store, reusing an on-disk index when one exists.

    Args:
        text_chunks: Documents to embed when no saved index is found. They are
            ignored when an index already exists at ``index_path``; delete that
            directory to force a rebuild after the source documents change.
        index_path: Directory the index is loaded from or saved to.
        embeddings: Embedding model used for both loading and building the
            index. Defaults to a fresh ``OpenAIEmbeddings()``.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If a new index has to be built but ``text_chunks`` is empty.
    """
    # Build the embedding model once instead of duplicating it in each branch.
    if embeddings is None:
        embeddings = OpenAIEmbeddings()

    if os.path.exists(index_path):
        # NOTE(security): loading a saved FAISS index unpickles data from disk,
        # and recent langchain releases refuse to do so unless the caller opts
        # in. Opting in is acceptable here only because the index is one this
        # notebook wrote itself; never point index_path at an untrusted file.
        vectorstore = FAISS.load_local(
            index_path, embeddings, allow_dangerous_deserialization=True
        )
        print("Loaded existing FAISS index.")
        return vectorstore

    if not text_chunks:
        raise ValueError(
            f"No FAISS index at {index_path!r} and no text chunks were given to build one."
        )

    # No saved index yet: embed the chunks and persist the result for next time.
    vectorstore = FAISS.from_documents(documents=text_chunks, embedding=embeddings)
    vectorstore.save_local(index_path)
    print("Created and saved new FAISS index.")
    return vectorstore

In [22]:
# Build (or load) the FAISS store for the chunks, then wrap it as a retriever
# so the QA chain can fetch relevant chunks for a query.
vector_db = get_vectorstore(documents)
retriever = vector_db.as_retriever()

Created and saved new FAISS index.


In [23]:
# Prompt text for the QA chain. It contains two placeholders, {context} and
# {question}; presumably the chain fills them with the retrieved text and the
# user's query — confirm against the chain that consumes `prompt`.
template = """Use the context below to answer the question.
Keep the answer concise and to the point.
If you are unsure about the answer, just say i do not know the answer to the question do not create your own answer.
{context}

Question: {question}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

In [24]:
# Assemble the retrieval QA chain: the Azure chat model answers using chunks
# fetched by the FAISS retriever, formatted with the custom prompt.
chain_type_kwargs = dict(prompt=prompt)
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
)

In [28]:
# Ask a sample question end to end; the chain returns a dict holding the query
# and the model's answer, which is printed as-is.
question = "How can this model be used in real world applications?"
print(chain.invoke(question))

{'query': ']How can this model be used in real world applications?', 'result': "This model can be used in real world applications by assisting clinicians in disease diagnosis, specifically iron deficiency anemia (IDA). It can show the specific contributions of individual features to prediction, enabling clinicians to make informed decisions. Furthermore, it can potentially replace more expensive tests currently in use, such as serum ferritin, for diagnosing IDA. The model's application of machine learning can reveal which aspects of the complete blood count results contribute most to a diagnosis."}


In [None]:
# Draft instructions for an LLM-based grader that scores question/response pairs
# from this QA system on a 0-10 integer scale.
# NOTE(review): this string is a bare expression — it is not assigned to a name or
# passed to any call in the visible cells, and the cell has not been executed.
"""
You are a language model designed to evaluate the responses of this documentation query system.
You will use a rating scale of 0 to 10, 0 being poorest response and 10 being the best.
Responses with “not specified” or “no specific mention” or “rephrase question” or “unclear” or no documents returned or empty response are considered poor responses.
Responses where the question appears to be answered are considered good.
Responses that contain detailed answers are considered the best.
Also, use your own judgement in analyzing if the question asked is actually answered in the response. Remember that a response that contains a request to “rephrase the question” is usually a non-response.
Please rate the question/response pair entered. Only respond with the rating. No explanation necessary. Only integers.
"""