# Answering Questions using ChatGPT 3.5 and retrieval-augmented generation to embed new knowledge into the language model

In [16]:
from langchain.prompts import ChatPromptTemplate
from dotenv import dotenv_values
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA

## Creating a prompt and loading Embedded Data
Now, we create the prompt and load the data that was previously retrieved and embedded into a vector database.

In [41]:
# System message: tells the model it is answering questions about the
# Kidney Precision Medicine Project (KPMP) and should reason before answering.
assistant = """
You are an autoregressive language model that has been fine-tuned with instruction-tuning and RLHF. You are helping to answer questions based on provided information. This information is pertinent to the Kidney Precision Medicine Project (KPMP).

Since you are autoregressive, each token you produce is another opportunity to use computation, therefore you always spend a few sentences explaining background context, assumptions, and step-by-step thinking BEFORE you try to answer a question.
"""

# Human message template; `{question}` is filled in via `prompt.format(question=...)`.
human_template = "Use the information provided to answer the following question: {question}"

# Two-message chat prompt: the system instructions followed by the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", assistant), ("human", human_template)]
)

In [42]:
def get_openai_key(path=""):
    """Gets the OpenAI API key from a .env file.

    Parameters
    ----------
    path : str or path-like, optional
        Explicit .env file to read. When empty (the default), a list of
        known default locations is searched instead.

    Returns
    -------
    str or None
        The value of ``OPENAI_API_KEY``. ``None`` is returned when no explicit
        path was given and none of the default files provides a non-empty key.

    Raises
    ------
    KeyError
        If an explicit ``path`` is given but that file is missing or does not
        define ``OPENAI_API_KEY``.
    """
    default_paths = [
        "/Users/samisaf/openai.env",
        "C:/Users/samis/openai.env",
        "C:/Users/samisaf/openai.env",
    ]

    if path:
        values = dotenv_values(path)
        if 'OPENAI_API_KEY' not in values:
            # Same exception type as a plain dict lookup, but with a message
            # that says which file was consulted.
            raise KeyError(
                f"OPENAI_API_KEY not found in {path!r} (file missing or key not set)"
            )
        return values['OPENAI_API_KEY']

    for candidate in default_paths:
        # Parse each candidate once; a file without a usable key is skipped
        # so a later candidate can still supply it.
        key = dotenv_values(candidate).get('OPENAI_API_KEY')
        if key:
            return key
    return None

In [57]:
# Credentials and generation settings used by the cells below.
openai_api_key = get_openai_key()
model="gpt-3.5-turbo"
temperature=0
max_tokens=1000

# Embedding function handed to Chroma when opening the persisted vector store.
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
db = Chroma(persist_directory='./db-kpmp-oct-23', embedding_function=embedding)

# Pass the `temperature` setting defined above so that changing it takes effect.
llm = ChatOpenAI(openai_api_key=openai_api_key, model=model, temperature=temperature, max_tokens=max_tokens)

# Retrieval QA chain over the Chroma retriever; `return_source_documents=True`
# makes each result carry a 'source_documents' entry next to 'result'.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever(), return_source_documents=True)

## Answering Questions with ChatGPT 3.5
Now, we use the prompt created in the previous step to answer questions using ChatGPT 3.5.

In [59]:
# Caution: running this cell sends the prompt to OpenAI (takes time and costs money).
# NOTE(review): the formatted prompt contains the system text as well as the
# question, and it looks like the chain uses this whole string as the retrieval
# query — confirm this is intended.
question1 = "At what hemoglobin fall does the patient need to be hospitalized?"
query1 = prompt.format(question=question1)
result1 = qa(query1)
result1['result']


'Based on the information provided, there is no specific mention of the hemoglobin fall that would require hospitalization for a patient in the context of the Kidney Precision Medicine Project (KPMP). The information primarily focuses on the goals, design, and procedures of the study, as well as the eligibility criteria for participants with AKI or CKD. It does not provide specific details about hemoglobin levels or hospitalization criteria related to hemoglobin fall. Therefore, I cannot provide a specific answer to your question based on the given information.'

In [56]:
# Display the documents stored under 'source_documents' for question 1.
result1['source_documents']

[Document(page_content='FOR IRB USE ONLY  \nIRB ID #: 201902013 \nAPPROVAL DATE: 09/18/23  \nRELEASED DATE: 09/18/23  \nEXPIRATION DATE: 11/22/23  \n \n4 \nKPMP Doc ID: OPS006 v11  last updated 09.01.2023  • We will ask you about your health including asking you to \ncomplete a PROM IS Global Health Questionnaire and a \nPersonal History Questionnaire  \n• We will ask you about your understanding of your health using \na questionnaire called a Health Literacy questionnaire . \n• We will ask you about how you are feeling.  \n• We will ask you about parts of your life like your job. \n• After your biopsy,  we will ask you about how you feel about it .  \nc. We will measure your height and weight.  \nd. We will measure your blood pressure and temperature.  \ne. We will examine you for swelling caused by kidney problems.  \nf. You can’t join KPMP if you are pregnant. We will ask you if you are \npregnant before we do your kidney biopsy. We might  also do a \npregnancy test.  \ng. We may ta

In [46]:
# Second question: format the chat prompt, run it through the QA chain,
# and display the answer text.
question2 = "What are the inclusion criteria?"
query2 = prompt.format(question=question2)
result2 = qa(query2)
result2['result']

'Based on the information provided, the document does not explicitly mention the inclusion criteria for the Kidney Precision Medicine Project (KPMP). It primarily focuses on the goals and objectives of the study, as well as the collection and analysis of kidney biopsy tissue and clinical data. To obtain specific information about the inclusion criteria, it would be best to refer to the official KPMP website or contact the research team directly.'

In [47]:
# Display the documents stored under 'source_documents' for question 2.
result2['source_documents']


[Document(page_content='Document ID: OPS00 1   updated: 09/01/2023      6  \nVersion #: 10 1 Introductio n \n1.1 Study Overview  \nThe Kidney Precision Medicine Project (KPMP) is a prospective cohort study , whose goal is to use deep \nmolecular phenotypes of kidney biopsies , along with longitudinally collected clinical phenotypic data, in \norder to develop new disease ontologies , classification systems, and treatments for  acute kidney injury \n(AKI) and chronic kidney disease (CKD). Since its inception, the KPMP has sought out and included \nsubstantive patient -representative  feedback regarding disease experience, lack of innovation in new \nkidney disease therapies and patient tolerance for risk levels in balance with potential benefits both to \nthe individual and society. The KPMP has publicly and operationally committed itself  to always put \nparticipant s and their best interests first and this foundational principle informs and undergirds every \nfacet of the study. Both 

In [48]:
# Ad-hoc check with a short keyword-style question; only the answer text is shown.
keyword_result = qa(prompt.format(question="hemoglobin drop"))
keyword_result['result']

"Based on the information provided, there is no specific mention of a hemoglobin drop in the context. Therefore, I don't have enough information to answer your question about a hemoglobin drop."