In [19]:
# import langchain
import os
import openai
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

import pandas as pd
from dotenv import load_dotenv
import os
import openai
import dotenv


# Load variables from a .env file into the process environment first, so that
# code reading os.environ (later cells use os.environ["OPENAI_API_KEY"]) sees the key.
load_dotenv()

# Values parsed directly from ./.env, kept for explicit lookups.
config = dotenv.dotenv_values(".env")

# Prefer the key written in ./.env, but fall back to the process environment so
# the notebook still runs when the key is only exported in the shell.
openai_api_key = config.get("OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set: add it to .env or export it in the environment."
    )
openai.api_key = openai_api_key


# Text file listing the source URLs, one per line.
infile_path = 'urls_list.txt'

with open(infile_path, 'r') as infile:
    urls_data = infile.readlines()

# Maps the line index of each URL to the list of documents loaded from it.
data_dict = {}
for i, url_link in enumerate(urls_data):
    url_link = url_link.strip()
    if not url_link:
        # Blank lines (e.g. a trailing newline in the file) are not URLs; skip them
        # instead of handing an empty string to a loader.
        continue

    # URLs ending in 'pdf' or containing 'ch-api' go through the PDF loader;
    # everything else is loaded as a web page.
    if url_link.endswith('pdf') or 'ch-api' in url_link:
        loader = OnlinePDFLoader(url_link)
    else:
        loader = WebBaseLoader(url_link)

    text_data = loader.load()
    # Record the original URL as the source of every returned document (not only
    # the first one); the splitting step reads metadata['source'] from each chunk.
    for document in text_data:
        document.metadata['source'] = url_link
    data_dict[i] = text_data

# Splitter configured with chunk_size=2000 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=2000)

# Parallel lists: chunk text and, at the same position, the source URL of that chunk.
all_splits_pypdf_texts = []
all_splits_pypdf_texts_src = []
for text_data in data_dict.values():
    texts = text_splitter.split_documents(text_data)
    all_splits_pypdf_texts += [chunk.page_content for chunk in texts]
    all_splits_pypdf_texts_src += [chunk.metadata['source'] for chunk in texts]

# Embed every chunk once, in a single batched call, and reuse the vectors for both
# the FAISS index and the CSV export, so each chunk costs one embedding request
# instead of two.
embedding = OpenAIEmbeddings()
embed_list = embedding.embed_documents(all_splits_pypdf_texts)

# Build the FAISS store from the precomputed (text, vector) pairs.
vector_store = FAISS.from_embeddings(
    list(zip(all_splits_pypdf_texts, embed_list)), embedding
)

# One row per chunk: its text, its embedding vector and the URL it came from.
df = pd.DataFrame({"text": all_splits_pypdf_texts, "embedding": embed_list, "src":all_splits_pypdf_texts_src})
df.to_csv("embedding.csv")

# save document chunks and embeddings
SAVE_PATH = "data/doc_embedding.csv"
# Create the target directory if needed; to_csv does not create missing folders.
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
df.to_csv(SAVE_PATH, index=False)

In [None]:


# imports
import ast  # for converting embeddings saved as strings back to arrays
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search

# Model identifiers: one used for embedding queries, one used for chat completions.
GPT_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"

# OpenAI API client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 5
) -> tuple[list[str], list[float]]:
    """Rank the rows of `df` by how related their embedding is to `query`.

    Args:
        query: Free-text query; it is embedded with EMBEDDING_MODEL.
        df: Frame with "text", "src" and "embedding" columns. An embedding may be
            a sequence of floats or its string form (as read back from a CSV).
        relatedness_fn: Callable scoring (query_embedding, row_embedding);
            higher means more related. Defaults to cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        Two lists of equal length, ordered from most to least related:
        the strings "<text>||<src>" and their relatedness scores.
        Both lists are empty when `df` has no rows.
    """
    # Nothing to rank: return before spending an API call. This also avoids the
    # unpacking error that zip(*[]) would raise on an empty result.
    if df.empty:
        return [], []

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    scored = []
    for text, src, row_embedding in zip(df["text"], df["src"], df["embedding"]):
        if isinstance(row_embedding, str):
            # Embeddings saved to CSV come back as strings like "[0.1, 0.2, ...]".
            row_embedding = ast.literal_eval(row_embedding)
        scored.append(
            (text + "||" + src, relatedness_fn(query_embedding, row_embedding))
        )

    # Most related first; the sort is stable, so ties keep their row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:top_n]
    return [string for string, _ in top], [score for _, score in top]

In [None]:
# Preview the first rows only; each row carries a full embedding vector, so
# rendering the whole frame floods the output.
df.head()

In [None]:
# Embed a sample query directly (the same API call strings_ranked_by_relatedness makes).
# `query` is defined here so the cell runs on a fresh kernel instead of depending on
# a variable left over from an earlier session.
query = "gestational diabetes"
query_embedding_response = client.embeddings.create(
    model=EMBEDDING_MODEL,
    input=query,
)


In [None]:
# Pull the embedding vector of the first item out of the API response.
query_embedding = query_embedding_response.data[0].embedding

In [None]:
# Preview the first rows only; each row carries a full embedding vector, so
# rendering the whole frame floods the output.
df.head()

In [None]:
# examples
# Show the two chunks most related to a sample query, each preceded by its score.
strings, relatednesses = strings_ranked_by_relatedness(
    query="gestational diabetes",
    df=df,
    top_n=2,
)
for relatedness, string in zip(relatednesses, strings):
    print(f"{relatedness=:.3f}")
    display(string)

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens `text` encodes to for `model`.

    Args:
        text: String to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        Length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name; fall back to the
        # cl100k_base encoding rather than failing the whole request.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> tuple[str, tuple]:
    """Build the prompt for GPT from the chunks of `df` most related to `query`.

    Args:
        query: The user's question.
        df: Frame of chunks passed to strings_ranked_by_relatedness.
        model: Model name used when counting tokens.
        token_budget: Maximum number of tokens the returned message may use.

    Returns:
        A pair (message, (strings, relatednesses)): the prompt text, plus the
        ranked strings and scores the prompt was built from.
    """
    strings, relatednesses = strings_ranked_by_relatedness(query, df)

    # Assembled from adjacent literals so that source-code indentation does not
    # leak into the prompt as runs of spaces.
    introduction = (
        "Use the below articles on diabetes to answer the subsequent question "
        "with respect to healthcare context. "
        "If the answer cannot be found in the articles, write a response in an "
        "empathetic and understanding tone. "
        'For example: "I couldn\'t find an exact match for your query. '
        'Could you rephrase the questions related to diabetes ?"'
    )

    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nNext article:\n"""\n{string}\n"""'
        # Stop at the first article that would push the prompt over the budget.
        if num_tokens(message + next_article + question, model=model) > token_budget:
            break
        message += next_article
    return message + question, (strings, relatednesses)


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> tuple[str, tuple]:
    """Answer `query` with GPT, grounded in the most related chunks of `df`.

    Args:
        query: The user's question.
        df: Frame of chunks and embeddings to search. NOTE: the default is the
            global `df` as it exists when this cell is executed; re-run the cell
            (or pass `df` explicitly) after rebuilding the frame.
        model: Chat model used both for token counting and for the completion.
        token_budget: Token limit for the prompt built by query_message.
        print_message: When True, print the full prompt before sending it.

    Returns:
        A pair (response_message, (strings, relatednesses)): the model's answer
        text, plus the ranked chunks and scores that were retrieved for it.
    """
    message, (strings, relatednesses) = query_message(
        query, df, model=model, token_budget=token_budget
    )
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about the diabetes."},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response.choices[0].message.content
    return response_message, (strings, relatednesses)



In [None]:
# ask() returns (answer_text, (strings, relatednesses)); print only the answer text.
ans = ask("What is gestational diabetes and how is it diagnosed?")
print(ans[0])

In [None]:
# ans[1] is the (strings, relatednesses) pair: show the second-ranked chunk and its score.
print(ans[1][0][1], ans[1][1][1])

In [None]:
ans = ask("What are some healthy eating tips for people with diabetes?")
# Display only the answer text (element 0 of the returned pair) as the cell output.
ans[0]

In [None]:
# ans[1] is the (strings, relatednesses) pair: show the second-ranked chunk and its score.
print(ans[1][0][1], ans[1][1][1])

In [None]:
# ask() returns (answer_text, (strings, relatednesses)); print only the answer text.
ans = ask("How can my outpatient bill for diabetes be covered? ")
print(ans[0])

In [None]:
# ans[1] is the (strings, relatednesses) pair: show the second-ranked chunk and its score.
print(ans[1][0][1], ans[1][1][1])

In [None]:
# ask() returns (answer_text, (strings, relatednesses)); print only the answer text.
ans = ask("what is the blood sugar level for senior citizens having diabetic condition ?")
print(ans[0])

In [None]:
# ans[1] is the (strings, relatednesses) pair: show the second-ranked chunk and its score.
print(ans[1][0][1], ans[1][1][1])

In [None]:
# LangChain LLM wrapper plus chain and prompt helpers.
# NOTE: this rebinds the name `OpenAI`; an earlier cell imported a different
# `OpenAI` from the `openai` package to build `client`.
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SimpleSequentialChain
# Read the API key from the environment (KeyError if it is not set).
openai_api_key = os.environ["OPENAI_API_KEY"]
llm = OpenAI(temperature=1, openai_api_key=openai_api_key)


In [None]:
template = """
This is conversation with a human. Answer the questions you get based on the knowledge you have.
If you don't know the answer, just say that you don't, don't try to make up an answer.
Chat History:
{chat_history}
Follow Up Input: {question}
"""
# The template references both {chat_history} and {question}, so both are declared
# as input variables.
prompt_template = PromptTemplate(
    input_variables=["chat_history", "question"], template=template
)

# Chain that fills the prompt above and sends it to the LLM.
location_chain = LLMChain(llm=llm, prompt=prompt_template)

In [None]:
# Retrieve the 5 chunks closest to a sample question from the FAISS store built in
# the first cell (`vector_store`); no `knowledge_base` object is defined in this notebook.
relevant_docs = vector_store.similarity_search("what is gestation diabetes ?", k=5)
# print(relevant_docs[2].page_content)

In [29]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SimpleSequentialChain
# Eval!
from langchain.evaluation.qa import QAEvalChain
from langchain.chains import RetrievalQA

# API key comes from the environment; the LLM is created with temperature=1.
openai_api_key = os.environ["OPENAI_API_KEY"]
llm = OpenAI(openai_api_key=openai_api_key, temperature=1)

# "stuff" retrieval-QA chain on top of the FAISS store, using its default input key.
qa = RetrievalQA.from_chain_type(
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
    llm=llm,
)



# Reference question/answer pairs used to grade the retrieval-QA chain.
# Each item has a 'question' key (the chain input) and an 'answer' key (the expected answer).
ground_truth_question_answers = [
    {'question': "What is gestational diabetes and how is it diagnosed?",
     'answer': 'Gestational diabetes is a type of diabetes that develops during pregnancy and usually goes away after delivery. It is diagnosed using a 3-point 75 g oral glucose tolerance test (OGTT) at 24 to 28 weeks of gestation, unless the woman has already been diagnosed with diabetes or pre-diabetes. It is important to also screen for pre-existing diabetes in the first trimester and after delivery, as women with a history of GDM are at increased risk of developing type 2 diabetes later on in life.'
    },
    {
        'question': "What are some healthy eating tips for people with diabetes?",
        'answer':"Some healthy eating tips for people with diabetes include:\n\n1. Focus on a balanced diet that includes carbohydrates, protein, and fats, with an emphasis on managing carbohydrate intake to control blood sugar levels.\n2. Choose healthier cooking methods like steaming, baking, boiling, or grilling to prepare meals.\n3. Opt for whole grains over refined grains, such as replacing white rice with brown rice.\n4. Select lean meats and remove visible fats before cooking to reduce saturated fat intake.\n5. Use natural seasonings like herbs and spices instead of excessive salt.\n6. Incorporate vegetables and fruits as the main components of your meals, making up at least 50% of your plate.\n7. Stay hydrated with water as your primary drink choice and avoid sugary beverages.\n8. Plan meals ahead, make a shopping list, and opt for healthier products during festivals and celebrations to maintain healthy eating habits.\n9. Communicate your boundaries politely when faced with peer pressure to indulge in unhealthy foods during social gatherings.\n\nRemember, personalized nutritional advice from a healthcare professional, such as a dietitian, can further enhance your diabetes management through tailored dietary recommendations." 
    },
    {
    'question': "How can my outpatient bill for diabetes be covered?",
    'answer': "Your outpatient bill for diabetes can be covered through various means, including government subsidies, employee benefits/private medical insurance, and the use of MediSave through the Chronic Disease Management Programme (CDMP). The bill can be further offset with government subsidies available at public specialist outpatient clinics, polyclinics, and through schemes like the Community Health Assist Scheme (CHAS), Pioneer Generation (PG), and Merdeka Generation (MG) outpatient subsidies. Additionally, patients can tap on accounts of immediate family members for MediSave, and those aged 60 and above can use MediSave for the 15% co-payment under CDMP."
    }
]

    
# Retrieval-QA chain whose input key is "question", so the ground-truth dicts
# can be passed to it as-is.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    input_key="question",
)

# Run the chain over every ground-truth item and show the raw predictions.
predictions = chain.apply(ground_truth_question_answers)
print(predictions)

# Start your eval chain: grade each prediction ("result") against the reference "answer".
eval_chain = QAEvalChain.from_llm(llm)
eval_outputs = eval_chain.evaluate(
    ground_truth_question_answers,
    predictions,
    question_key="question",
    prediction_key="result",
    answer_key='answer',
)
print(eval_outputs)



[{'question': 'What is gestational diabetes and how is it diagnosed?', 'answer': 'Gestational diabetes is a type of diabetes that develops during pregnancy and usually goes away after delivery. It is diagnosed using a 3-point 75 g oral glucose tolerance test (OGTT) at 24 to 28 weeks of gestation, unless the woman has already been diagnosed with diabetes or pre-diabetes. It is important to also screen for pre-existing diabetes in the first trimester and after delivery, as women with a history of GDM are at increased risk of developing type 2 diabetes later on in life.', 'result': ' Gestational diabetes is diabetes that is diagnosed during pregnancy, usually in the second or third trimester. It is characterized by high blood sugar levels that may be caused by decreased insulin sensitivity or underlying beta-cell dysfunction. Gestational diabetes is usually identified through a 3-point 75 g oral glucose tolerance test (OGTT) at 24 to 28 weeks of gestation, unless the woman has already bee