<a href="https://colab.research.google.com/github/sonum02/type2diabetes-insight-rag/blob/main/Type2DiabetesAssistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Get data from recent articles on Type 2 Diabetes in PDF files
!pip install unstructured[pdf]
!pip install -U langchain-community #Installing langchain community
from langchain.document_loaders import DirectoryLoader #Import after the kernel has been restarted

DATA_PATH = "/content/Type2DiabetesPapers"


def load_documents(data_path):
    """Return the documents produced by a ``DirectoryLoader`` pointed at ``data_path``."""
    return DirectoryLoader(data_path).load()


# Documents loaded from the research-paper folder.
mydocuments = load_documents(DATA_PATH)


In [None]:
#Scrape Mayo website article
import requests
from bs4 import BeautifulSoup
from langchain.schema import Document #Import the document class

# Function to get the contents of the target website:
def get_website_content(url, timeout=30):
    """Download ``url`` and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell on a stalled connection.

    Returns:
        The text extracted from the parsed HTML.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly on 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()  # Extract text content


# Fetch the article text into website_content
website_url = "https://www.mayoclinic.org/diseases-conditions/type-2-diabetes/symptoms-causes/syc-20351193"
website_content = get_website_content(website_url)

In [None]:
# Get Diabetes dataset file (Kaggle.json) from Kaggle to Kaggle drive and unzip
!pip install kaggle
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

! mkdir ~/.Kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.Kaggle/
! chmod 600 ~/.Kaggle/kaggle.json
!kaggle datasets download mathchi/diabetes-data-set
! unzip diabetes-data-set.zip

In [None]:
# Load the Kaggle dataset and combine it with my documents (recent papers) and the website text
!pip install unstructured
from langchain_community.document_loaders import CSVLoader

DATA_PATH_KaggleData = "/content/diabetes.csv"


def load_documents(data_path_KaggleData):
    """Return the documents a ``CSVLoader`` produces for ``data_path_KaggleData``.

    NOTE: this rebinds the name ``load_documents`` that the PDF-loading cell
    defines earlier in the notebook.
    """
    csv_loader = CSVLoader(file_path=data_path_KaggleData)
    return csv_loader.load()


# Documents loaded from the Kaggle CSV file.
KaggleData = load_documents(DATA_PATH_KaggleData)

# Consolidating data from 3 sources: Recent research papers, Kaggle and Mayo website.
# The web page is wrapped in a Document by hand, so record its URL as the
# "source" metadata; otherwise its chunks carry no indication of their origin.
AllData = mydocuments + KaggleData + [
    Document(page_content=website_content, metadata={"source": website_url})
]

In [None]:
#RAG Retriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 1 - chunking configuration: chunks of up to 1000 characters that overlap
# by 500 characters (length is measured with len); add_start_index asks the
# splitter to keep each chunk's start offset in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=500, length_function=len, add_start_index=True
)

# Step 2 - split every consolidated document into chunks.
chunks = text_splitter.split_documents(AllData)

#Creating embeddings in VectorDB
!pip install chromadb
!pip install tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from google.colab import userdata
# Embedding model; the API key is read from the Colab secret named "OpenAIKey".
embeddings = OpenAIEmbeddings(openai_api_key=userdata.get("OpenAIKey"))

# Step 3 - embed the chunks and store them in a Chroma vector store.
VectorDB = Chroma.from_documents(documents=chunks, embedding=embeddings)

# Step 4 - expose the vector store as a retriever for the RAG chains.
retriever = VectorDB.as_retriever()

In [None]:
#LLM
import os
from google.colab import userdata
from langchain.chat_models import ChatOpenAI


# Read the API key from Colab secrets, falling back to the OPENAI_API_KEY
# environment variable. userdata.get raises (rather than returning None) when
# the secret does not exist or the notebook has not been granted access to it,
# so those errors must be caught for the fallback to be reachable.
try:
    openai_api_key = userdata.get('OpenAIKey')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    openai_api_key = None

# Treat an empty value the same as a missing one.
if not openai_api_key:
    openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError("Please set either the 'OpenAIKey' in userdata or the 'OPENAI_API_KEY' environment variable.")

#Connect to LLM/ GPT model
model = ChatOpenAI(openai_api_key=openai_api_key, model="gpt-3.5-turbo")

In [None]:
# RAG Chain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering step; the {context} placeholder is
# where the documents chain inserts the retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Prompt layout: system instructions first, then the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Answering chain (model + prompt), then the retriever placed in front of it.
question_answer_chain = create_stuff_documents_chain(model, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# RAG chain with Chat history
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# Instructions for rewriting a follow-up question into a standalone one
# before it is sent to the retriever.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Question-rewriting prompt: instructions, prior turns, then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Retriever that is given the model and the rewriting prompt above.
history_aware_retriever = create_history_aware_retriever(
    model, retriever, contextualize_q_prompt
)

# Answering prompt: same system instructions as the plain RAG chain, plus the
# prior turns of the conversation.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Wire the answering chain behind the history-aware retriever.
question_answer_chain_wHistory = create_stuff_documents_chain(model, qa_prompt)
rag_chain_wHistory = create_retrieval_chain(
    history_aware_retriever, question_answer_chain_wHistory
)

In [None]:
# GUI Gradio
!pip install gradio
import gradio as gr
from langchain_core.messages import AIMessage, HumanMessage

def _content_to_text(content):
    """Flatten a chat message ``content`` value into plain text."""
    if isinstance(content, str):
        return content
    if content is None:
        return ""
    if isinstance(content, dict):
        # presumably a typed content block such as {"type": "text", "text": ...};
        # non-text blocks contribute nothing
        return str(content.get("text", ""))
    if isinstance(content, (list, tuple)):
        parts = (_content_to_text(part) for part in content)
        return "\n".join(part for part in parts if part)
    return str(content)


def _to_langchain_history(history):
    """Convert the history Gradio hands to ``predict`` into LangChain messages.

    Accepts both role/content dictionaries and (user, assistant) pairs, since
    the shape depends on the Gradio version and chatbot configuration.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            text = _content_to_text(turn.get("content"))
            if not text:
                continue
            role = turn.get("role")
            if role == "user":
                messages.append(HumanMessage(content=text))
            elif role == "assistant":
                messages.append(AIMessage(content=text))
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_text = _content_to_text(turn[0])
            assistant_text = _content_to_text(turn[1])
            if user_text:
                messages.append(HumanMessage(content=user_text))
            if assistant_text:
                messages.append(AIMessage(content=assistant_text))
    return messages


def predict(message, history):
    """Answer ``message`` using the history-aware RAG chain.

    The conversation context is rebuilt on every call from the ``history``
    argument supplied by the chat UI instead of being accumulated in a
    module-level list. A shared list would mix the conversations of every
    browser session using the app and would keep old turns after the user
    clears the chat.

    Args:
        message: The user's latest question.
        history: Previous turns of this conversation as supplied by Gradio.

    Returns:
        The chain's answer text.
    """
    response = rag_chain_wHistory.invoke(
        {"input": message, "chat_history": _to_langchain_history(history)}
    )
    return response["answer"]


# Chat UI: a chatbot pane plus a single-line textbox, wired to predict().
demo = gr.ChatInterface(
    predict,
    chatbot=gr.Chatbot(height=200, type="messages"),
    textbox=gr.Textbox(
        placeholder="Ask me anything about Type2 Diabetes",
        container=False,
        scale=7
    ),
    title="Type2 Diabetes Assistant",
    description="I can help answer questions about Type2 Diabetes symptoms, treatment and prevention.",
    theme="soft",
    )

# share expects a boolean; the string 'True' only worked because any
# non-empty string is truthy.
demo.launch(share=True)