# Author: Riya Chougule

### DATE: 13th February, 2024

### Description: This Python notebook presents a Retrieval Augmented Generation chatbot based on FDA (Food and Drug Administration) drug guidelines. With the FDA regularly publishing thousands of guidelines annually, including over 2100 documents solely dedicated to drugs, our chatbot streamlines user interactions by eliminating the need to manually explore extensive documentation. Leveraging input PDF data, the chatbot swiftly provides accurate responses to user queries based on FDA Drug guidelines. Additionally, the notebook integrates a user-friendly interface via Gradio, enhancing accessibility and usability.

In [None]:
!pip install PyPDF2
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain
!pip install langchain
!pip install openai
!pip install PyPDF
!pip install gradio

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import WebBaseLoader
from langchain import hub
import PyPDF2

In [None]:

# Set up environment variables if needed.
# The two key values are placeholders: replace the text inside the quotes with
# real keys. They are written as string literals so that this cell parses;
# a bare <...> token is a SyntaxError.
os.environ['OPENAI_API_KEY'] = '<YOUR OPENAI KEY>'
os.environ['LANGCHAIN_API_KEY'] = '<YOUR LANGCHAIN KEY>'
# LangChain tracing configuration (tracing flag, endpoint URL, project name).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.langchain.plus'
os.environ['LANGCHAIN_PROJECT'] = 'Explore Evaluating index using LLM'

# Define functions for loading API keys and PDF processing
def load_api_key(file_path):
    """Read an API key from a text file.

    Args:
        file_path: Path of the file holding the key.

    Returns:
        The file contents with surrounding whitespace removed, or None when
        the file could not be read (a message is printed in that case).
    """
    try:
        with open(file_path, 'r') as key_file:
            contents = key_file.read()
    except FileNotFoundError:
        print("Error: File not found.")
        return None
    except Exception as err:
        # Any other failure (permissions, decoding, ...) is reported, not raised.
        print(f"Error: {err}")
        return None
    return contents.strip()

In [None]:
def load_pdf_text(path):
    """Return the extracted text of every page of a PDF, concatenated in page order.

    Args:
        path: Location of the PDF file; it is opened in binary mode.

    Returns:
        One string made of each page's extract_text() output, with no
        separator inserted between pages.
    """
    with open(path, "rb") as pdf_file:
        # The reader works on the open handle, so extraction stays inside `with`.
        pages = PyPDF2.PdfReader(pdf_file).pages
        return "".join(page.extract_text() for page in pages)

# Load API keys

file_path_openai = "/content/drive/MyDrive/Data/api_key.txt"
file_path_langchain = "/content/drive/MyDrive/Data/langchain-api.txt"
openai_api_key = load_api_key(file_path_openai)
langchain_api_key = load_api_key(file_path_langchain)

# load_api_key returns None when a key file cannot be read, and assigning None
# into os.environ raises TypeError. Export only the keys that were actually
# loaded, so any value already present in the environment is left untouched.
if openai_api_key:
    os.environ['OPENAI_API_KEY'] = openai_api_key
else:
    print(f"Warning: no OpenAI API key loaded from {file_path_openai}.")

if langchain_api_key:
    os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
else:
    print(f"Warning: no LangChain API key loaded from {file_path_langchain}.")

# Define the directory containing PDF files
pdf_directory = "/content/drive/MyDrive/Data/PDFs"


In [None]:
# Initialize the vector store (kept as the fallback when no PDF yields any chunk)
vectorstore = Chroma()

# The splitter settings are the same for every file, so build it once.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Collect the chunks of every PDF in the directory before indexing.
all_splits = []
for filename in sorted(os.listdir(pdf_directory)):
    if not filename.endswith(".pdf"):
        continue
    file_path = os.path.join(pdf_directory, filename)
    loader = PyPDFLoader(file_path)
    data = loader.load()

    # Split the PDF text into chunks
    splits = text_splitter.split_documents(data)
    all_splits.extend(splits)

# Build the index with a single from_documents call over all chunks, so the
# store handed to the retriever explicitly covers every PDF instead of
# `vectorstore` being rebound once per file.
if all_splits:
    vectorstore = Chroma.from_documents(documents=all_splits,
                                        embedding=OpenAIEmbeddings())

  warn_deprecated(


In [None]:
# Define the RAG framework components
# Retriever view over the vector store, using as_retriever() defaults.
retriever = vectorstore.as_retriever()
# Prompt pulled from the LangChain hub by its identifier
# (presumably needs network access at this point; confirm).
prompt = hub.pull("rlm/rag-prompt")
# Chat model used to write the answers; temperature is set to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Post-processing function
def format_docs(docs):
    """Join the page_content of each document into one string.

    Args:
        docs: Iterable of objects exposing a `page_content` attribute.

    Returns:
        The contents in input order, separated by one blank line.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Define the RAG chain
# Stages are piped left to right:
#   1. the dict builds the prompt input: "context" is the retriever output
#      passed through format_docs, "question" is the chain input passed
#      through unchanged (presumably the pulled prompt expects these two
#      keys; confirm against the hub prompt);
#   2. prompt, then llm;
#   3. StrOutputParser() as the final stage.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Function to generate response
def generate_response(Question):
    """Invoke rag_chain with Question and return whatever the chain returns."""
    return rag_chain.invoke(Question)

In [None]:
# USER INTERFACE
import gradio as gr
def generate_response(Question):
    """Answer a chat message: canned replies for small talk, the RAG chain otherwise.

    Args:
        Question: Text entered by the user.

    Returns:
        A fixed reply for a greeting, an expression of thanks, or blank
        input; otherwise the result of rag_chain.invoke(Question).
    """
    # Small-talk phrases, stored in normalized form (lowercase, no trailing "!" or ".").
    greetings = {"hi", "hello", "hey", "hola"}
    thank_you = {"thank you", "thanks"}

    # Normalize a copy for matching only: surrounding whitespace, letter case
    # and trailing "!" / "." must not decide whether "Hey" or "thanks!" match.
    normalized = (Question or "").strip().lower().rstrip("!. ")

    if not normalized:
        # Nothing to search for, so skip the retrieval/LLM call.
        return "Please enter a question."
    if normalized in greetings:
        return "Hello! How can I assist you today?"
    if normalized in thank_you:
        return "You're welcome! If you have any more questions or if there's anything else I can assist you with, please don't hesitate to let me know."

    # Pass the text exactly as typed: lowercasing it would alter
    # case-sensitive terms in the query (e.g. "CAR-T", "FDA").
    return rag_chain.invoke(Question)

# Create the Gradio interface (title and description carry inline HTML styling)
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="<div style='padding-left: 520px; font-family: Zefani; font-size: 40px;'>RegulEase</div>",
    description="<div style='padding-left: 470px; font-size: x-large; font-family: Zefani; padding-left: 352px; '>Navigating FDA Compliance: Your Fast-Track Guide</div> <br><img src='https://today.uconn.edu/wp-content/uploads/2021/11/AdobeStock_333930165-scaled.jpeg' style='width: 850px; padding-left: 365px; height: 170px;'>",
    examples=[
        ["As a manufacturer are there any specific recommendations I need to follow for multi-site manufacturing for CAR-T cell therapy?"],
        ["Are there any alternative approaches or methodologies recognized by regulatory authorities for demonstrating bioequivalence, particularly for complex or modified-release formulations?"],
        ["While developing CART cell therapy for pediatric patients, what considerations I should take into account?"]
    ],
    # NOTE(review): no `theme` argument is passed, so Gradio's default theme is
    # used. The legacy name "compact" is presumably no longer a built-in theme
    # and would be looked up remotely instead; confirm against the installed
    # Gradio version before re-adding it.
    # Flagging mode is given as a string; a bare boolean is presumably only
    # accepted as a deprecated alias for "manual" (verify for the installed
    # Gradio version).
    allow_flagging="manual",
)

# Launch the Gradio interface on a fixed local port
iface.launch(server_port=4409)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Sorry, we can't find the page you are looking for.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://dc54934ed9caf9b134.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


