In [17]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
# Extract text from PDF files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [19]:
extracted_data = load_pdf_files("E:\work\medical-chatbot\data")

In [20]:
extracted_data

[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2023-04-18T12:57:19+05:30', 'author': 'Gunja Chaturvedi', 'moddate': '2023-04-18T12:57:19+05:30', 'source': 'E:\\work\\medical-chatbot\\data\\Drugs Rules, 1945 (1).pdf', 'total_pages': 859, 'page': 0, 'page_label': '1'}, page_content='Central Drugs Standard Control Organization, Ministry of Health and Family Welfare, Govt. of India                                Page 1 of 859 \n*THE DRUGS RULES, 19451 \n[21st December, 1945] \n[As amended vide G.S.R. 823(E) dated 17-11-2022 (w.e.f. 01-08-2023] \n \nIn exercise of the powers conferred by 2[sections 6(2), 12, 33 and 33(N)] of \nthe Drugs  and Cosmetics Act, 1940 (23 of 1940), the Central Government is \npleased to make the following Rules:— \n \nPART I  \nPRELIMINARY \n \n1.    Short title, extent and commencement. —(1) These Rules may be \ncalled the Drugs 3[***] Rules, 1945. \n(2) They extend to the whole of India 4[***]. \n5[***

In [21]:
len(extracted_data)

859

In [22]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

In [25]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [26]:
minimal_docs

[Document(metadata={'source': 'E:\\work\\medical-chatbot\\data\\Drugs Rules, 1945 (1).pdf'}, page_content='Central Drugs Standard Control Organization, Ministry of Health and Family Welfare, Govt. of India                                Page 1 of 859 \n*THE DRUGS RULES, 19451 \n[21st December, 1945] \n[As amended vide G.S.R. 823(E) dated 17-11-2022 (w.e.f. 01-08-2023] \n \nIn exercise of the powers conferred by 2[sections 6(2), 12, 33 and 33(N)] of \nthe Drugs  and Cosmetics Act, 1940 (23 of 1940), the Central Government is \npleased to make the following Rules:— \n \nPART I  \nPRELIMINARY \n \n1.    Short title, extent and commencement. —(1) These Rules may be \ncalled the Drugs 3[***] Rules, 1945. \n(2) They extend to the whole of India 4[***]. \n5[***] \n2.    Definitions.—In these Rules, unless there is anything repugnant in the \nsubject or context— \n(a) "the Act" means the Drugs and Cosmetics Act, 1940 (23 of 19 40), as \namended from time to time; \n6[(aa) "biopharmaceutical cl

In [27]:
# Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [28]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 4151


In [15]:
texts_chunk

[Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and'),
 Document(metadata={'source': 'data\\Medical_book.pdf'}, page_content='Multimedia

In [29]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [30]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [31]:
vector = embedding.embed_query("Hello world")
vector

[-0.03447727486491203,
 0.03102317824959755,
 0.006734970025718212,
 0.026108985766768456,
 -0.03936202451586723,
 -0.16030244529247284,
 0.06692401319742203,
 -0.006441489793360233,
 -0.0474504791200161,
 0.014758856035768986,
 0.07087527960538864,
 0.05552763119339943,
 0.019193334504961967,
 -0.026251312345266342,
 -0.01010954286903143,
 -0.02694045566022396,
 0.022307461127638817,
 -0.022226648405194283,
 -0.14969263970851898,
 -0.017493007704615593,
 0.00767625542357564,
 0.05435224249958992,
 0.0032543970737606287,
 0.031725890934467316,
 -0.0846213847398758,
 -0.02940601296722889,
 0.05159561336040497,
 0.04812406003475189,
 -0.0033148222137242556,
 -0.058279167860746384,
 0.04196927323937416,
 0.022210685536265373,
 0.1281888335943222,
 -0.022338971495628357,
 -0.011656315997242928,
 0.06292839348316193,
 -0.032876335084438324,
 -0.09122604131698608,
 -0.031175347045063972,
 0.0526994913816452,
 0.04703482985496521,
 -0.08420311659574509,
 -0.030056199058890343,
 -0.02074483036

In [32]:
print( "Vector length:", len(vector))

Vector length: 384


In [33]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [34]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [35]:
from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [36]:
pc

<pinecone.pinecone.Pinecone at 0x16ef3222080>

In [37]:
from pinecone import ServerlessSpec 

index_name = "medical-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


index = pc.Index(index_name)

In [38]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [39]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

# Add more data to the existing Pinecone index

In [29]:
dswith = Document(
    page_content="dswithbappy is a youtube channel that provides tutorials on various topics.",
    metadata={"source": "Youtube"}
)

In [30]:
docsearch.add_documents(documents=[dswith])

['48ace028-8de4-4429-9060-8282e3a47d6f']

In [40]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [41]:
retrieved_docs = retriever.invoke("What documents are required for renewing a drug manufacturing license?")
retrieved_docs

[Document(id='3b1daa6a-1c81-449b-aea7-8b41fd347015', metadata={'source': 'E:\\work\\medical-chatbot\\data\\Drugs Rules, 1945 (1).pdf'}, page_content='(5) The applicant shall make adequate a rrangements for the storage of drugs \nmanufactured by him. \n373[(6) The applicant shall furnish to the licensing authority, if required to do so, \ndata on the stability of drugs which are likely to deteriorate for fixing the date of \nexpiry which shall be printed on the labels of such drugs on the basis of the date \nso furnished.] \n374[(7) The applicant shall, while applying for a licence to manufacture'),
 Document(id='d95c4941-d470-453e-aeb1-8ffbf940d629', metadata={'source': 'E:\\work\\medical-chatbot\\data\\Drugs Rules, 1945 (1).pdf'}, page_content='any, as the licensing authority may specify and such record shall be open to \nthe inspection of any Inspector authorized in that behalf by the licensing \nauthority: \n60[Provided that in res pect of the sale or distribution of drugs specified

In [42]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model="gpt-4o")

In [43]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
system_prompt = (
    """ You are PharmaRegAssistant, an AI system that answers questions strictly using the ingested regulatory documents, especially the Drugs Rules, 1945 and CDSCO guidelines.

Instructions:
1. Use only the provided documents. Do not guess or use external knowledge.
2. Every response must contain ONLY:
   a) Answer: 1–3 sentences, concise and factual.
   b) Citation: exact rule/section/schedule AND the PDF page number where the information appears.
3. If the information is not found in the documents, respond with: "Not available in the Drugs Rules, 1945 corpus."
4. Do not add explanations, opinions, or any additional text beyond the two required fields.
5. Keep tone professional and neutral.

Output format:
Answer: <your answer>
Citation: Rule/Section/Schedule <identifier>, PDF Page <page number>
Context:
    "{context}"
    """
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [53]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [54]:
response = rag_chain.invoke({"input": "What are the conditions that must be fulfilled before a manufacturing license can be granted?"})
print(response["answer"])

a) Direct Answer: Before a manufacturing license is granted, the applicant must ensure that the factory premises comply with conditions prescribed in Schedule M. Additionally, adequate space, plant, and equipment for manufacturing operations must be provided.

b) Supporting Citations: Drugs Rules, 1945, Section 71. (2)-(3).

c) Explanation: The Drugs Rules, 1945 requires compliance with Schedule M, which details the essential requirements for factory premises, space, plant, and equipment to ensure proper manufacturing practices.

d) Action Steps:
   1. Review and ensure the factory layout meets the requirements outlined in Schedule M.
   2. Verify that adequate space, plant, and equipment are available for manufacturing.
   3. Prepare necessary documentation and submit it with the license application.


In [55]:
response = rag_chain.invoke({"input": "What form must be used when applying for a manufacturing license for Schedule C & C1 drugs?"})
print(response["answer"])

a) Direct Answer: The application for a manufacturing license for Schedule C & C1 drugs must be made using Form 27.

b) Supporting Citations: Refer to Rule 75(a) of the Drugs Rules, 1945.

c) Explanation: According to the Drugs Rules, 1945, Rule 75(a) specifies that Form 27 is used for applications to manufacture drugs listed in Schedules C and C1, except those falling under Part XB and Schedule X.

d) Action Steps: Obtain Form 27 from the Drugs Rules, 1945, ensure all required details are correctly filled, and submit it to the corresponding licensing authority.


In [56]:
response = rag_chain.invoke({"input": "How long should manufacturing and batch records be preserved?"})
print(response["answer"])

a) Direct Answer: Manufacturing and batch records must be preserved for a period of one year after the expiry of a batch or for three years, whichever is later.

b) Supporting Citations: This requirement is specified in the Drugs Rules, 1945, Clause (6) under the storage requirements.

c) Explanation: These records are crucial for verifying processes, product integrity, and compliance with regulatory standards. They help in traceability and accountability in case of quality or compliance audits.

d) Action Steps: Ensure all manufacturing and batch records are properly archived and accessible for at least three years or one year after the product's expiry, whichever is longer. Set up a record management system to track these durations automatically.


In [57]:
response = rag_chain.invoke({"input": "What is the procedure for sampling drugs by a drug inspector?"})
print(response["answer"])

a) Direct Answer: A drug inspector must inform the person from whom the drug sample is being taken about the purpose via a written intimation using Form 17. Additionally, the inspector can detain the drugs until the laboratory report is received.

b) Supporting Citations:
   - Form of intimation: Rule 56, Drugs Rules, 1945, Page 34.

c) Explanation: When a sample of a drug is taken for testing or analysis, the drug inspector must provide a written notice to the person from whom the sample is taken, specifying the purpose of sampling, using a predefined format (Form 17).

d) Action Steps: 
   1. Drug inspectors should carry Form 17 when taking samples.
   2. Present and fill out Form 17, informing the relevant person about the sampling intent.
   3. If drugs are detained, inform about the procedure and possible outcomes after receiving the laboratory report.
