In [135]:
import fitz  # PyMuPDF
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document


def extracted_data(pdf_path):
    """Read a PDF and return one LangChain ``Document`` per page.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        list: One ``Document`` per page, in page order. Each document's
        ``page_content`` is the text returned by ``page.get_text()``; no
        metadata is attached.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text`` is
        propagated unchanged. The PDF is closed before the exception leaves
        this function.
    """
    doc = fitz.open(pdf_path)
    try:
        # Build the complete list while the document is still open.
        return [Document(page_content=page.get_text()) for page in doc]
    finally:
        # Release the file handle even when extraction fails part-way through.
        doc.close()


In [None]:
def text_split(data, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks.

    Args:
        data: Documents to split; handed to ``split_documents`` as-is.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this notebook originally hard-coded.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

# Location of the source PDF. Set the PDF_PATH environment variable to point at
# your own copy; the path used when this notebook was first run is kept as the
# fallback so existing behaviour is unchanged when PDF_PATH is not set.
import os

pdf_path = os.environ.get(
    "PDF_PATH",
    r"C:\Users\rithi\Desktop\GEN_AI\chat_bot\data\Medicines_for_Cats_and_Dogs_final.pdf",
)

# One Document per PDF page, then split those pages into chunks.
data = extracted_data(pdf_path)
text_chunks = text_split(data)

print("Length of Text Chunks:", len(text_chunks))

Length of Text Chunks: 132


In [None]:
import re
from transformers import AutoTokenizer

# Load the pretrained "bert-base-uncased" tokenizer. clean_and_tokenize_chunks
# reads this module-level name, so this line must run before that function is called.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def clean_and_tokenize_chunks(text_chunks):
    """Normalise each chunk's text and tokenize it with the module-level ``tokenizer``.

    For every item in ``text_chunks`` the ``page_content`` is lower-cased and
    every character that is not ``a-z``, ``0-9`` or whitespace is removed
    (removed, not replaced by a space). The cleaned string is then passed to
    ``tokenizer.tokenize``.

    Args:
        text_chunks: Iterable of objects exposing a ``page_content`` string.

    Returns:
        tuple: ``(cleaned_texts, all_tokens)``, two lists in input order, where
        ``all_tokens[i]`` is the tokenizer output for ``cleaned_texts[i]``.
    """
    strip_pattern = re.compile(r'[^a-z0-9\s]')

    # Clean and tokenize one chunk at a time so each cleaned string stays
    # paired with its own tokens.
    pairs = [
        (cleaned, tokenizer.tokenize(cleaned))
        for cleaned in (
            strip_pattern.sub('', chunk.page_content.lower())
            for chunk in text_chunks
        )
    ]

    cleaned_texts = [cleaned for cleaned, _ in pairs]
    all_tokens = [tokens for _, tokens in pairs]
    return cleaned_texts, all_tokens


In [142]:
# Run the cleaning/tokenization pass over every chunk produced by text_split.
# NOTE(review): neither result is referenced by the later cells visible here
# (the vector store is built from text_chunks) — confirm this step is still needed.
cleaned_texts, all_tokens = clean_and_tokenize_chunks(text_chunks)




In [143]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the HuggingFace embedding model used by this notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded, whose
            query embeddings have length 384 in the recorded run. If you pick
            another model, the ``dimension`` used when creating the Pinecone
            index has to be adjusted to that model's vector length.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [144]:
# Create the embedding model once; later cells reuse this object both for the
# sample query and for the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [145]:
# Sanity check: embed a sample string and print the length of the resulting vector.
# The Pinecone index further down is created with dimension=384, so this should print 384.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [146]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the API keys
# are read back from os.environ in the next cell.
load_dotenv()

True

In [147]:
# `os` is imported here because this is the first cell that uses it; the other
# `import os` statements sit in later cells, so on a fresh kernel this cell
# would otherwise fail with NameError.
import os

# Read the API keys from the environment. `.get` returns None when a variable
# is not set, so a missing key only surfaces when it is first used.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

In [154]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "chat-bot"

# Create the index only when it does not exist yet, so this cell can be re-run
# (e.g. Restart & Run All) without failing on an already-existing index.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # vector length printed by the "Hello world" embedding check
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Last expression of the cell: display the index description whether the index
# was just created or already existed.
pc.describe_index(index_name)

{
    "name": "chat-bot",
    "metric": "cosine",
    "host": "chat-bot-cbqrlrj.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [155]:
import os
# Write the keys read earlier back into the process environment — presumably
# for PineconeVectorStore and ChatGroq, which are constructed below without an
# explicit API key.
# NOTE(review): if either value is None (variable was not set) these assignments
# will fail, because os.environ only accepts strings — confirm the keys are set.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [156]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# NOTE(review): re-running this cell presumably uploads the same chunks again
# (the retrieved documents shown later carry UUID-style ids) — confirm before
# executing it more than once against the same index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [157]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Connect to the index that already exists instead of embedding the chunks
# again: only the index name and the embedding model are passed, no documents.
# This rebinds `docsearch`, replacing the object created by from_documents.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [158]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1658054baf0>

In [159]:
# Expose the vector store as a retriever using similarity search with k=3,
# i.e. three documents are requested per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [160]:
# Smoke-test the retriever on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What is Acne?")

In [161]:
# Show the retrieved documents (id, metadata and page_content) for inspection.
retrieved_docs

[Document(id='99ab44a1-50d4-4d45-9b22-eb20a6a5b07d', metadata={}, page_content='affecting the ocular surface and include idoxuridine and trifluridine.'),
 Document(id='bfaee2bc-df51-4ee0-9fe1-058c72637dd4', metadata={}, page_content='a variety of indications, including but not limited to otitis externa, gingivitis, periodontal disease, \nsuperficial skin infections, topical disinfection of wounds and perioperative skin antisepsis. \n \n7) \nPovidone-iodine \nThis iodophor antiseptic is widely used as an alternative of chlorhexidine gluconate for \nperioperative skin antisepsis, post-operative application to surgical incisions, and emergency \nantisepsis in patients with minor lacerations, abrasions and burns.'),
 Document(id='f78cf8ce-d3ad-4878-b055-83fb04fc8687', metadata={}, page_content='also useful for treatment of infections caused by bacteria that are resistant to first line agents. \n \n \nTopical administration \n \nCore list \n \n1) \nFusidic acid \nThis fusidane is the first 

In [163]:
!pip install langchain_groq

Collecting langchain_groq
  Downloading langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_groq-0.3.2-py3-none-any.whl (15 kB)
Installing collected packages: langchain_groq
Successfully installed langchain_groq-0.3.2


In [164]:
from langchain_groq import ChatGroq
# Groq-hosted chat model that writes the final answers. No API key is passed
# here, so ChatGroq presumably picks up GROQ_API_KEY from the environment — TODO confirm.
# NOTE(review): check that "llama3-8b-8192" is still offered by Groq before re-running.
llm =ChatGroq(model_name="llama3-8b-8192",temperature=0.4, max_tokens=500)

In [165]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the assistant. The sentences are joined with single
# spaces, followed by a blank line and the {context} placeholder that the
# prompt template fills in.
system_prompt = "\n\n".join(
    [
        " ".join(
            [
                "You are an assistant for question-answering tasks.",
                "Use the following pieces of retrieved context to answer the question.",
                "If you don't know the answer, say that you don't know.",
                "Use three sentences maximum and keep the answer concise.",
            ]
        ),
        "{context}",
    ]
)


# Two-message chat template: the system message carries the instructions and
# the {context} placeholder, the human message carries the user's {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [166]:
# Assemble the RAG pipeline in two steps:
#   1. question_answer_chain combines the LLM with the prompt defined above.
#   2. rag_chain puts the retriever in front of it; it is invoked with an
#      "input" key and its result exposes the reply under "answer" (see the
#      cells below).
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [167]:
# Ask about a topic the indexed PDF does not cover; in the recorded run the
# model answered that it does not know, as the system prompt instructs.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

I don't know. The provided context does not mention Acromegaly and gigantism.


In [168]:
# Second out-of-scope probe; this rebinds `response` from the previous cell.
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])

I don't know. The provided context appears to be related to veterinary medicine and essential medicines, but it doesn't mention "stats".
