In [46]:
import os
from dotenv import load_dotenv
load_dotenv()
PINECONE_KEY = os.environ['PINECONE_API_KEY']



In [66]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import Pinecone 
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import pinecone
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [None]:
# PINECONE_API_ENV = 

# Extract Data From Pdf 

In [29]:
#Load Data From Pdf Using PyPDFLoader
def load_pdf(data_directory):
    loader = DirectoryLoader(
        path=data_directory ,
        glob="*.pdf" ,
        loader_cls=PyPDFLoader
    )
    documents = loader.load() 
    
    return documents 

In [30]:
data = load_pdf(data_directory="D:\Data Science\GenAI\MedicalChatBotUsingLlama2\Data")

In [32]:
# data

# Convert Data Into Chunksa: -

In [33]:
def get_chunks_from_data(data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size =500 ,
        chunk_overlap=20
    )
    splitted_chunk = splitter.split_documents(data)
    return splitted_chunk 



In [34]:
text_chunks = get_chunks_from_data(data=data)

In [35]:
len(text_chunks)

6983

In [38]:
text_chunks[3]

Document(page_content='Multimedia Content\nKelly A. Quin, Editor, Imaging and Multimedia Content\nLeitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,\nImage Catalogers\nPamela A. Reed, Imaging Coordinator\nRandy Bassett, Imaging Supervisor\nRobert Duncan, Senior Imaging Specialist\nDan Newell, Imaging Specialist\nChristine O’Bryan, Graphic Specialist\nMaria Franklin, Permissions Manager\nMargaret A. Chamberlain, Permissions Specialist\nMichelle DiMercurio, Senior Art Director\nMike Logusz, Graphic Artist', metadata={'source': 'D:\\Data Science\\GenAI\\MedicalChatBotUsingLlama2\\Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 2})

# Convert Chunk In Embeddings and Store them In Vector DataBase

In [39]:
def get_embeddigng_from_huggingface():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [40]:
embeddings = get_embeddigng_from_huggingface()

In [43]:
type(embeddings.embed_query("Hello Pramod"))

list

In [44]:
len(embeddings.embed_query("Hello Pramod"))

384

In [68]:
#Pinecone code
# from pinecone import Pinecone 

pc = Pinecone(api_key=PINECONE_KEY)
index_name = "medicalchatbot"
import time
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]


# check if index already exists (it shouldn't if this is first time) 
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name ,
        dimension=384 ,
        metric='cosine'
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
    


In [69]:
index = pc.Index(index_name)  # Help to connect your index 
time.sleep(1)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [70]:
vectorstore_from_texts = PineconeVectorStore.from_texts(
        [t.page_content for t in text_chunks],
        index_name=index_name,
        embedding=embeddings
    )

In [74]:
print(len(text_chunks))

6983


langchain_pinecone.vectorstores.PineconeVectorStore

In [None]:
#TO Add More Record
# vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
# vectorstore.add_texts(["More text to embed and add to the index!"])

In [55]:
#Create embedding from chunk
chunk_embedding = [embeddings.embed_query(text.page_content) for text in text_chunks]

In [71]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 7303}},
 'total_vector_count': 7303}

In [72]:
print(len(text_chunks))
print(len(chunk_embedding))

6983
6983


In [73]:
#Store Those Embeddings into vector
#You can use these method as well
# vectors = [(str(i), chunk_embedding[i], {'text': text_chunks[i].page_content}) for i in range(len(text_chunks))]
# index.upsert(vectors)

In [119]:
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
query = "What is Nerve conduction testing—?"
docs = vectorstore.similarity_search(query ,k=3)
print(docs)

[Document(page_content='Feske, eds. New York: Churchill Livingstone, 1996.\nRichard RobinsonKEY TERMS\nNerve conduction velocity test —A test of the\nspeed of conduction of nerves, performed on the\nnerves in the arm and leg.Evoked responses seeEvoked potential\nstudies\nExanthema subitum seeRoseola\nExercise\nDefinition\nExercise is physical activity that is planned, struc-\ntured, and repetitive for the purpose of conditioning any\npart of the body. Exercise is utilized to improve health,'), Document(page_content='force. The needle may be repositioned in the same mus-\ncle for further recording. Other muscles may be tested as\nwell. A typical session lasts from 30–60 minutes.\nA slightly different test, the nerve conduction veloci-\nty test , is often performed at the same time with the same\nequipment. In this test, stimulating and recording elec-\ntrodes are used, and small electrical shocks are applied to\nmeasure the ability of the nerve to conduct electrical sig-'), Document(pag

In [88]:

def retrive_query(query, k):
    query_embedding = embeddings.embed_query(query)
    results = index.query(vector=query_embedding, top_k=k, include_metadata=True)
    matching_results = [result['metadata']['text'] for result in results['matches']]
    return matching_results

In [89]:
retrive_query(query=query ,k=3)


['Feske, eds. New York: Churchill Livingstone, 1996.\nRichard RobinsonKEY TERMS\nNerve conduction velocity test —A test of the\nspeed of conduction of nerves, performed on the\nnerves in the arm and leg.Evoked responses seeEvoked potential\nstudies\nExanthema subitum seeRoseola\nExercise\nDefinition\nExercise is physical activity that is planned, struc-\ntured, and repetitive for the purpose of conditioning any\npart of the body. Exercise is utilized to improve health,',
 'force. The needle may be repositioned in the same mus-\ncle for further recording. Other muscles may be tested as\nwell. A typical session lasts from 30–60 minutes.\nA slightly different test, the nerve conduction veloci-\nty test , is often performed at the same time with the same\nequipment. In this test, stimulating and recording elec-\ntrodes are used, and small electrical shocks are applied to\nmeasure the ability of the nerve to conduct electrical sig-',
 'formed to measure how fast impulses travel through the\

In [90]:
promt_template = """
Use the following information to answer the question.
If you dont know just say it dont try to make up an answer.

context : {context} 
question : {question}

Only helpful answer.
Helpful answer: 
"""

In [91]:
prompt = PromptTemplate(
    template= promt_template ,
    input_variables=["context" ,"question"]
)
chain_type_kwargs = {"prompt": prompt}

In [106]:
llm = CTransformers(
    model=r"D:\Data Science\GenAI\MedicalChatBotUsingLlama2\model\llama-2-7b-chat.ggmlv3.q4_0.bin" ,
    model_type = "llama" ,
    config={
        'max_new_tokens' :512 ,
        'temperature':0.3
    }
)

In [117]:
print(type(vectorstore_from_texts))
print(type(vectorstore))

<class 'langchain_pinecone.vectorstores.PineconeVectorStore'>
<class 'langchain_pinecone.vectorstores.PineconeVectorStore'>


In [125]:
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore_from_texts.as_retriever(search_kwargs={'k': 2}),
#     return_source_documents=True,
#     chain_type_kwargs=chain_type_kwargs  # Replace with appropriate kwargs for your chain_type
# )
# qa=RetrievalQA.from_chain_type(
#     llm=llm, 
#     chain_type="stuff", 
#     retriever=vectorstore.as_retriever(search_kwargs={'k': 2}),
#     return_source_documents=True, 
#     chain_type_kwargs=chain_type_kwargs) 

query = "what is acne"
vectorstore.similarity_search(query ,k=3) 

# query = "What is Nerve conduction testing—?"
# docs = vectorstore.similarity_search(query ,k=3)
# print(docs)

[Document(page_content='Corticosteroids —A group of anti-inflammatory\nsubstances often used to treat skin conditions.\nImmune response —The protective reaction by the\nimmune system against foreign antigens (sub-\nstances that the body perceives as potentially dan-\ngerous). The immune system combats disease by\nneutralizing or destroying antigens.contact dermatitis becomes a chronic and disabling con-\ndition that can have a profound effect on employability\nand quality of life.\nPrevention'),
 Document(page_content='ics and personal care products; latex items such as gloves\nand condoms; and formaldehyde. Many people find that\nthey are allergic to the nickel in inexpensive jewelry. ACD\nis usually confined to the area of skin that comes in contact\nwith the allergen, typically the hands or face. Symptoms\nrange from mild to severe and resemble those of ICD; a\npatch test may be needed to determine which kind of con-\ntact dermatitis a person is suffering from.\nDiagnosis'),
 Docume

In [103]:
from langchain.chains.question_answering import load_qa_chain

In [None]:

def retrive_query(query, k):
    query_embedding = embeddings.embed_query(query)
    results = index.query(vector=query_embedding, top_k=k, include_metadata=True)
    matching_results = [result['metadata']['text'] for result in results['matches']]
    return matching_results

In [112]:
#Search Answer from Vectors DB 
def retrive_answers(query):
    doc_search = retrive_query(query ,2)
    llm = CTransformers(
    model=r"D:\Data Science\GenAI\MedicalChatBotUsingLlama2\model\llama-2-7b-chat.ggmlv3.q4_0.bin" ,
    model_type = "llama" ,
    config={
        'max_new_tokens' :512 ,
        'temperature':0.3
    }
)
    chain = load_qa_chain(llm=llm ,chain_type='stuff')
    response = chain(
        {
            "input_documents" : doc_search ,
            "question": query
        }
    )
    return response

In [113]:
out_query = "What is Nerve conduction testing—?"
answer = retrive_answers(out_query)
print(answer)

['Feske, eds. New York: Churchill Livingstone, 1996.\nRichard RobinsonKEY TERMS\nNerve conduction velocity test —A test of the\nspeed of conduction of nerves, performed on the\nnerves in the arm and leg.Evoked responses seeEvoked potential\nstudies\nExanthema subitum seeRoseola\nExercise\nDefinition\nExercise is physical activity that is planned, struc-\ntured, and repetitive for the purpose of conditioning any\npart of the body. Exercise is utilized to improve health,', 'force. The needle may be repositioned in the same mus-\ncle for further recording. Other muscles may be tested as\nwell. A typical session lasts from 30–60 minutes.\nA slightly different test, the nerve conduction veloci-\nty test , is often performed at the same time with the same\nequipment. In this test, stimulating and recording elec-\ntrodes are used, and small electrical shocks are applied to\nmeasure the ability of the nerve to conduct electrical sig-']


AttributeError: 'str' object has no attribute 'page_content'