In [13]:
import openai
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os


In [19]:
from dotenv import load_dotenv
from pinecone import Pinecone

# Load the .env file
load_dotenv()

# Get the Pinecone API key from environment variables
pinecone_api_key = os.getenv('PINECONE_API_KEY')
openai.api_key = os.getenv('OPENAI_API_KEY')

In [3]:
#Extract data from the PDF
def load_pdf(directory):
    # Initialize the PDF loader with the directory containing PDF files
    loader = PyPDFDirectoryLoader(directory)
    
    # Load the data
    documents = loader.load()

    return documents

In [4]:
# Assuming the PDFs are stored in a directory called "PDFs" within the current working directory
extracted_documents = load_pdf("data")


In [5]:
extracted_documents[0]

Document(metadata={'source': 'data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'page': 0}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION')

In [10]:
#Create text chunks
def text_splitter(extracted_documents):
    text_splitter= RecursiveCharacterTextSplitter(
    chunk_size=600,         
    chunk_overlap=80,      
    length_function=len,
    
        
)
    text_chunks= text_splitter.split_documents(extracted_documents) 
    
    return text_chunks

    

In [11]:
text_chunks= text_splitter(extracted_documents)

In [12]:
len(text_chunks)

8943

In [14]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [15]:
embeddings= download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [21]:
# Assuming embeddings is an object with a method embed_query that returns an embedding vector
query_result = embeddings.embed_query("Hello world")

# Printing the length of the resulting query embedding
print("Length:", len(query_result))


Length: 384


In [17]:
import pinecone
from pinecone import ServerlessSpec
#from langchain.vectorstores import Pinecone as PineconeStore
from langchain_pinecone import PineconeVectorStore

In [20]:
pc = Pinecone(api_key=pinecone_api_key)

In [31]:
def create_index(index_name):
    pc.create_index(
    name=index_name,
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [32]:
index_name= "medical-chatbot"

In [33]:
create_index(index_name)

In [34]:
# Create Embeddings for each of the Text Chunks
docsearch = PineconeVectorStore.from_texts(
    [t.page_content for t in text_chunks],  # Extract page content from each text chunk
    embeddings,  # Embedding model used to generate embeddings
    index_name=index_name  # Name of the Pinecone index to store vectors
)

In [35]:
# Define a query
query = "What are allergies"

# Perform a similarity search using the Pinecone index
docs = docsearch.similarity_search(query, k=3)

# Print the results
print("Result:", docs)

Result: [Document(id='1b70232d-8907-4cd5-b658-a08d27f6b803', metadata={}, page_content='Allergy —Altered body reaction, usually hypersen-\nsitivity, as a response to exposure to a specific sub-stance.\nAntibody —Any of a large number of proteins that\nare produced after stimulation by an antigen andact specifically against the antigen in an immuneresponse.\nAntihistamine —A drug that inhibits the actions of\nhistamine. Histamine causes dilatation of capillar-ies, contraction of smooth muscle, and stimulationof gastric acid secretion.\nAntitoxin —An antibody that is capable of neutraliz-'), Document(id='547412ee-e1cd-454c-a91d-b3d94fe21fb1', metadata={}, page_content='KEY TERMS\nAllergen —Any substance that irritates only those\nwho are sensitive (allergic) to it.\nAsthma —Wheezing (labored breathing) due to\nallergies or irritation of the lungs.\nDecongestant —Medicines that shrink blood ves-\nsels and consequently mucus membranes. Pseu-doephedrine, phenylephrine, and phenylpropano-lam

In [None]:
from some_library import PromptTemplate  # Replace 'some_library' with the actual import path

# Define the template with placeholders for dynamic content
template_prompt_template = "Context: {context}, Question: {q}"

# Create an instance of PromptTemplate with specified parameters
PROMPT = PromptTemplate(
    template=template_prompt_template,
    input_variables=["context", "q"]
)


In [36]:
# llm=CTransformers(model="meta-llama/Llama-3.2-1B-Instruct",
#                   model_type="llama",
#                   config={'max_new_tokens':512,
#                           'temperature':0.8})

llm = OpenAI()

  llm = OpenAI()


In [38]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

query = "What is allergy"

qa.run(query)

' Allergy is an altered reaction of the body to a specific substance, causing symptoms such as itching, swelling, and inflammation.'