In [9]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os
from dotenv import load_dotenv

from typing import List

In [2]:
def load_pdf_files(data_directory: str) -> List[str]:
    """
    Load PDF files from a specified directory.

    Args:
        data_directory (str): The directory path containing PDF files.

    Returns:
        List[str]: A list of text content extracted from the PDF files.
    """
    # Create a DirectoryLoader instance to load PDF files
    pdf_loader = DirectoryLoader(
        data_directory,            # Specify the directory to search for PDF files
        glob="*.pdf",              # Specify the pattern for PDF files
        loader_cls=PyPDFLoader    # Use PyPDFLoader class to load PDF content
    )

    # Load PDF files and extract text content
    docs = pdf_loader.load()

    # Return the extracted text content from PDF files
    return docs

In [3]:
loaded_pdf_content = load_pdf_files("Data/")

In [5]:
loaded_pdf_content

[Document(page_content='1\nCARE SERIES BOOKLET\nWHAT YOU NEED TO KNOW AND \nDO ABOUT AN SMA DIAGNOSISGUIDE FOR HEALTHCARE PROVIDERSCURE SMA', metadata={'source': 'Data\\1805262021_HCP_Cure-SMA_What-You-Need-to-Know_Booklet_v2.pdf', 'page': 0}),
 Document(page_content='2Dear Healthcare Provider, \nYou are likely receiving this guide because one of your patients is suspected to have \nspinal muscular atrophy (SMA) following newborn screening. SMA is a rare genetic \ncondition that many health professionals never see.  \nThis guide is intended to provide you with a foundation for understanding SMA. \nHere are the most important things to know: \n• Treatment is available.  \n• You may need to act quickly. \nDo not wait for signs of SMA to consider treatment options with your patient’s \nparents or caregivers. If you wait until you notice the muscle weakness, which is the \nhallmark sign of SMA, your patient will have already lost some function that may \nnever be regained.  \nWe have resou

In [6]:
def text_chunking(loaded_pdf_content: str) -> List[str]:
    """
    Chunk the loaded PDF content into smaller segments.

    Args:
        loaded_pdf_content (str): The text content loaded from PDF files.

    Returns:
        List[str]: A list of text chunks, each representing a smaller segment of the original content.
    """
    # Initialize a RecursiveCharacterTextSplitter instance
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,          # Specify the size of each text chunk
        chunk_overlap=20         # Specify the overlap between adjacent chunks
    )

    # Split the loaded PDF content into smaller segments (chunks)
    chunks = splitter.split_documents(loaded_pdf_content)

    # Return the list of text chunks
    return chunks

In [7]:
doc_chunks = text_chunking(loaded_pdf_content)
print("Total chunks =", len(doc_chunks))

Total chunks = 1335


In [7]:
def download_embeddings_model() -> HuggingFaceEmbeddings:
    """
    Download and initialize the embeddings model.

    Returns:
        HuggingFaceEmbeddings: An instance of the embeddings model.
    """
    # Initialize the HuggingFaceEmbeddings instance with the specified model name
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    # Return the initialized embeddings model
    return embeddings

In [8]:
embeddings = download_embeddings_model()

  return self.fget.__get__(instance, owner)()


In [10]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [None]:
# Load environment variables from a .env file
load_dotenv()

# Retrieve Pinecone API key and environment from environment variables
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

In [13]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="chatbot"

docsearch = Pinecone.from_texts([t.page_content for t in doc_chunks], embeddings, index_name=index_name)

In [14]:
docsearch = Pinecone.from_existing_index(index_name, embeddings)
query = "What is SMA"
docs = docsearch.similarity_search(query, k=3)
print(docs)

[Document(page_content='SMA is classified into three main types—based', metadata={}), Document(page_content='prevent the development or slow the progression of some features of SMA; efficacy  is improved when treatment', metadata={}), Document(page_content='Spinal muscular atrophy (SMA) is a genetic \nneuromuscular disorder affecting approximately \n1 in 10,000  live births. It is estimated to affect \nroughly 10,000 children and adults  in the United', metadata={})]


In [15]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [16]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])

chain_type_kwargs={"prompt": PROMPT}

In [17]:
llm=CTransformers(model="Model\llama-2-7b-chat.ggmlv3.q8_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [18]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 3}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [19]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])


Response :  SMA stands for Spinal Muscular Atrophy.
Response :  SMA, or Spinal Muscular Atrophy, is a genetic disorder that affects the nerve cells responsible for controlling voluntary muscle movement, particularly those in the limbs, face, and throat. It results in progressive muscle weakness and atrophy, leading to difficulty breathing, swallowing, and mobility issues. There are three types of SMA, ranging from mild to severe, and it is usually diagnosed in infancy or childhood.
Response :  There are three main types of spinal muscular atrophy (SMA): SMA type 1, SMA type 2, and SMA type 3. The characteristics of each type are as follows:
* SMA type 1: This is the most severe form of the disease, with symptoms present from birth or before. Motor function deteriorates rapidly, leading to significant disability and respiratory problems.
* SMA type 2: This type has a later onset than SMA type 1, typically appearing between 6 months and 2 years of age. Symptoms progress more slowly than 