In [None]:
# Prints a fixed string; presumably a quick check that the kernel executes cells.
print("Ok")

In [None]:
# Report the version of the interpreter that is actually running this kernel.
# (`!python -V` asks whichever `python` is first on PATH, which can be a
# different installation from the one the notebook is using.)
import sys

print("Python", sys.version.split()[0])

In [None]:
# List the installed packages whose names contain "langchain".
# NOTE(review): `!pip` runs whichever pip is first on PATH; `%pip` would target
# the kernel's own environment - confirm which one is intended.
!pip freeze | grep langchain 

### Import required libraries, modules and packages

In [None]:
# Show the current working directory before it is changed below.
%pwd

In [None]:
import os

In [None]:
# Move up one level so the relative paths used later ("data/", "model/...")
# resolve from the working directory.
# Guarded so that re-running this cell does not keep climbing the directory
# tree: once a "data" directory is visible we are assumed to be in the right
# place already (assumes the notebook's own folder has no "data" subfolder).
if not os.path.isdir("data"):
    os.chdir("..")

In [None]:
# Show the working directory again to confirm the change made by os.chdir above.
%pwd

In [None]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv


#### Setting up Pinecone

In [None]:
# Load the .env file
load_dotenv()

# Retrieve the API key from the environment variable and fail fast with a
# clear message when it is missing, instead of passing None to the Pinecone
# client further down.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

#### Extracting the text from the pdf file

In [None]:
# Extracting the text from the pdf file
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and read with
            ``PyPDFLoader``.

    Returns:
        The value returned by ``DirectoryLoader.load()`` for that directory.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

# Load every PDF under the relative "data/" directory (resolved against the
# current working directory).
data_extract = load_pdf("data/")

#data_extract

#### Splitting the text into chunks

In [None]:
# Splitting the text into chunks
def split_text(data_extract, chunk_size=500, chunk_overlap=50):
    """Split documents into overlapping chunks.

    Args:
        data_extract: Documents to split, in the form accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            50, the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents`` for ``data_extract``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data_extract)

# Chunk the loaded documents and report how many chunks were produced.
text_chunks = split_text(data_extract)
print(f"The length of the data chunk is {len(text_chunks)}")

#### Embedding the text chunks

In [None]:
# Download the embedding model
def download_HugginFace_embeddings():
    """Create the embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

# Build the embedding model once; later cells reuse this `embeddings` object.
embeddings = download_HugginFace_embeddings()

# Bare last expression so the notebook displays the object's repr.
embeddings

In [None]:
# Testing the embedding model on a single sentence.
# embed_query takes one string and returns one vector. embed_documents expects
# a list of strings, so handing it a bare string makes it treat every
# character as a separate document and the printed length becomes the number
# of characters instead of the vector size.
query_results = embeddings.embed_query("Hello World")
print("length", len(query_results))
#query_results

#### Create a serverless index 

In [None]:
# Initialize the Pinecone client
import time

pc = Pinecone(api_key=api_key)

index_name = "medbot"

# Create the serverless index only when it does not exist yet.
# dimension=384 is presumably the vector size of the embedding model loaded
# above - confirm if the model is changed.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# A newly created index may not accept writes right away; poll until Pinecone
# reports it as ready so the upsert that follows does not race the creation.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

# Embed every text chunk and upsert the resulting vectors into the index,
# keeping them in a dedicated namespace named "wondervector5000".
namespace = "wondervector5000"

docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    documents=text_chunks,
    namespace=namespace,
    index_name=index_name,
)

#### Use Pinecone’s list and query operations to look at one of the records:

In [None]:
# Handle to the index, used for direct list/query calls.
index = pc.Index(index_name)

# Each item yielded by index.list is indexed with [0] (presumably a page of
# record ids). For every such item, query by its first id for the single
# closest match, including vector values and metadata, and print the response.
for ids in index.list(namespace=namespace):
    query = index.query(
        namespace=namespace,
        id=ids[0],
        top_k=1,
        include_metadata=True,
        include_values=True,
    )
    print(query)

In [None]:
# If we already have an index we can load it like this.
# The vectors were upserted into `namespace`, so the vector store has to be
# opened on that same namespace; without it the search runs against the
# default namespace and does not see the uploaded chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
    namespace=namespace,
)

# Example query
query = "What are Allergies?"

# Perform similarity search
docs = docsearch.similarity_search(query, k=3)

# Print the results
print("Result:", docs)

In [None]:
# Prompt text with two placeholders, {context} and {question}. It instructs
# the model to answer from the supplied context and to say it does not know
# rather than invent an answer.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
# Wrap the template text in a PromptTemplate that declares its two
# placeholders, "context" and "question".
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Dictionary keyed by "prompt"; presumably handed to a chain later on -
# confirm against the cell that consumes it.
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
# Load the local model file through CTransformers with model_type "llama".
# The config passes max_new_tokens=512 and temperature=0.8.
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        "max_new_tokens": 512,
        "temperature": 0.8,
    },
)

In [None]:
# Re-attach to the existing index. The chunks were upserted into `namespace`,
# so the store must be opened on that namespace; otherwise retrieval runs
# against the default namespace and does not see them.
docsearch = PineconeVectorStore.from_existing_index(
    index_name,
    embeddings,
    namespace=namespace,
)