In [1]:
print("Ok")

Ok


In [2]:
!python -V

Python 3.10.14


In [3]:
!pip freeze | grep langchain 

langchain==0.0.225
langchain-core==0.2.8
langchain-pinecone==0.1.1
langchainplus-sdk==0.0.20


### Import required libraries, modules and packages

In [4]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os


#### Setting up Pinecone

In [5]:
# Load the .env file
load_dotenv()

# Retrieve the API key from the environment variable
api_key = os.getenv("PINECONE_API_KEY")

#### Extracting the text from the pdf file

In [6]:
# Extracting the text from the pdf file
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()
    
    return documents

data_extract = load_pdf("data/")

#data_extract

#### Splitting the text into chunks

In [7]:
# Splitting the text into chunks
def split_text(data_extract):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(data_extract)
    
    return text_chunks

text_chunks = split_text(data_extract)
print(f"The length of the data chunk is {len(text_chunks)}")

The length of the data chunk is 7093


#### Embedding the text chunks

In [8]:
# Download the embedding model
def download_HugginFace_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    return embeddings

embeddings = download_HugginFace_embeddings()

embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [9]:
# Testing the embedding model
query_results = embeddings.embed_documents("Hello World")
print("length", len(query_results))
#query_results

length 11


#### Create a serverless index 

In [10]:
# Initialize the Pinecone client
pc = Pinecone(api_key=api_key)

index_name = "medbot"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384, 
        metric="cosine", 
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    ) 

# Create embeddings for each of the text chunks and upload to Pinecone
#Embed each chunk and upsert the embeddings into a distinct namespace called wondervector5000
namespace = "wondervector5000"

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
    namespace=namespace
)

#### Use Pinecone’s list and query operations to look at one of the records:

In [None]:
index = pc.Index(index_name)

for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0], 
        namespace=namespace, 
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)