In [34]:
# Medical Encyclopedia: PDF ingestion, embedding, and Pinecone retrieval
# !pip install sentence-transformers==2.2.2 langchain flask pypdf python-dotenv pinecone[grpc] langchain-pinecone langchain_community langchain_openai langchain_experimental setuptools

In [35]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [36]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only files
            matching the glob ``*.pdf`` are selected, and each one is read
            with ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [37]:
# Load every PDF under /content/ into memory as documents.
extracted_data = load_pdf_file("/content/")

In [38]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=40):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of
            ``load_pdf_file``).
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            40, the value that used to be hard-coded here.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [39]:
# Chunk the loaded documents and display how many chunks were produced.
text_chunks = text_split(extracted_data)
len(text_chunks)

40272

In [40]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_hugging_face_embeddings():
    """Build the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
    )

In [41]:
# NOTE(review): unpinned upgrade of sentence-transformers; the commented-out
# install in the first cell pins sentence-transformers==2.2.2 — confirm which
# version is intended and pin it here for reproducibility.
!pip install -U sentence-transformers



In [42]:
# Instantiate the embedding model once; the same object is passed to the
# Pinecone vector-store cells further down.
embeddings = download_hugging_face_embeddings()

In [57]:
# `os` is imported here because this cell uses os.environ and no other visible
# cell imports it; without it a fresh kernel fails with NameError.
import os

from google.colab import userdata
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

# Read the key once from Colab's secret store and export it as an environment
# variable (presumably so langchain_pinecone can pick it up — confirm).
os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

# Pinecone client used by the index-management cells below.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Name of the Pinecone index that stores the encyclopedia chunks.
index_name = "medical-encyclopedia"

In [58]:
# Show the names of the indexes visible to this Pinecone client.
pc.list_indexes().names()

['medical-encyclopedia']

In [59]:
# Create the serverless index only when no index with this name exists yet,
# so re-running the cell does not attempt a second creation.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # presumably the vector size produced by the all-MiniLM-L6-v2
        # embedding model used above — confirm before changing models
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [60]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
#
# Guarded so the cell is safe to re-run: unconditionally calling
# from_documents again would embed and upsert every chunk a second time
# (the stored documents carry UUID-style ids, so a repeat upload would add
# duplicates rather than overwrite). The upload therefore only happens when
# the index reports no vectors; otherwise we attach to what is already there.
# NOTE(review): a partially completed upload also counts as "non-empty" —
# clear the index manually if a previous run was interrupted.
from langchain_pinecone import PineconeVectorStore

if pc.Index(index_name).describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [61]:
# Attach to the existing Pinecone index named `index_name` instead of
# uploading documents again. This rebinds `docsearch`; `embeddings` is the
# embedding model object created earlier in the notebook.
from langchain_pinecone import PineconeVectorStore
from pydantic import BaseModel  # NOTE(review): not used in the visible cells — confirm it is needed later

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [62]:
# Display the vector-store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x782cb0a2ce90>

In [63]:
# Wrap the vector store as a similarity-search retriever; "k": 3 presumably
# limits each query to the three closest chunks — confirm against the
# langchain retriever docs.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [66]:
# Example query: run the retriever on a sample question and display the
# documents it returns.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='f5c1f038-d2ad-46df-9e38-63d374abd207', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': '/content/gale_encyclopedia.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='2be46934-aa4d-4062-a49b-881106cf84c4', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': '/content/gale_encyclopedia.pdf', 'total_pages': 4505.0}, page_content='tion and dries up acne pimples.\nSebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoi