In [24]:
from langchain_community.llms import Ollama
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone as p1, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader
import os

In [25]:
llm = Ollama(model="llama3")

In [26]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    
)

In [27]:
# Load PDF from the data folder
pdf_path = r'C:\Users\musta\OneDrive\Desktop\Medical Chatbot LLM\Model\data\med.pdf'
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"File path {pdf_path} is not a valid file or url")

loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [28]:
import os
os.environ['PINECONE_API_KEY'] = '39c3b55b-2ae4-44ee-a9cd-83a99876c828'
pc = p1(
    api_key=os.environ.get("PINECONE_API_KEY")
)

In [29]:
embeding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



In [30]:
all_chunks = []
for document in documents:
    chunks = text_splitter.split_text(document.page_content)
    all_chunks.extend(chunks)

In [31]:
index_name="test1"
index = pc.Index("test1")  
for i, t in zip(range(len(all_chunks)), all_chunks):
   query_result = embeding.embed_query(t)
   print(i,t)
   index.upsert(
   vectors=[
        {
            "id": str(i),  # Convert i to a string
            "values": query_result, 
            "metadata": {"text":str(t)} # meta data as dic
        }
    ],
    namespace="real" 
)


0 TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
1 TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
2 DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
A-B1
3 STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
4 Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
5 Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
6 Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
7 Barbara J. Yarrow, Manager, Imaging and Multimedia
Content
8 Content
Robyn V . Young, Project Manager, Imaging and
Multimedia Content
9 Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multimedia Content
10 Multimedia Content
Kelly A. Quin, Editor, Imaging and Multimedia Content
11 Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
12 Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
13 Robert Duncan, Senior 

KeyboardInterrupt: 

In [32]:
embeding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [33]:
len(documents)

637

In [34]:
from langchain_pinecone import PineconeVectorStore

index_name = "test1"

vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeding,namespace="real")

In [35]:
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [37]:
retrieval_qa.run("what is Behcet’s syndrome?")

"Based on the provided context, a helpful answer would be:\n\nBehcet's syndrome is a chronic disease that involves multiple symptoms and affects various parts of the body."