# Install necessary packages

In [None]:
! pip install langchain
! pip install pypdf

In [None]:
! pip install sentence-transformers==2.2.2

In [None]:
! pip install langchain-openai

In [None]:
! pip install langchain-pinecone

In [None]:
! pip install -U langchain-community

# Load PDF documents from a directory

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader

In [None]:
# Build a loader over the local "pdf" directory and read its PDF files.
loader = PyPDFDirectoryLoader("pdf")
# `data` is the list of loaded documents; it is split into chunks in a later cell.
data = loader.load()

# Split the documents into chunks for embedding

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Splitter configured for chunks of size 500 with an overlap of 20 between
# neighbouring chunks (presumably measured in characters, the splitter's
# default length function — confirm against the LangChain docs).
text_split = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
# Split the loaded documents; `chunks` is what gets embedded and indexed later.
chunks = text_split.split_documents(data)

In [None]:
# Echo the chunk list to inspect the split (the recorded output below is truncated).
chunks

[Document(page_content='LLMs', metadata={'source': 'pdf/LLMs.pdf', 'page': 0}),
 Document(page_content='What is LLMs? \nA large Language model is a trained deep learning model that understands \nand generate text in a human like fashion. \nLLMs are good at Understanding and generating human language', metadata={'source': 'pdf/LLMs.pdf', 'page': 1}),
 Document(page_content='Why we call it Large Language Model? \nBecause of the size and complexity of the Neural Network as well as the size \nof the dataset that it was trained on. \nResearchers started to make these models large and trained on huge datasets \nThat they started showing impressive results like understanding complex \nNatural Language and generating language more eloquently than ever.', metadata={'source': 'pdf/LLMs.pdf', 'page': 2}),
 Document(page_content='What makes LLM so Powerful? \n●In case of LLM, one model can be used for a whole variety of tasks like:- \nText generation, Chatbot, summarizer, translation, code generat

# Display the number of chunks and the first chunk

In [None]:
# Number of chunks produced by the splitter (17 in the recorded run).
len(chunks)

17

In [None]:
# Inspect a single chunk: each one carries `page_content` plus `source`/`page` metadata.
chunks[0]

Document(page_content='LLMs', metadata={'source': 'pdf/LLMs.pdf', 'page': 0})

# Embedding setup using HuggingFace

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Sentence-transformers model used to embed both the chunks and the queries.
# NOTE(review): the recorded output shows a `warn_deprecated(` message, presumably
# for the `langchain.embeddings` import path — confirm and consider migrating.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  warn_deprecated(


In [None]:
# Sanity check: embed a sample sentence and display the resulting list of floats.
embedding.embed_query("My name is saurav")

[-0.08500086516141891,
 -0.038315851241350174,
 -0.0669478252530098,
 0.05940002202987671,
 -0.020271778106689453,
 -0.01642041653394699,
 0.11205471307039261,
 0.005068871192634106,
 0.0358060747385025,
 -0.007086039986461401,
 -0.05973895639181137,
 -0.1300876885652542,
 0.09283287078142166,
 -0.02410919964313507,
 -0.024292904883623123,
 -0.07286591082811356,
 0.015663256868720055,
 0.02348797209560871,
 -0.034784287214279175,
 -0.05830112099647522,
 -0.026543153449892998,
 0.014836865477263927,
 -0.02004057914018631,
 -0.025269029662013054,
 -0.005322557408362627,
 -0.0052007646299898624,
 -0.03501150757074356,
 0.02824602648615837,
 -0.03221040591597557,
 -0.0296953022480011,
 0.06138113513588905,
 -0.01424354873597622,
 0.058718711137771606,
 0.05265409126877785,
 0.022563448175787926,
 0.0166702251881361,
 -0.14070264995098114,
 0.06423964351415634,
 -0.01154333632439375,
 0.04950723797082901,
 -0.02523973025381565,
 -0.07384567707777023,
 -0.00897225271910429,
 0.02771347947418

In [None]:
# Length of one embedding vector (384 in the recorded run). The Pinecone index
# created later in this notebook uses the same value for its `dimension`.
len(embedding.embed_query("My name is saurav"))

384

# Pinecone setup

In [None]:
from google.colab import userdata

# Read the Pinecone key from Colab's user secrets so it is not hard-coded in the notebook.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [None]:
import os

# Set the Pinecone API key in the environment variables.
# NOTE(review): presumably needed because the LangChain vector-store call later in
# this notebook is given no API key explicitly and reads it from the environment — confirm.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# Initialize Pinecone client and create an index if it doesn't exist

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain.vectorstores import Pinecone as PC
from pinecone import ServerlessSpec

# Pinecone client authenticated with the key loaded from the Colab secrets.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it is missing, so re-running the cell reuses it.
existing_index_names = pc.list_indexes().names()
if 'pinecone' not in existing_index_names:
    # Serverless index hosted on AWS us-east-1.
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # dimension=384 matches the embedding length measured earlier in the notebook.
    pc.create_index(
        name='pinecone',
        dimension=384,
        metric='cosine',
        spec=serverless_spec,
    )


In [None]:
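# Connect to the existing Pinecone index instead of uploading the chunks again.
# Uncomment to reuse an index that was already populated in an earlier run:
# doc_search = PC.from_existing_index("pinecone", embedding)
# doc_search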

In [None]:
# Prepare document chunks for Pinecone
doc_chunk = [t.page_content for t in chunks]

# Create a Pinecone vector store from the document chunks
doc_search = PC.from_texts(doc_chunk,embedding,index_name="pinecone")

# Query the Pinecone vector store

In [None]:
# Question used to probe the vector store.
query = "What is LLM?"

# Retrieve the 3 stored chunks most similar to the query.
docs = doc_search.similarity_search(query,k=3)

In [None]:
# Display the retrieved documents (the notebook echoes the cell's last expression).
docs

[Document(page_content='LLMs'),
 Document(page_content='What is LLMs? \nA large Language model is a trained deep learning model that understands \nand generate text in a human like fashion. \nLLMs are good at Understanding and generating human language'),
 Document(page_content='What makes LLM so Powerful? \n●In case of LLM, one model can be used for a whole variety of tasks like:- \nText generation, Chatbot, summarizer, translation, code generation \n& so on … \nSo, LLM is subset of Deep Learning & it has some properties merge with \nGenerative AI')]

# OpenAI setup

In [None]:
from langchain_openai import OpenAI

In [None]:
# Read the OpenAI key from Colab's user secrets, as was done for the Pinecone key.
OPEN_AI_KEY = userdata.get('OPEN_AI_KEY')

In [None]:
# OpenAI LLM wrapper; only the API key is supplied, so all other settings
# (including the model) are the library defaults.
llm = OpenAI(api_key=OPEN_AI_KEY)

# Set up a retrieval-based QA system

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Expose the Pinecone vector store as a retriever with its default search settings.
retriever = doc_search.as_retriever()

# "stuff" chain type: the retrieved chunks are placed into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

# Perform a QA query

In [None]:
# Question to answer from the indexed PDF content.
query = "What is LLM"

# Run retrieval + generation; the chain returns a dict with 'query' and 'result' keys.
answer = qa.invoke(query)
print(answer)

{'query': 'What is LLM', 'result': ' LLM stands for Large Language Model, which is a type of deep learning model that is trained to understand and generate human-like language. It can be used for various tasks such as text generation, chatbots, summarization, translation, code generation, and more. '}
