In [2]:
import os
import pinecone
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [3]:
load_dotenv()

True

In [4]:
# # Setting an environment variable called PINECONE_API_KEY
# os.environ['PINECONE_API_KEY'] = os.getenv("PINECONE_API_KEY")

In [5]:
# Function for Loading pdf data
def pdf_load(data):
  loader = DirectoryLoader(
              data,
              glob="*.pdf",
              loader_cls=PyPDFLoader)
  
  docs = loader.load()
  return docs

In [None]:
# Load the pdf file
extracted_data = pdf_load("data/")

In [7]:
# extracted_data

In [8]:
# Function for splitting the loaded data
def split_text(extracted_data):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
  text_chunks = text_splitter.split_documents(extracted_data)
  return text_chunks

In [None]:
# Create chunks of pdf file
text_chunks = split_text(extracted_data)

In [10]:
print(f"Number of chunks created is: {len(text_chunks)}")

Number of chunks created is: 1330


In [11]:
# Download embedding model
def download_hf_embeddings():
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  return embeddings

In [None]:
# Create vector embeddings using huggingface "all-MiniLM-L6-v2" model
# It takes 3 mins to execute
embeddings = download_hf_embeddings()

In [13]:
embeddings

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [14]:
# Experimenting with the embeddings we just created
query_result = embeddings.embed_query("Hello World")
print(f"Length of query result is: {len(query_result)}")

Length of query result is: 384


In [15]:
# query_result

In [None]:
# Getting API Key
api_key = os.getenv('PINECONE_API_KEY')

In [17]:
# # Getting API Key
# api_key = os.environ.get('PINECONE_API_KEY')

In [18]:
# Initializing the pinecone
pc = pinecone.Pinecone(api_key=api_key)

index_name = "genai-chatbot"
chunks_embeddings = PineconeVectorStore.from_texts([chunk.page_content for chunk in text_chunks], embeddings, index_name=index_name)

In [19]:
query = "What is a transformer?"

In [20]:
result = chunks_embeddings.similarity_search(query, k=3)

In [21]:
print("result\n", result)

result
 [Document(id='9f1acd25-b183-4ab0-befc-b9ab275ff4b1', metadata={}, page_content='The decoding part of the transformer starts with a similar process as the encoding part, where the \ntarget sequence (output sequence) undergoes input embedding and positional encoding. Let’s under-\nstand these blocks:\n• Output embedding (shifted right): For the decoder, the target sequence is “shifted right” by \none position. This means that at each position, the model tries to predict the token that comes'), Document(id='6299f47f-4374-4498-bfe4-2da8934bc822', metadata={}, page_content='The decoding part of the transformer starts with a similar process as the encoding part, where the \ntarget sequence (output sequence) undergoes input embedding and positional encoding. Let’s under-\nstand these blocks:\n• Output embedding (shifted right): For the decoder, the target sequence is “shifted right” by \none position. This means that at each position, the model tries to predict the token that comes'),

In [23]:
for i in range(len(result)):
  print(result[i].page_content)

The decoding part of the transformer starts with a similar process as the encoding part, where the 
target sequence (output sequence) undergoes input embedding and positional encoding. Let’s under-
stand these blocks:
• Output embedding (shifted right): For the decoder, the target sequence is “shifted right” by 
one position. This means that at each position, the model tries to predict the token that comes
The decoding part of the transformer starts with a similar process as the encoding part, where the 
target sequence (output sequence) undergoes input embedding and positional encoding. Let’s under-
stand these blocks:
• Output embedding (shifted right): For the decoder, the target sequence is “shifted right” by 
one position. This means that at each position, the model tries to predict the token that comes
The decoding part of the transformer starts with a similar process as the encoding part, where the 
target sequence (output sequence) undergoes input embedding and positional encod

In [24]:
prompt_template = """
Use the following pieces of information to answer the user's questions.
If you don't know the answer, just say that you don't know the answer, don't try to make up an answer.

Context: {context}
Question: {question}

Only return helpful answer and nothing else.
Helpful answer:
"""

In [None]:
# Initialize prompt template
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

In [21]:
llm = CTransformers(
  model="model\llama-2-7b-chat.ggmlv3.q4_0.bin",
  model_type="llama",
  config={"max_new_tokens": 700, "temperature": 0.7}
)

In [22]:
retriever=chunks_embeddings.as_retriever(search_kwargs={"k":2})

In [None]:
retrievalqa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=chunks_embeddings.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

In [24]:
retrievalqa

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=CTransformers(client=<ctransformers.llm.LLM object at 0x00000210CA1AF7D0>, model='model\\llama-2-7b-chat.ggmlv3.q4_0.bin', model_type='llama', config={'max_new_tokens': 700, 'temperature': 0.7}), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='Context:\n{page_content}'), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vector

In [None]:
query = "What is a transformer?"

In [None]:
# It takes approx. 27 mins to execute in my device, however in colab it takes 7 mins
result1 = retrievalqa.invoke({"query":query})

In [26]:
result1

{'query': 'What is a transformer?',
 'result': ' A transformer is a type of neural network architecture introduced in the paper "Attention Is All You Need" by Vaswani et al. in 2017. It\'s primarily designed for sequence-to-sequence tasks, such as machine translation, text summarization, and language modeling. The transformer consists of an encoder and a decoder, each composed of multiple identical layers. Each layer in the encoder and decoder consists of self-attention mechanisms, feedforward networks, and layer normalization. The self-attention mechanism allows the model to attend to different parts of the input sequence simultaneously and weigh their importance, while the feedforward network processes the output of the self-attention mechanism to generate the final output.'}

In [27]:
result1["result"]

' A transformer is a type of neural network architecture introduced in the paper "Attention Is All You Need" by Vaswani et al. in 2017. It\'s primarily designed for sequence-to-sequence tasks, such as machine translation, text summarization, and language modeling. The transformer consists of an encoder and a decoder, each composed of multiple identical layers. Each layer in the encoder and decoder consists of self-attention mechanisms, feedforward networks, and layer normalization. The self-attention mechanism allows the model to attend to different parts of the input sequence simultaneously and weigh their importance, while the feedforward network processes the output of the self-attention mechanism to generate the final output.'

In [28]:
print(result1["result"])

 A transformer is a type of neural network architecture introduced in the paper "Attention Is All You Need" by Vaswani et al. in 2017. It's primarily designed for sequence-to-sequence tasks, such as machine translation, text summarization, and language modeling. The transformer consists of an encoder and a decoder, each composed of multiple identical layers. Each layer in the encoder and decoder consists of self-attention mechanisms, feedforward networks, and layer normalization. The self-attention mechanism allows the model to attend to different parts of the input sequence simultaneously and weigh their importance, while the feedforward network processes the output of the self-attention mechanism to generate the final output.
