## Prerequisites

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the dataset folder to confirm that the Drive mount exposes the JSONL splits.
!ls "/content/drive/My Drive/0.PhD/Jarvis-QA"
import sys
# Add a Drive folder to the module search path.
# NOTE(review): this path ends at '/0' while the dataset lives under '0.PhD' — confirm it is the intended directory.
sys.path.append('/content/drive/My Drive/0')


test.jsonl  train.jsonl  validation.jsonl


In [None]:
# System packages installed through apt (Tesseract OCR and poppler utilities).
! sudo apt -y -qq install tesseract-ocr libtesseract-dev
! sudo apt-get -y -qq install poppler-utils
# Python dependencies; most are pinned to exact versions.
# NOTE(review): pdfminer.six and jq are not pinned, unlike the other packages — consider pinning them too.
! pip install -U unstructured==0.7.5 pdf2image==1.16.3 pytesseract==0.3.10 pdfminer.six
! pip install -U tensorflow_hub==0.13.0 tensorflow_text==2.12.1
# jq is used by the JSONLoader cell further down, which passes a jq_schema.
! pip install jq
! pip install -U google-cloud-aiplatform==1.35.0 langchain==0.0.312
! pip install chromadb==0.4.15

In [None]:
 # Automatically restart kernel after installs so that your environment can access the new packages
 # do_shutdown(True) asks for a restart; the recorded output shows 'restart': True.
 import IPython

 app = IPython.Application.instance()
 app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
# Colab authentication: authenticate the current Colab user before the Google Cloud cells below run.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Vertex AI SDK and LangChain: import both, then report the installed versions.
import langchain
import vertexai
from google.cloud import aiplatform

print(f"Vertex AI SDK version: {aiplatform.__version__}")
print(f"LangChain version: {langchain.__version__}")

Vertex AI SDK version: 1.35.0
LangChain version: 0.0.312


### GCP Configurations

In [None]:
# Initialize Google Cloud Project
# The trailing "# @param" comments are kept verbatim; they appear to be Colab form-field annotations.
PROJECT_ID = "alkali-gworks"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
# Relies on the `vertexai` module imported in the earlier version-report cell.
vertexai.init(project=PROJECT_ID, location=REGION)

In [None]:
#Library Imports
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


### Initialise Vertex AI Models

In [None]:
# Text Bison Model from Vertex AI based on PaLM v2
# This LLM object is passed to every load_qa_chain call and to the first LCEL chain below.
vertex_llm_text = VertexAI(model_name="text-bison@001")

# Text Embedding Gecko Model from Vertex AI based on PaLM v2
# This embedding object is passed to the Chroma vector stores below.
vertex_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001")


## 1. Data Preprocessing/Loading

In [None]:
# The dataset was uploaded to Google Drive; preview the training split from there.
dataset_path = '/content/drive/MyDrive/0.PhD/Jarvis-QA/train.jsonl'
from itertools import islice
from pathlib import Path

# Show only the first few JSON Lines records. Printing the entire file would
# flood the cell output and bloat the saved notebook.
PREVIEW_LINES = 3
with Path(dataset_path).open(encoding="utf-8") as dataset_file:
    for record_line in islice(dataset_file, PREVIEW_LINES):
        pprint(record_line)



### 1.1 JSON Loader (Langchain)

In [None]:
# Load the validation split (JSON Lines format) with LangChain's JSONLoader.
from langchain.document_loaders import JSONLoader

dataset_path = '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl'

# The jq expression selects the text of every sentence of every answer; the
# recorded output shows one Document per sentence.
loader = JSONLoader(
    json_lines=True,
    text_content=False,
    jq_schema='.answers[].sents[].text',
    file_path=dataset_path,
)
data = loader.load()

# Preview the first five documents.
data[:5]

[Document(page_content="Yes it's fine", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 1}),
 Document(page_content="It's fairly normal to ask someone to proof", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 2}),
 Document(page_content="-read your resume or to help with layout, so there's nothing wrong there.", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 3}),
 Document(page_content="There's so many resumes sent out with elementary mistakes, so another set of eyes really helps in getting things right.", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 4}),
 Document(page_content="If the recruitment is happening via an agency, the hiring company won't see your resume, they'll just see a copy/paste of the relevant parts (recruiters won't want the client seeing your contact details).", metadata={'source': '/content/driv

## 2. Retrieval Augmented Generation

Information Retrieval combined with Generative LLMs

### 2.1. Stuffing

Adding the entire document into the Prompt as Context



In [None]:
# Restrict the data to the first 10 rows to stay within the API limit (1024 tokens).
context = "\n".join(str(doc.page_content) for doc in data[:10])
# Count whitespace-separated words: len(context) would count characters,
# which does not match the "total words" label.
print("The total words in the context: ", len(context.split()))
question = "Is it acceptable to format a cv for someone else?"


The total words in the context:  729


In [None]:
# Prompt for the "stuff" chain: all documents are inserted as {context}, followed by the {question}.
prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
                    not contained in the context, say "answer not available in context" \n\n
                    Context: \n {context}?\n
                    Question: \n {question} \n
                    Answer:
                  """

# Wrap the raw template so the chain can fill in both placeholders.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [None]:
# Unrestricted data: shows what happens when every document is stuffed into a single prompt.
context = "\n".join(str(doc.page_content) for doc in data)
# Count whitespace-separated words: len(context) would count characters,
# which does not match the "total words" label.
print("The total words in the context: ", len(context.split()))

# QA Chain
stuff_chain = load_qa_chain(vertex_llm_text, chain_type="stuff", prompt=prompt)
try:
    stuff_answer = stuff_chain(
        {"input_documents": data, "question": question}, return_only_outputs=True
    )
except Exception as err:
    # The failure is the point of this cell (the recorded run raised
    # InvalidArgument). Display it instead of aborting "Run All".
    print(f"{type(err).__name__}: {err}")
else:
    pprint(stuff_answer)


The total words in the context:  1905079


InvalidArgument: ignored

In [None]:
# Restrict the input to the first 10 documents so the stuffed prompt stays within the API limit (1024 tokens).
stuff_chain = load_qa_chain(vertex_llm_text, chain_type="stuff", prompt=prompt)
stuff_answer = stuff_chain(
    {"input_documents": data[:10], "question": question}, return_only_outputs=True
)
# return_only_outputs=True was requested above; the recorded output holds only 'output_text'.
pprint(stuff_answer)


{'output_text': 'Yes, of course.'}


### 2.2. Map Reduce
Applies an LLM chain to each document individually (the Map step), treating the chain output as a new document.

Passes all the new documents to a separate combine documents chain to get a single output (the Reduce step)

In [None]:
# Restrict the data to the first 40 rows to stay within the API limit (1024 tokens).
context = "\n".join(str(doc.page_content) for doc in data[:40])
# Count whitespace-separated words: len(context) would count characters,
# which does not match the "total words" label.
print("The total words in the context: ", len(context.split()))

# Longer alternative phrasing of the question, kept for reference:
#question = "What potential issues are highlighted by the user 'Old_Lamplighter' when it comes to proofreading and formatting a CV for someone else?"
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"


The total words in the context:  3250


In [None]:
# Prompt for the map step: passed to load_qa_chain as question_prompt.
question_prompt_template = """Answer the question as precise as possible using the provided context. If the answer is not contained in the context, say "answer not available in context"
                    Context: \n {context} \n
                    Question: \n {question} \n
                    Answer:
                    """

question_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=question_prompt_template,
)

# Show the resulting template object for inspection.
pprint(question_prompt)


PromptTemplate(input_variables=['context', 'question'], template='Answer the question as precise as possible using the provided context. If the answer is not contained in the context, say "answer not available in context" \n                    Context: \n {context} \n\n                    Question: \n {question} \n\n                    Answer:\n                    ')


In [None]:
# Prompt for the reduce step: merges the intermediate answers ({summaries}) into one final answer.
# NOTE(review): the quoted fallback phrase below has no closing quote, unlike the question prompt — confirm whether that is intended.
combine_prompt_template = """Given the extracted content and the question, create a final answer.
If the answer is not contained in the context, say "answer not available in context. \n\n
Summaries: \n {summaries}?\n
Question: \n {question} \n
Answer:
"""

combine_prompt = PromptTemplate(
    input_variables=["summaries", "question"],
    template=combine_prompt_template,
)

In [None]:
# Build the map-reduce QA chain from the question (map) and combine (reduce) prompts.
# Intermediate steps are requested so the per-document answers can be inspected.
map_reduce_chain = load_qa_chain(
    vertex_llm_text,
    chain_type="map_reduce",
    question_prompt=question_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=True,
)

In [None]:
# Unrestricted data with map-reduce: the whole `data` list is passed as input documents.
# NOTE: the recorded run of this cell ended with KeyboardInterrupt (see the cell output).
map_reduce_outputs = map_reduce_chain({"input_documents": data, "question": question})
map_reduce_outputs




KeyboardInterrupt: ignored

In [None]:
# Restrict the input to the first 40 documents to stay within the API limit.
map_reduce_outputs = map_reduce_chain({"input_documents": data[:40], "question": question})
# Display the full result; the recorded output starts with the 'input_documents' entry.
map_reduce_outputs


{'input_documents': [Document(page_content="Yes it's fine", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 1}),
  Document(page_content="It's fairly normal to ask someone to proof", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 2}),
  Document(page_content="-read your resume or to help with layout, so there's nothing wrong there.", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 3}),
  Document(page_content="There's so many resumes sent out with elementary mistakes, so another set of eyes really helps in getting things right.", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 4}),
  Document(page_content="If the recruitment is happening via an agency, the hiring company won't see your resume, they'll just see a copy/paste of the relevant parts (recruiters won't want the client seeing your contact details).", metadata={

In [None]:
# Show only the final combined answer of the map-reduce run.
map_reduce_outputs['output_text']

'a resume may suggest that the candidate is a rocket scientist, while actually he worked at Goddard Space Flight Center as a janitor.'

In [None]:
#########

### 2.3. Refine

The Refine chain loops over the input documents and iteratively updates its answer.

For each document, it passes all non-document inputs, the current document, and the latest intermediate answer to an LLM chain to get a new answer.



In [None]:
# Restrict the data to the first 50 rows to stay within the API limit.
context = "\n".join(str(doc.page_content) for doc in data[:50])
# Count whitespace-separated words: len(context) would count characters,
# which does not match the "total words" label.
print("The total words in the context: ", len(context.split()))

question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"


The total words in the context:  4186


In [None]:
# Prompt for each refinement step: takes the question, the answer so far, and the next document's text.
# NOTE(review): the quoted fallback phrase below has no closing quote — confirm whether that is intended.
refine_prompt_template = """
    The original question is: \n {question} \n
    The provided answer is: \n {existing_answer}\n
    Refine the existing answer if needed with the following context: \n {context_str} \n
    Given the extracted content and the question, create a final answer.
    If the answer is not contained in the context, say "answer not available in context. \n\n
"""

refine_prompt = PromptTemplate(
    template=refine_prompt_template,
    input_variables=["question", "existing_answer", "context_str"],
)

# Display the template object for inspection.
refine_prompt

PromptTemplate(input_variables=['question', 'existing_answer', 'context_str'], template='\n    The original question is: \n {question} \n\n    The provided answer is: \n {existing_answer}\n\n    Refine the existing answer if needed with the following context: \n {context_str} \n\n    Given the extracted content and the question, create a final answer.\n    If the answer is not contained in the context, say "answer not available in context. \n\n\n')

In [None]:
# Prompt for the first refine pass (passed to load_qa_chain as question_prompt): no existing answer yet.
# NOTE(review): the quoted fallback phrase below has no closing quote — confirm whether that is intended.
initial_question_prompt_template = """
    Answer the question as precise as possible using the provided context only.
    If the answer is not contained in the context, say "answer not available in context. \n\n
    Context: \n {context_str} \n
    Question: \n {question} \n
    Answer:
"""

initial_question_prompt = PromptTemplate(
    template=initial_question_prompt_template,
    input_variables=["context_str", "question"],
)

# Display the raw template string (not the PromptTemplate object).
initial_question_prompt_template

'\n    Answer the question as precise as possible using the provided context only. \n    If the answer is not contained in the context, say "answer not available in context. \n\n\n    Context: \n {context_str} \n\n    Question: \n {question} \n\n    Answer:\n'

In [None]:
# Build the refine QA chain: the initial prompt produces the first answer and
# the refine prompt updates it. Intermediate steps are requested for inspection.
refine_chain = load_qa_chain(
    vertex_llm_text,
    chain_type="refine",
    question_prompt=initial_question_prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
)


In [None]:
# Run the refine chain over the first 50 documents.
refine_outputs = refine_chain({"input_documents": data[:50], "question": question})
# Display the full result; the recorded output starts with the 'input_documents' entry.
refine_outputs

{'input_documents': [Document(page_content="Yes it's fine", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 1}),
  Document(page_content="It's fairly normal to ask someone to proof", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 2}),
  Document(page_content="-read your resume or to help with layout, so there's nothing wrong there.", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 3}),
  Document(page_content="There's so many resumes sent out with elementary mistakes, so another set of eyes really helps in getting things right.", metadata={'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'seq_num': 4}),
  Document(page_content="If the recruitment is happening via an agency, the hiring company won't see your resume, they'll just see a copy/paste of the relevant parts (recruiters won't want the client seeing your contact details).", metadata={

In [None]:
# Show only the final refined answer.
refine_outputs['output_text']

'answer not available in context.'

In [None]:
######

### 2.4. Vector Stores

Embed the documents and store the resulting embedding vectors in a vector database.

At query time, embed the unstructured query and retrieve the stored vectors that are 'most similar' to the embedded query.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split documents into chunks (chunk_size=200, chunk_overlap=50); add_start_index=True
# is why 'start_index' appears in the metadata of documents in the recorded outputs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=50, add_start_index=True
)
# Only the first 1000 loaded documents are split and embedded.
documents = text_splitter.split_documents(data[:1000])
# Embed the chunks into a Chroma store (no persist_directory given here) and expose it as a retriever.
# NOTE(review): the recorded retrieval output contains repeated copies of the same chunk, which suggests
# this cell was run more than once against the same collection — confirm, and restart the kernel before re-running.
vector_index = Chroma.from_documents(documents, vertex_embeddings).as_retriever()


In [None]:
# Longer alternative phrasing of the question, kept for reference:
#question = "What potential issues are highlighted by the user 'Old_Lamplighter' when it comes to proofreading and formatting a CV for someone else?"
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"

# Retrieve the chunks most similar to the question from the vector index.
docs = vector_index.get_relevant_documents(question)
context = "\n".join(str(doc.page_content) for doc in docs)
# Count whitespace-separated words: len(context) would count characters,
# which does not match the "total words" label.
print("The total words in the context: ", len(context.split()))
print(docs)


The total words in the context:  771
[Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl'}), Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl'}), Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'start_index': 0}), Do

In [None]:
# Map-step prompt for the retrieved chunks. Unlike the earlier question prompt,
# it carries no "answer not available" fallback instruction.
question_prompt_template = """
                    Answer the question as precise as possible using the provided context. \n\n
                    Context: \n {context} \n
                    Question: \n {question} \n
                    Answer:
                    """

question_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=question_prompt_template,
)

# Reduce-step prompt: merges the intermediate answers ({summaries}) into one final answer.
combine_prompt_template = """Given the extracted content and the question, create a final answer.
If the answer is not contained in the context, say "answer not available in context. \n\n
Summaries: \n {summaries}?\n
Question: \n {question} \n
Answer:
"""

combine_prompt = PromptTemplate(
    input_variables=["summaries", "question"],
    template=combine_prompt_template,
)

In [None]:
# Rebuild the map-reduce chain so it uses the prompts redefined for the vector-store section.
map_reduce_chain = load_qa_chain(
    vertex_llm_text,
    chain_type="map_reduce",
    question_prompt=question_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=True,
)

In [None]:
# Run map-reduce over the retrieved chunks only (`docs`), not over the raw dataset.
map_reduce_embeddings_outputs = map_reduce_chain(
    {"input_documents": docs, "question": question}
)


In [None]:
# Display the full result; the recorded output starts with the 'input_documents' entry.
map_reduce_embeddings_outputs

{'input_documents': [Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl'}),
  Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl'}),
  Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'start_index': 0}),
  Document(page

In [None]:
# Show only the final combined answer.
map_reduce_embeddings_outputs['output_text']

"The reviewer might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed."

#### Chroma with On-Disk Persistence

In [None]:
# save to disk
# Embeds the same `documents` chunks again, this time with persist_directory="./chroma_db2".
db2 = Chroma.from_documents(documents, vertex_embeddings, persist_directory="./chroma_db2")

In [None]:
# Retrieve from the persisted store using the "mmr" search type.
db2_retriever = db2.as_retriever(search_type="mmr")
question = "What potential issues are highlighted by the user 'Old_Lamplighter' when it comes to proofreading and formatting a CV for someone else?"
# Query the retriever created above. The bare name `retriever` is never defined
# in this notebook, so it would fail on a fresh kernel.
db2_retriever.get_relevant_documents(question)[0]


Document(page_content='Is it acceptable to format a cv for someone else?', metadata={'seq_num': 7, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl'})

In [None]:
# Switch back to the shorter question used for the remaining experiments.
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"


In [None]:
# Similarity search that also returns a score for each hit.
# In the recorded output the hits are listed by ascending score.
db2_docs = db2.similarity_search_with_score(question)
db2_docs

[(Document(page_content="So, what can go bad is that the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.", metadata={'seq_num': 39, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'start_index': 0}),
  0.7055971026420593),
 (Document(page_content='There are plenty of people whose examples provide a questionable thought process - "I wanted to use it because it was new even though it might not have been a good fit/', metadata={'seq_num': 154, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'start_index': 0}),
  0.7229040861129761),
 (Document(page_content='Check out their Stack Overflow profile - are they actively asking/answering questions?', metadata={'seq_num': 263, 'source': '/content/drive/MyDrive/0.PhD/Jarvis-QA/validation.jsonl', 'start_index': 0}),
  0.7255362868309021),
 (Document(page_content='It can possibly be a problem, IF

In [None]:
# Re-open the persisted Chroma store from disk. The directory must be the one
# the store was saved to ("./chroma_db2"); this notebook never writes "./chroma_db".
db3 = Chroma(persist_directory="./chroma_db2", embedding_function=vertex_embeddings)
ret_docs = db3.similarity_search(question)
context = "\n".join(str(doc.page_content) for doc in ret_docs)
# Count whitespace-separated words: len(context) would count characters,
# which does not match the "total words" label.
print("The total words in the context: ", len(context.split()))


The total words in the context:  723


In [None]:
# Prompt for answering from the retrieved chunks with a "stuff" chain.
prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
                    not contained in the context, say "answer not available in context" \n\n
                    Context: \n {context}?\n
                    Question: \n {question} \n
                    Answer:
                  """

prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
vector_store_chain = load_qa_chain(vertex_llm_text, chain_type="stuff", prompt=prompt)
# Invoke the chain built on the line above. The original called the older
# `stuff_chain`, leaving `vector_store_chain` unused.
vector_store_answer = vector_store_chain(
    {"input_documents": ret_docs, "question": question}, return_only_outputs=True
)
pprint(vector_store_answer)


{'output_text': ' The reviewer might have his best intentions and embellish '
                "the candidate's profile, but the hiring manager would not be "
                'impressed.'}


## 3. State of the Art - LangChain Expression Language (LCEL)

In [None]:
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

def format_docs(docs):
    """Return the ``page_content`` of every document, joined by one blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents separated by two newlines.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# LCEL pipeline: the input question is passed to the retriever, whose documents are formatted
# into "context", and is also passed through unchanged as "question". The mapping then fills
# `prompt`, the result goes to the Vertex AI LLM, and StrOutputParser is the final step.
# `prompt` is whichever PromptTemplate was assigned last in the cells above.
rag_chain = (
    {"context": vector_index | format_docs, "question": RunnablePassthrough()}
    | prompt
    | vertex_llm_text
    | StrOutputParser()
)

In [None]:
# Stream the answer and print each chunk as it arrives, without adding newlines between chunks.
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)



the reviewer (or should we call him co-author?) might have his best intentions and embellish the candidate's profile, but the hiring manager would not be impressed.

## 4. Exploration of Smallest LLMs

In [None]:
# Client library for the Hugging Face Hub models used in this section.
# NOTE(review): unpinned and installed mid-notebook, unlike the pinned installs in the prerequisites cell.
! pip install huggingface_hub



In [None]:
# get a token: https://huggingface.co/docs/api-inference/quicktour#get-your-api-token

from getpass import getpass

# Prompt for the token interactively so it is never written into the notebook source.
HUGGINGFACEHUB_API_TOKEN = getpass()

··········


In [None]:
import os

# Export the token as an environment variable.
# presumably the HuggingFaceHub wrapper reads this variable — no token is passed explicitly in the cells below
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [None]:
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

### 4.1. Flan-T5

In [None]:
# Flan-T5 XXL through the HuggingFaceHub wrapper; generation settings are passed via model_kwargs.
repo_id = "google/flan-t5-xxl"
flan_llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 1000}
)




In [None]:
# Same pipeline shape as rag_chain, with Flan-T5 as the LLM.
flan_rag_chain = (
    {"context": vector_index | format_docs, "question": RunnablePassthrough()}
    | prompt
    | flan_llm
    | StrOutputParser()
)

In [None]:
# Ask the same question as before; the recorded Flan-T5 answer is the prompt's fallback phrase.
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"

flan_rag_chain.invoke(question)

'answer not available in context'

### 4.2. Mistral 7B

In [None]:
# Mistral 7B Instruct through the HuggingFaceHub wrapper, with the same model_kwargs as the Flan-T5 cell.
# NOTE(review): the recorded answer for this model is cut off mid-sentence; `max_length` may not bound
# the number of generated tokens for this endpoint — check whether `max_new_tokens` is needed.
repo_id = "mistralai/Mistral-7B-Instruct-v0.1"
mistral_llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 1000}
)




In [None]:
# Same pipeline shape as rag_chain, with Mistral 7B as the LLM.
mistral_rag_chain = (
    {"context": vector_index | format_docs, "question": RunnablePassthrough()}
    | prompt
    | mistral_llm
    | StrOutputParser()
)

In [None]:
# Ask the same question as before; the recorded Mistral answer stops mid-sentence.
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"

mistral_rag_chain.invoke(question)

" The potential issues highlighted by the user 'Old_Lamplighter' are that the reviewer"

### 4.3. Zephyr 7B

In [None]:
# Zephyr 7B beta through the HuggingFaceHub wrapper, with the same model_kwargs as the other two models.
repo_id = "HuggingFaceH4/zephyr-7b-beta"
zephyr_llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 1000}
)


In [None]:
# Same pipeline shape as rag_chain, with Zephyr 7B as the LLM.
zephyr_rag_chain = (
    {"context": vector_index | format_docs, "question": RunnablePassthrough()}
    | prompt
    | zephyr_llm
    | StrOutputParser()
)

In [None]:
# Ask the same question as before; the recorded Zephyr answer is a full paragraph.
question = "What potential issues are highlighted by the user 'Old_Lamplighter'?"
zephyr_rag_chain.invoke(question)

" The user 'Old_Lamplighter' highlights the potential issue of a reviewer (or co-author) embellishing a candidate's profile, which may not impress the hiring manager. This could result in the candidate's qualifications being overstated or exaggerated, potentially leading to false expectations or disappointment during the hiring process. The user's concern is that the hiring manager may not be impressed by the embellished profile, which could negatively impact the candidate's chances of being hired."

### 4.4. Other Models
`1. Alpaca 7B`

`2. Llama 7B / Open Llama 3B`

`3. BLING Series of Models`