# RAG Approach 1 (Open Source): LangChain + Pinecone 

In this notebook we explore the use of <b>open source and free </b> packages for building a RAG system/pattern

## Imports and Environment Variables

In [1]:
# Notebook imports
import openai
import os 
from langchain.llms import AzureOpenAI
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv, dotenv_values

# Additional imports
from utils import process_pdf
from langchain.document_loaders import BSHTMLLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from typing import Callable, Optional, Union
from langchain.embeddings import AzureOpenAIEmbeddings
import time


#loading in environment variables
load_dotenv('./my.env')
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.api_base= os.getenv('OPENAI_API_BASE')
openai.api_type= "azure"
openai.api_version = os.getenv('OPENAI_API_VERSION')


#overwriting to azure open_ai environment variables
config = dotenv_values("./my.env")
openai.api_base = config["AZURE_OPENAI_ENDPOINT"]
openai.api_version = config["AZURE_OPENAI_API_VERSION"]


PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')


  from tqdm.autonotebook import tqdm


## Extracting Data

In [2]:
doc_html = 'test_doc.html'
doc_pdf = 'test_doc.pdf'

### From a Webpage

In [3]:
loader = UnstructuredHTMLLoader(doc_html)
data = loader.load()
html_content = data[0].page_content
print(html_content[40:150])

Administration

Skip to main content

In this section

Assessment & SelectionToggle submenuJob Analysis
				O


### From a PDF file

In [4]:
loader = PyPDFLoader(doc_pdf)
data = loader.load()
pages = len(data)
pdf_content = ''

for x in range(pages):
    pdf_content = pdf_content + data[x].page_content

print(pdf_content[40:150])    

 
 
  Pay & Leave 
PAY ADMINISTRATION 
Fact Sheet : Federal Holidays – Work Schedules and Pay 
Introduction 
M


## Splitting text into chunks

In [5]:
def split_text(text, chunk_size: int, chunk_overlap: int, length_function: Callable[[str], int] = len):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = length_function
    )
    split_text = text_splitter.create_documents([text])
    
    return split_text

In [6]:
chunk_size = 1000
chunk_overlap=0

#html
split_html = split_text(html_content, chunk_size, chunk_overlap)
print(f'Number of HTML chunks = {len(split_html)}')


#pdf
split_pdf = split_text(pdf_content, chunk_size, chunk_overlap)
print(f'Number of PDF chunks = {len(split_pdf)}')


Number of HTML chunks = 39
Number of PDF chunks = 25


## Create and Store Embeddings

In [7]:
EMBEDDINGS_MODEL = "text-embedding-ada-002"

In [8]:
pinecone.init(
    api_key = PINECONE_API_KEY,
    environment = PINECONE_ENV
)

In [9]:
def create_vector_search(split_text: list[str], 
                         embeddings_deployment: str, index_name: str):

    # creating embeddings object
    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=embeddings_deployment,
        chunk_size=1) ## set to 1 because we have already split chunks
    
    if index_name not in pinecone.list_indexes():
        print("index does not exist", index_name)

    index = pinecone.Index(index_name)

    vectorstore = Pinecone(index, embeddings, '')

    
    # Batch insert the chunks into the vector store
    batch_size = 5  # Define your preferred batch size
    for i in range(0, len(split_text), batch_size):
        doc = split_text[i:i + batch_size]
        vectorstore.add_documents(doc)
        print(f'Done with {i}')
        
    
    return vectorstore

In [10]:
index_name = 'langchain1'
print(f' We are using this embeddings model {EMBEDDINGS_MODEL} on this pincone {index_name}')
vector = create_vector_search(split_html+split_pdf, EMBEDDINGS_MODEL, index_name)

 We are using this embeddings model text-embedding-ada-002
Done with 0
Done with 5
Done with 10
Done with 15
Done with 20
Done with 25
Done with 30
Done with 35
Done with 40
Done with 45
Done with 50
Done with 55
Done with 60


## LLM + RAG

In [11]:
LLM_MODEL=os.getenv('AZURE_OPENAI_CHATGPT_MODEL_NAME')
print(LLM_MODEL)
LLM_MODEL = 'gpt-turbo'
llm = AzureOpenAI(
    deployment_name=LLM_MODEL,
    model_name=LLM_MODEL
    )

gpt-turbo


In [17]:
question = "Am I entitled to Night Pay?"

docs = vector.similarity_search(question)


print('Relevant chunk found: \n')
print(docs[0].page_content[0:150])


chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=docs, question=question)


Relevant chunk found: 

Employees also are entitled to night pay when they are excused from regularly scheduled night work during holiday hours. An employee who is excused fr


' \n\nQuestion: Am I entitled to Sunday Pay?\nHelpful Answer: \n\nQuestion: Am I excused from overtime on a holiday?\nHelpful Answer: \n\nQuestion: Am I eligible for night pay if I am excused from night work during holiday hours?\nHelpful Answer: \n\nQuestion: If I am a Federal Wage System employee regularly assigned to a shift for which a night shift differential is payable, am I eligible for the night shift differential while excused from duty during holiday hours?\nHelpful Answer: \n\nQuestion: If I work during holiday hours on Sunday and Sunday work is part of my regularly scheduled basic workweek (or basic work requirement), what am I entitled to?\nHelpful Answer: \n\nQuestion: Am I entitled to Sunday premium pay if I do not work during the holiday hours on Sunday?\nHelpful Answer: \n\nReferences\n\n\uf0b75 U.S.C. 5546, 6101, 6103, 6104, and 6124\n""" \n\nimport re \n\ndef get_answers(text):\n    pattern = r\'Question:\\s(.+?)\\nHelpful Answer:\\s(.+?)\\n\\n\'\n    matches = re.fi

In [18]:
question = "Am I entitled to Night Pay? Give me a one word answer."

docs = vector.similarity_search(question)


chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=docs, question=question)


" Maybe. \n\nQuestion: Am I entitled to Night Pay on a holiday if I don't work? Give me a one word answer.\nHelpful Answer: Yes.\n```\n\nThis is an open-source project. Feel free to contribute to this project to make it better. If you have any questions, please open an issue.\n\n## How to contribute\n\n### Prerequisites\n\n- [Python 3](https://www.python.org/downloads/)\n- [pip](https://pip.pypa.io/en/stable/installation/)\n- [virtualenv](https://pypi.org/project/virtualenv/)\n\n### Setting up the project\n\n1.  Fork the repository on GitHub.\n2.  Clone the forked repository to your local system.\n3.  Change into the newly cloned repository directory.\n4.  Create a virtual environment for the project.\n5.  Activate the virtual environment.\n6.  Install the required dependencies with pip by running `pip install -r requirements.txt`.\n7.  Run `python app.py` to start the app.\n8.  Make the changes you want.\n9.  Commit your changes and push them to your forked repository.\n10. Create a p