# RAG Approach 1 (Open Source): LangChain + Pinecone 

In this notebook, we explore the use of <b>open-source and free</b> packages for building a RAG (retrieval-augmented generation) system/pattern.

## Imports and Environment Variables

In [None]:
# Notebook imports
import openai
import os 
from langchain.llms import AzureOpenAI
import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv, dotenv_values

# Additional imports
from utils import process_pdf
from langchain.document_loaders import BSHTMLLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from typing import Callable, Optional, Union
from langchain.embeddings import AzureOpenAIEmbeddings
import time


# Load the variables defined in ./my.env into the process environment, then
# configure the openai module for Azure from those environment variables.
load_dotenv('./my.env')
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.api_base= os.getenv('OPENAI_API_BASE')
openai.api_type= "azure"
openai.api_version = os.getenv('OPENAI_API_VERSION')


# Overwrite the base URL and API version assigned above with the Azure-specific
# entries read directly from ./my.env. Statement order matters: these two
# assignments are the ones that take effect. Indexing `config[...]` raises
# KeyError if either key is absent from the file.
config = dotenv_values("./my.env")
openai.api_base = config["AZURE_OPENAI_ENDPOINT"]
openai.api_version = config["AZURE_OPENAI_API_VERSION"]


# Pinecone credentials; os.getenv yields None when a variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')


## Extracting Data

In [None]:
# Sample source documents (paths relative to the notebook's working directory).
doc_html, doc_pdf = 'test_doc.html', 'test_doc.pdf'

### From a Webpage

In [None]:
# Load the sample HTML file and keep the text of the first document returned
# by the loader.
loader = UnstructuredHTMLLoader(doc_html)
data = loader.load()
html_content = data[0].page_content
# Preview a short slice of the extracted text as a sanity check.
print(html_content[40:150])

### From a PDF file

In [None]:
# Load the sample PDF. The loader's result is treated as one entry per page.
loader = PyPDFLoader(doc_pdf)
data = loader.load()
pages = len(data)

# Concatenate the text of every entry in order. str.join builds the result in
# one pass instead of re-copying the growing string on every iteration.
# NOTE(review): pages are joined without a separator, exactly as before, so
# text at a page boundary is fused - confirm this is intended.
pdf_content = ''.join(page.page_content for page in data)

# Preview a short slice of the extracted text as a sanity check.
print(pdf_content[40:150])

## Splitting text into chunks

In [None]:
def split_text(text, chunk_size: int, chunk_overlap: int, length_function: Callable[[str], int] = len):
    """Split a single text into chunk documents with a recursive character splitter.

    Args:
        text: The raw text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        length_function: Callable the splitter uses to measure text length;
            defaults to ``len``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.create_documents`` returns
        for the one-element list ``[text]``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": length_function,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.create_documents([text])

In [None]:
# Chunking parameters shared by both documents.
chunk_size, chunk_overlap = 1000, 0

# Split the HTML text and report how many chunks were produced.
split_html = split_text(html_content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
print(f'Number of HTML chunks = {len(split_html)}')

# Split the PDF text with the same parameters.
split_pdf = split_text(pdf_content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
print(f'Number of PDF chunks = {len(split_pdf)}')


## Create and Store Embeddings

In [None]:
# Passed to create_vector_search as the embeddings deployment name, so it must
# match the name of the deployment, not just the underlying model.
EMBEDDINGS_MODEL = "text-embedding-ada-002"

In [None]:
# Initialise the Pinecone client with the credentials read from the environment.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [None]:
def create_vector_search(split_text: list,
                         embeddings_deployment: str, index_name: str,
                         batch_size: int = 5):
    """Embed document chunks and insert them into an existing Pinecone index.

    Args:
        split_text: Chunk documents to store. They are handed to the vector
            store's ``add_documents``, so they must be document objects rather
            than plain strings.
        embeddings_deployment: Azure OpenAI deployment name used to create the
            embeddings.
        index_name: Name of the Pinecone index to write to. It must already
            exist; this function does not create it.
        batch_size: Number of chunks sent per ``add_documents`` call.

    Returns:
        The ``Pinecone`` vector store wrapping the index, after all chunks
        have been added.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer, or if
            ``index_name`` is not among ``pinecone.list_indexes()``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # `chunk_size` here is the number of texts sent per embeddings request; it
    # is unrelated to the text-splitter chunk size. A value of 1 sends one
    # chunk per request (presumably to stay within Azure per-request input
    # limits - TODO confirm whether a larger value is allowed).
    embeddings = AzureOpenAIEmbeddings(
        azure_deployment=embeddings_deployment,
        chunk_size=1)

    # Fail fast with a clear message instead of printing and then failing
    # later inside the first remote insert.
    if index_name not in pinecone.list_indexes():
        raise ValueError(
            f"Pinecone index {index_name!r} does not exist; "
            "create it before inserting documents"
        )

    index = pinecone.Index(index_name)

    # The third argument is the store's text key (the metadata field that holds
    # the raw chunk text). NOTE(review): it is an empty string; kept unchanged
    # so vectors already stored under that key stay readable.
    vectorstore = Pinecone(index, embeddings, '')

    # Insert the chunks in batches to keep each request small.
    for start in range(0, len(split_text), batch_size):
        vectorstore.add_documents(split_text[start:start + batch_size])
        print(f'Done with {start}')

    return vectorstore

In [None]:
# Target Pinecone index for this notebook.
index_name = 'langchain1'
print(f' We are using this embeddings model {EMBEDDINGS_MODEL} on this pincone {index_name}')

# Embed the HTML and PDF chunks together and store them in the index.
vector = create_vector_search(
    split_text=split_html + split_pdf,
    embeddings_deployment=EMBEDDINGS_MODEL,
    index_name=index_name,
)

## LLM + RAG

In [None]:
# Show the model name configured in the environment (informational only).
print(os.getenv('AZURE_OPENAI_CHATGPT_MODEL_NAME'))

# NOTE(review): the name below is hard-coded and takes precedence over the
# environment value printed above - confirm this override is intended.
LLM_MODEL = 'gpt-turbo'

# The same name is used for both the Azure deployment and the model.
llm = AzureOpenAI(deployment_name=LLM_MODEL, model_name=LLM_MODEL)

In [None]:
question = "Am I entitled to Night Pay?"

# Retrieve the stored chunks that are most similar to the question.
docs = vector.similarity_search(question)

# Preview the beginning of the first chunk returned.
print('Relevant chunk found: \n')
print(docs[0].page_content[:150])

# Answer the question with a "stuff" QA chain over the retrieved chunks; the
# final expression is displayed as the cell output.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(question=question, input_documents=docs)


In [None]:
# Same question, with an instruction asking for a one-word answer.
question = "Am I entitled to Night Pay? Give me a one word answer."

# Retrieve the stored chunks that are most similar to the question.
docs = vector.similarity_search(question)

# Run a "stuff" QA chain over the retrieved chunks; the final expression is
# displayed as the cell output.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(question=question, input_documents=docs)
