# RAG Data Pipeline 

## Key goals of this notebook: 
- Read PDF files
- Split the documents into chunks
- Create a Vector Database and Store Embeddings
- Query the Vector Database

## Install and load some packages

In [1]:
!pip install -qU PyPDF2 langchain tiktoken sentence_transformers langchain-community pypdf openai python-dotenv chromadb langchain_openai

In [2]:
import os
import shutil
import sys

import openai
from dotenv import load_dotenv, find_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma

## Load the OpenAI API Key

In [3]:
# If you're starting from scratch, you may first need to write your OpenAI API key to a .env file:
# !echo "OPENAI_API_KEY=replacemykeyhere" >> .env

# Load environment variables from the nearest .env file (if one is found)
_ = load_dotenv(find_dotenv())

# Read the key with .get() and validate it explicitly: a missing or empty key then fails
# here with an actionable message instead of a bare KeyError('OPENAI_API_KEY').
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise KeyError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it before running this notebook."
    )

print("OpenAI API Key Loaded:", openai.api_key is not None)

OpenAI API Key Loaded: True


## Reading of PDF files 

In [4]:
file_path = "2405.10276v1.pdf"

# Build a loader for the PDF and read it into a list of Document objects
loader = PyPDFLoader(file_path)
docs = loader.load()

# Sanity check: report how many documents came back and peek at the first one
if not docs:
    print("No documents loaded.")
else:
    print(f"Loaded {len(docs)} document(s) from the PDF.")
    print(f"First document content preview: {docs[0].page_content[:50]}")

Loaded 9 document(s) from the PDF.
First document content preview: Revisiting OPRO: The Limitations of Small-Scale LL


## Splitting of PDF files into chunks

In [5]:
# Split the document into overlapping chunks with RecursiveCharacterTextSplitter.
# NOTE: with this splitter's default length function, chunk_size and chunk_overlap
# are measured in characters, not tokens.
chunk_size = 250
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [6]:
# Run the splitter over every loaded document; `splits` is the chunk list used by the later cells.
splits = text_splitter.split_documents(docs)
print(f"Number of splits: {len(splits)}")

Number of splits: 127


In [7]:
# Show the first two chunks as a quick sanity check on the splitter output
for chunk_no, chunk in enumerate(splits[:2], start=1):
    print(f"Chunk {chunk_no}:\n{chunk.page_content}\n")

Chunk 1:
Revisiting OPRO: The Limitations of Small-Scale LLMs as Optimizers
Tuo Zhang∗Jinyue Yuan∗and Salman Avestimehr
University of Southern California
{tuozhang, jinyueyu, avestime}@usc.edu
Abstract
Numerous recent works aim to enhance the

Chunk 2:
Abstract
Numerous recent works aim to enhance the
efficacy of Large Language Models (LLMs)
through strategic prompting. In particular, the
Optimization by PROmpting (OPRO) approach
provides state-of-the-art performance by lever-



## Create a vector database and store embeddings

In [8]:
from langchain_openai import OpenAIEmbeddings
# NOTE(review): Chroma is already imported in the first import cell (from langchain.vectorstores);
# this line rebinds the name to the langchain_community import.
from langchain_community.vectorstores import Chroma
# Embedding object handed to Chroma when the vector store is built. No arguments are passed, so the
# model and credentials come from library defaults — presumably OPENAI_API_KEY from the environment; confirm.
embedding = OpenAIEmbeddings()

In [9]:
# sample_text = "This is a test sentence."
# sample_embedding = embedding.embed_documents([sample_text])
# print(sample_embedding)

In [10]:
# Remove the previously created vector store (if any) so each run starts from an empty database.
# shutil.rmtree works on every platform and reuses persist_directory instead of repeating the path;
# ignore_errors=True keeps the "no error if the directory is missing" behaviour of `rm -rf`.
persist_directory = 'docs/chroma/'
shutil.rmtree(persist_directory, ignore_errors=True)

In [11]:
# Re-check the number of chunks right before they are written to the vector store.
print(f"Number of Document Splits:{len(splits)}")
# Uncomment to dump every split (verbose):
# print("splits::", splits)

Number of Document Splits:127


In [12]:
# Import Chroma from langchain_community, matching the embedding cell; the
# `langchain.vectorstores` path is a deprecated alias for the same class.
from langchain_community.vectorstores import Chroma

persist_directory = 'docs/chroma/'
# Remove old database files if any (portable replacement for `rm -rf ./docs/chroma`)
shutil.rmtree(persist_directory, ignore_errors=True)

In [13]:
# Build the Chroma vector store from the chunks, using `embedding` to embed them.
# persist_directory points the store at docs/chroma/ — presumably so the collection is
# written to disk; confirm against the Chroma wrapper docs.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [14]:
# Sanity check: the stored count should equal the number of splits passed in.
# NOTE(review): `_collection` is an underscore-prefixed (non-public) attribute of the wrapper,
# so this line may break on a library upgrade.
print(f"Number of documents in my vector DB: {vectordb._collection.count()}")

Number of documents in my vector DB: 127


In [15]:
# print(vectordb.get(include=['embeddings', 'documents', 'metadatas']))

In [16]:
# Query used by the retrieval cells that follow. A shorter alternative is kept for quick experiments:
# question = "what is the motivation for OPRO?"
question = "What are the key limitations of small-scale language models like LLaMa when used as optimizers for automated prompt engineering techniques like OPRO?"

In [17]:
# Retrieve the 3 closest chunks; each item is a (Document, score) tuple.
# NOTE(review): the score looks like a distance (hits come back in ascending score order) —
# confirm whether lower means more similar for this store.
results = vectordb.similarity_search_with_score(question,k=3)

In [18]:
# Raw dump of the (Document, score) pairs, including each chunk's page/source metadata.
print(results)

[(Document(page_content='scorer in OPRO’s context, leading to suboptimal\nprompt generation. As a result, due to the limited\ninference ability, small-scale LLMs could not sup-\nport self-optimization for prompting paradigms.\nHuman-Crafted Elements and Their Impacts.', metadata={'page': 3, 'source': '2405.10276v1.pdf'}), 0.2309589385986328), (Document(page_content='paper, we revisit OPRO for automated prompt-\ning with relatively small-scale LLMs, such as\nLLaMa-2 family and Mistral 7B . Our inves-\ntigation reveals that OPRO shows limited ef-\nfectiveness in small-scale LLMs, with limited', metadata={'page': 0, 'source': '2405.10276v1.pdf'}), 0.24007278680801392), (Document(page_content='Abstract\nNumerous recent works aim to enhance the\nefficacy of Large Language Models (LLMs)\nthrough strategic prompting. In particular, the\nOptimization by PROmpting (OPRO) approach\nprovides state-of-the-art performance by lever-', metadata={'page': 0, 'source': '2405.10276v1.pdf'}), 0.2474881559

In [19]:
# Same query without scores: every hit is a plain Document.
# Note that this rebinds `results`, replacing the (Document, score) pairs from the scored search.
results = vectordb.similarity_search(question, k=3)

# Print only the text of each retrieved chunk
for result in results:
    print(result.page_content)

scorer in OPRO’s context, leading to suboptimal
prompt generation. As a result, due to the limited
inference ability, small-scale LLMs could not sup-
port self-optimization for prompting paradigms.
Human-Crafted Elements and Their Impacts.
paper, we revisit OPRO for automated prompt-
ing with relatively small-scale LLMs, such as
LLaMa-2 family and Mistral 7B . Our inves-
tigation reveals that OPRO shows limited ef-
fectiveness in small-scale LLMs, with limited
Abstract
Numerous recent works aim to enhance the
efficacy of Large Language Models (LLMs)
through strategic prompting. In particular, the
Optimization by PROmpting (OPRO) approach
provides state-of-the-art performance by lever-


In [20]:
# code to be used later

In [21]:
# Clean and preprocess the text if necessary
def clean_text(text):
    """Collapse each run of whitespace in ``text`` to a single space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string, with no leading or trailing whitespace.
    """
    # Imported locally because `re` is not imported in the notebook's import cell
    import re

    return re.sub(r'\s+', ' ', text).strip()

# cleaned_text = clean_text(text)