# RAG using LlamaIndex and OpenAI

In [None]:
# Install any necessary libraries.
# Uncomment the lines below on first run; inside a notebook, `%pip install ...`
# is preferable to `!pip install ...` because it targets the running kernel's environment.
# !pip install langchain_community
# !pip install langchain_openai
# !pip install llama-index-llms-langchain
# !pip install python-dotenv
# !pip install llama_index
# !pip install llama-index-embeddings-huggingface
# !pip install llama-index-llms-huggingface

In [2]:
# Load required libraries
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os
# Load environment variables from the local "api_key.env" file.
# The file is expected to define an `api_key` entry, which a later cell
# copies into OPENAI_API_KEY. Keep this file out of version control.
load_dotenv("api_key.env")

True

In [3]:
# Collect every PDF under the current folder (sub-folders included)
loader = SimpleDirectoryReader(input_dir=".", recursive=True, required_exts=[".pdf"])

# Parse the PDFs into Document objects and display them.
# NOTE: the displayed output contains the full document text; clear the cell
# output before sharing the notebook if the PDFs hold personal data.
documents = loader.load_data()
documents



[Document(id_='d03bac36-9967-4311-9bef-709d00c3c237', embedding=None, metadata={'page_label': '1', 'file_name': 'Data Scientist, NLP and LLM.pdf', 'file_path': '/content/Data Scientist, NLP and LLM.pdf', 'file_type': 'application/pdf', 'file_size': 112541, 'creation_date': '2025-03-11', 'last_modified_date': '2025-03-11'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Shraddha Piparia, Ph.D. Computational Biologist, Richland, WA | 940-297-9424 | spiparia@health.ucsd.edu Professional Experience Postdoctoral Research Associate | 2021-Present | University of California, San Diego  • Developed a robust ML pipeline combining pediatric chest r

In [None]:
# Instantiate the embedding model used to vectorise text for the vector store.
# "BAAI/bge-small-en-v1.5" is a small English embedding model that encodes text into a vector space for similarity-based retrieval.
embedding_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Report the device the underlying model is running on.
# NOTE(review): `_model` is a private attribute, so this line may break on a library upgrade.
print(embedding_model._model.device)

In [5]:
# Embed the loaded documents with the HuggingFace model and build the vector index
index = VectorStoreIndex.from_documents(documents, embed_model=embedding_model)

# Persist the index to ./huggingfaceembeddings so it can be reloaded later
index.storage_context.persist(
    persist_dir="./huggingfaceembeddings",
)


In [6]:
# 1. EMBEDDING MODEL & INDEX PERSISTENCE
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Rebuild the index from the folder it was persisted to, passing the same
# embedding model that was used when the index was created
storage_context = StorageContext.from_defaults(
    persist_dir="./huggingfaceembeddings",
)
index = load_index_from_storage(
    storage_context,
    embed_model=embedding_model,
)

In [7]:
# Inspect the stored chunks: print each docstore entry with the ID of the
# document it was derived from.
# NOTE: this prints the full chunk text; clear the output before sharing if it
# contains personal data.
for node in index.docstore.docs.values():
    print("Document ID:", node.ref_doc_id)
    print("Text Chunk:", node.text)
    print("=" * 50)


Document ID: d03bac36-9967-4311-9bef-709d00c3c237
Text Chunk: Shraddha Piparia, Ph.D. Computational Biologist, Richland, WA | 940-297-9424 | spiparia@health.ucsd.edu Professional Experience Postdoctoral Research Associate | 2021-Present | University of California, San Diego  • Developed a robust ML pipeline combining pediatric chest radiograph impressions for COVID-19 diagnosis (F1=0.79, Accuracy=80.65%), identified key radiological predictors using explainability methods (SHAP, GINI), and validated variant-specific performance.  • Led pharmacogenetic analyses linking two SNPs (rs3127412, rs37972) with ICS response in high-eosinophil asthmatics, demonstrating significantly enhanced lung function improvements (>12% FEV1, AUC=0.72 vs. 0.54 in low eosinophils) and underscoring the value of personalized medicine for targeted asthma management. Application Developer | 2013-2016 | Oracle India Private Limited | Telangana, India • Developed a sentiment classification model that analyzed tasks

In [8]:
# Expose the OpenAI key under the name the OpenAI client expects.
# The key is read from the `api_key` variable (loaded from "api_key.env" earlier);
# an already-exported OPENAI_API_KEY is accepted as a fallback.
# Assigning None to os.environ raises an opaque TypeError, so fail early with
# a message that says what is actually missing.
openai_api_key = os.getenv("api_key") or os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OpenAI API key not found: define `api_key` in api_key.env "
        "or export OPENAI_API_KEY before running this cell."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# 2. LOADING THE OPENAI LLM
openai_llm = ChatOpenAI(temperature=0.7, model_name="gpt-4o-mini")

# 3. BUILD A QUERY ENGINE FOR RAG
# RAG (Retrieval-Augmented Generation) means the query engine will first retrieve relevant text chunks
# from the index, then feed them to the OpenAI LLM to produce a context-aware answer.
query_engine = index.as_query_engine(llm=openai_llm)

In [9]:
# Interactive question-answering loop over the indexed documents.
# Type "quit" (any casing, surrounding whitespace ignored) to stop.
while True:
    question = input("Question: ").strip()
    if not question:
        # Blank input: re-prompt instead of spending an LLM call on an empty query
        continue
    if question.lower() == "quit":
        break
    # Retrieve relevant chunks, let the LLM answer, and show only the answer text
    print(query_engine.query(question).response)

Question: whats the document about?
The document provides a detailed overview of Shraddha Piparia's professional background, highlighting her experience, technical skills, education, and select publications. It outlines her current role as a Postdoctoral Research Associate at the University of California, San Diego, where she focuses on machine learning applications in healthcare, particularly related to COVID-19 diagnosis and pharmacogenetic analyses in asthma. Additionally, it describes her previous experience as an Application Developer at Oracle India, where she worked on sentiment classification models and enterprise applications. The document also lists her educational qualifications, including a Ph.D. in Computer Science, and presents a selection of her published research in related fields.
Question: summarize the expertise
The expertise includes computational biology with a focus on machine learning and natural language processing. Key accomplishments involve developing ML pipe