Define embeddings

In [None]:
# Huggingface embeddings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model used for both the document chunks and the query text.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-base-en-v1.5",
    # Hardware / precision settings.
    device="cuda",
    model_kwargs={"torch_dtype": "float16"},
    # Encoding behaviour.
    normalize=True,
    embed_batch_size=16,
    parallel_process=True,
    show_progress_bar=True,
)

Define LLM

In [None]:
# Define LLM
from llama_index.llms.openai import OpenAI
# Chat model handle; the model name is pinned to a dated snapshot.
llm = OpenAI(model="gpt-5-mini-2025-08-07", temperature=0.6)

Connect to and Initialize PostgreSQL Database

In [None]:
import os

import psycopg2
from psycopg2 import sql

# Connection settings. Host, port, user and password can be overridden with
# the conventional PG* environment variables so that real credentials never
# have to be committed with the notebook; the fallbacks reproduce the
# original local-development values.
db_name = "resume_db"
host = os.environ.get("PGHOST", "localhost")
password = os.environ.get("PGPASSWORD", "123456")  # local-dev fallback only
port = os.environ.get("PGPORT", "5432")
user = os.environ.get("PGUSER", "postgres")

# Connect to the maintenance database ("postgres"), not to `db_name`, because
# the target database is about to be dropped and recreated.
conn = psycopg2.connect(
    dbname="postgres",
    user=user,
    password=password,
    host=host,
    port=port,
)
try:
    # DROP/CREATE DATABASE cannot run inside a transaction block, so every
    # statement has to be committed immediately.
    conn.autocommit = True

    with conn.cursor() as cursor:
        # WARNING: destructive -- any existing database with this name is lost.
        # sql.Identifier quotes the name instead of splicing raw text into SQL.
        cursor.execute(
            sql.SQL("DROP DATABASE IF EXISTS {};").format(sql.Identifier(db_name))
        )
        cursor.execute(
            sql.SQL("CREATE DATABASE {};").format(sql.Identifier(db_name))
        )
finally:
    # The admin connection is only needed for the bootstrap; close it so that
    # re-running this cell does not accumulate open sessions on the server.
    conn.close()

In [None]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Vector store backed by the freshly created database; it reuses the
# connection settings defined in the database bootstrap cell.
vector_store = PGVectorStore.from_params(
    host=host,
    port=port,
    user=user,
    password=password,
    database=db_name,
    table_name="resume_table",
    embed_dim=768,  # has to equal the length of the vectors from embed_model
)

Use PyMuPDF to read data

In [None]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
import textwrap


loader = PyMuPDFReader()
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
pdf_dir = Path("C:\\LlamaIndex\\Resume-Creator\\data")

# glob() yields entries in arbitrary order; sorting keeps the document order
# (and every index derived from it later on) reproducible between runs.
pdf_files = sorted(pdf_dir.glob("*.pdf"))
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in {pdf_dir}")

# Concatenate the documents returned by the reader for each PDF file.
documents = []
for pdf_file in pdf_files:
    docs = loader.load(file_path=str(pdf_file))
    documents.extend(docs)

# Fail with a clear message instead of an IndexError on the preview below.
if not documents:
    raise ValueError(f"No documents could be loaded from the PDFs in {pdf_dir}")

# Preview: first 50 wrapped lines of the first loaded document.
textwrap.wrap(documents[0].text)[0:50]

Split text

In [None]:
from llama_index.core.node_parser import SentenceSplitter
# Splitter used to cut each document's text into overlapping chunks.
text_parser = SentenceSplitter(chunk_overlap=50, chunk_size=512)

In [None]:
# Parallel lists: text_chunks[i] was cut from documents[doc_idxs[i]].
text_chunks, doc_idxs = [], []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks += cur_text_chunks
    # One source-document index per chunk that was just produced.
    doc_idxs += [doc_idx for _ in cur_text_chunks]

Construct nodes

In [None]:
from llama_index.core.schema import TextNode
# Build one TextNode per chunk, carrying over the metadata of the document
# the chunk was cut from.
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    # Copy the dict: assigning src_doc.metadata directly would make every
    # node of a document (and the document itself) share one mutable mapping,
    # so editing one node's metadata would silently change all the others.
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)

Generate embeddings

In [None]:
# Embed all node texts through the batch API so the model's configured
# embed_batch_size is actually used; calling get_text_embedding() once per
# node runs a separate single-item encode for every chunk.
# NOTE(review): with parallel_process=True each encode call presumably also
# pays for worker start-up, which batching amortises -- confirm against the
# installed llama-index-embeddings-huggingface version.
node_texts = [node.get_content(metadata_mode="all") for node in nodes]
node_embeddings = embed_model.get_text_embedding_batch(node_texts)

# Results are returned in input order, so they pair up with `nodes`.
for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding

Load nodes into vector store

In [None]:
# Strip NUL characters from each node's text before the nodes are written to
# the vector store.
for node in nodes:
    if not hasattr(node, "text"):
        continue
    node.text = node.text.replace('\x00', '')

# Insert the nodes; as the last expression, the return value of add() is
# shown as the cell output.
vector_store.add(nodes)

Generate a Query Embedding

In [None]:
# Natural-language question to answer from the indexed documents.
query_str = "What is the experience at Battelle?"

# Embed the question with the same embed_model object that produced the
# node embeddings.
query_embedding = embed_model.get_query_embedding(query_str)

Query the Vector database

In [None]:
from llama_index.core.vector_stores import VectorStoreQuery

# Retrieval mode passed to the vector store query.
query_mode = "default"

# Query object wrapping the embedded question.
vector_store_query = VectorStoreQuery(
    mode=query_mode,
    query_embedding=query_embedding,
)

In [None]:
# Run the prepared query against the vector store.
query_result = vector_store.query(vector_store_query)
# Print the `nodes` attribute of the query result.
print(query_result.nodes)

In [None]:
import src.prompts as prompts