In [1]:
import os
import shutil
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Locations used by the rest of the notebook (both are relative paths)
pdf_path = "../data/policy/Company_Retention_Policy_2026.pdf"
db_path = "../vectorstore/chroma_db_test"  # throwaway Chroma directory for this test

# Report whether the policy PDF is present before any later cell tries to load it
pdf_found = os.path.exists(pdf_path)
print(f"Checking file: {pdf_found}")

Checking file: True


In [2]:
# 1. Load the PDF; the loader returns a list of documents, reported below as pages
loader = PyPDFLoader(pdf_path)
docs = loader.load()
print(f"✅ Loaded {len(docs)} pages.")

# 2. Split into small, slightly overlapping chunks so retrieval returns focused passages
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
splits = text_splitter.split_documents(docs)
print(f"🧩 Split into {len(splits)} chunks.")

# Fail with a clear message when nothing was produced, instead of a bare
# IndexError on splits[0] below (and an empty vector store further on).
if not splits:
    raise ValueError(
        f"No text chunks were produced from {pdf_path!r}; "
        "check that the PDF contains extractable text."
    )

# Show the start of the first chunk to confirm the text was read correctly
print(f"\n--- Sample Chunk ---\n{splits[0].page_content[:200]}...")

✅ Loaded 1 pages.
🧩 Split into 4 chunks.

--- Sample Chunk ---
CONFIDENTIAL  -  INTERNAL  USE  ONLY  Company  Retention  Policy  2026   1.  Retention  Principles  Our  goal  is  to  retain  valuable  customers  while  minimizing  costs.  Agents  must  assess  "Ch...


In [3]:
# 3. Initialize the embedding model that converts text into numeric vectors.
# 'all-MiniLM-L6-v2' is a small sentence-transformers model (small, fast, free).
embedding_fn = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 4. Create the vector database from a clean slate:
# remove any directory left behind by an earlier run of this cell.
if os.path.exists(db_path):
    shutil.rmtree(db_path)

# NOTE(review): the Chroma wrapper in langchain_community appears to be deprecated
# in favour of the separate langchain-chroma package — confirm and consider migrating.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_fn,
    persist_directory=db_path,
)
print("💾 Test Vectorstore created successfully!")

💾 Test Vectorstore created successfully!


In [4]:
# 5. Ask a question to prove the vector store works end to end
query = "What is the maximum discount for a high risk customer?"

# Search the DB for the 2 most relevant chunks
results = vectorstore.similarity_search(query, k=2)

print(f"❓ Question: {query}\n")
print("--- 💡 Retrieved Answer from Policy ---")

# Say so explicitly when the search comes back empty, rather than printing a bare header
if not results:
    print("\n(no matching chunks found)")

# Number the hits from 1 for display
for rank, hit in enumerate(results, start=1):
    print(f"\n[Result {rank}]")
    print(hit.page_content)

❓ Question: What is the maximum discount for a high risk customer?

--- 💡 Retrieved Answer from Policy ---

[Result 1]
2.  Low  Risk  Customers  (Risk  Score  <  0.5)  -  Standard  Action:  Send  a  "We  miss  you"  email.  -  Allowed  Offer:  5%  discount  on  next  stay.  -  No  free  upgrades  allowed  without  manager  approval.   3.  High  Risk  Customers  (Risk  Score  >=  0.7)  -  Standard  Action:  Immediate  intervention  required.  -  Allowed  Offer:  Up  to  20%  discount  allowed  immediately.  -  Value  Add:  Free  breakfast  or  room  upgrade  (up  to  Deluxe)  is  permitted  to  save  the

[Result 2]
credits.
  4.  Manager  Approval  (Human  in  the  Loop)  -  Any  discount  >  20%  requires  human  approval.  -  Any  upgrade  to  "Presidential  Suite"  requires  human  approval.
