In [5]:
# Install the third-party packages used by this notebook via uv's pip interface.
# NOTE(review): langchain_text_splitters is imported in the next cell but is not
# listed here — presumably it is pulled in as a dependency of langchain; confirm.
!uv pip install langchain langchain-community langchain-google-genai pypdf pinecone python-dotenv

[2mUsing Python 3.13.5 environment at: chatbot[0m
[2K[2mResolved [1m70 packages[0m [2min 846ms[0m[0m                                        [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/3)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/3)-------------------[0m[0m     0 B/63.92 KiB           [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/3)-------------------[0m[0m 16.00 KiB/63.92 KiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/3)[2m---------------[0m[0m 32.00 KiB/63.92 KiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/3)[30m[2m----------[0m[0m 42.90 KiB/63.92 KiB         [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/3)-----[30m[2m----------[0m[0m 42.90 KiB/63.92 KiB    [1A
[2mpackaging                [0m [32m--------------------[30m[2m----------[0m[0m 42.90 KiB/63.92 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/3)--------

In [46]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import time

In [8]:
# Load environment variables via python-dotenv so the API keys read with
# os.getenv in the following cell are available. The call is the cell's last
# expression, so its return value is shown as the cell output.
load_dotenv()

True

In [9]:
# API credentials are taken from the process environment.
# Each name is None when the corresponding variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [11]:
# --- Pipeline configuration -------------------------------------------------
# Source PDF (path relative to the notebook's working directory) and the name
# of the Pinecone index that stores its chunks.
PDF_PATH = "handbook.pdf"
INDEX_NAME = "handbook-rag"

# Text-splitter settings: target chunk length and the overlap shared between
# neighbouring chunks.
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 200

In [12]:
# Parse the handbook PDF into LangChain documents and report how many were
# produced (the message counts them as pages).
loader = PyPDFLoader(file_path=PDF_PATH)
documents = loader.load()
print(f"Loaded {len(documents)} pages...")

Loaded 175 pages...


In [14]:
# Cut the loaded documents into overlapping chunks whose length is measured
# with len() (i.e. in characters).
# NOTE(review): the separator list has no "\n" entry, so text without blank
# lines is split on spaces — confirm this is intentional.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", " ", ""],
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks...")

Created 368 chunks...


In [28]:
# Print one chunk in full to eyeball the split quality.
# The index is hard-coded, so this raises IndexError when there are fewer
# than 203 chunks.
sample_chunk = chunks[202]
print(sample_chunk.page_content + "...")

USTP Student Handbook 2023 Edition   84 
(e.g., transferring the 
tags/pins in a 
dissection specimen in 
Anatomy) to confuse, 
mislead or irritate the 
students and teacher. 
13. Smoking within the 
school premises, 
classrooms, laboratory, 
and other prohibited 
areas 
Reprimand
* and 
Communit
y Service 
(5 hours) 
3-day 
Suspensio
n 
Expulsio
n 
14. Other offenses 
analogous to those 
listed above. 
Reprimand
* and 
Communit
y Service 
(5 hours) 
3-day 
Suspensio
n 
Expulsio
n 
Reprimand shall be in the form of a formal letter signed by the OSA, and 
parents are copy-furnished. 
Section 3. Non-Academic Light Offenses 
Non-Academic Light 
Offenses 
1st 
Offense 
2nd 
Offense 
3rd 
Offens
e 
1. U
nauthorized staying in 
the University campus 
beyond 10:00 p.m. in 
violation of the 
student’s curfew as 
provided for in the 
University Code. 
Communit
y 
Service 
(3 hours) 
Communit
y 
Service 
(6 hours) 
No 
Entry 
into the 
Campus
* 
2. Not wearing the school 
ID and/or not using the

In [32]:
# Spot-check chunk quality: preview a random handful of chunks.
import random

# Draw up to 10 chunks without replacement (fewer if the corpus is smaller).
sample_size = 10
n_samples = min(sample_size, len(chunks))
random_chunks = random.sample(chunks, n_samples)

divider = "=" * 50
for i, chunk in enumerate(random_chunks):
    # Only the first 500 characters of each chunk are shown.
    preview = chunk.page_content[:500]
    print(f"\n=== CHUNK {i+1} ===")
    print(preview)
    print(divider)


=== CHUNK 1 ===
USTP Student Handbook 2023 Edition   132 
i. unwelcome phone calls with sexual overtones 
causing discomfort, embarrassment, offense, or 
insult to the receiver; and 
j. other analogous cases. 
4. The administrative offense of work-related sexual 
harassment is further described in the following 
circumstances: 
a. Submission to or rejection of the act or series of 
acts is used as basis for any employment decision 
(including, but not limited to, matters related to 
hiring, promotion, raise in 

=== CHUNK 2 ===
Philippines, the foreign/international 
student, as part of the requirements for processing his/her entry, 
shall report immediately to the BI Students Desk for registration. 
Foreign/international students may enroll in USTP after 
complying with the following requirements: 
1. Must have complete and valid credentials 
2. Must comply with  all the prescribed registration 
requirements of the College/Program, such as: 
a. College Admission Test Result (passed) 

In [None]:
# Gemini embedding client, used both to embed chunks for indexing and to embed
# incoming questions.
# NOTE(review): the same task_type therefore applies to chunk embeddings as
# well as question embeddings — confirm that is the intended pairing.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=GOOGLE_API_KEY,
    model="models/gemini-embedding-001",
    task_type="QUESTION_ANSWERING",
)

In [34]:
# Pinecone client, authenticated with the key read from the environment.
# Later cells use it to list, create, describe and open indexes.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [42]:
# Create the serverless Pinecone index on first run; later runs reuse it.

# Vector length the index is created with. It has to match the length of the
# vectors that are upserted into it.
EMBEDDING_DIMENSION = 3072
# Upper bound (seconds) on waiting for a freshly created index to become ready,
# so a stuck index cannot hang the kernel forever.
INDEX_READY_TIMEOUT_S = 120

existing_indexes = [index_info.name for index_info in pc.list_indexes()]

if INDEX_NAME not in existing_indexes:
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

    # Poll once per second until the index reports ready, giving up at the deadline.
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(INDEX_NAME).status['ready']:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Index {INDEX_NAME!r} was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)
    print("Index made.")
else:
    print(f"{INDEX_NAME} already exists.")

Index made.


In [43]:
# Handle to the index named INDEX_NAME; used below for upserts, stats and queries.
index = pc.Index(INDEX_NAME)

In [44]:
# Embed every chunk and upload the vectors to Pinecone, one upsert per batch.
# IDs are derived from the chunk's position ("chunk_<n>"), so re-running this
# cell overwrites the same IDs instead of adding new ones.
# NOTE(review): each chunk is embedded with its own embed_query call (one
# request per chunk) — consider whether a batched document-embedding call is
# appropriate here.
batch_size = 100
batch_starts = range(0, len(chunks), batch_size)

for i in batch_starts:
    vectors_to_upsert = []
    for j, chunk in enumerate(chunks[i:i + batch_size]):
        chunk_text = chunk.page_content
        # Tuple layout: (id, vector, metadata). The chunk text is stored in
        # the metadata so it can be read back at query time.
        vectors_to_upsert.append((
            f"chunk_{i+j}",
            embeddings.embed_query(chunk_text),
            {
                "text": chunk_text,
                "page": chunk.metadata.get("page", 0),
                "source": chunk.metadata.get("source", "unknown"),
            },
        ))

    index.upsert(vectors=vectors_to_upsert)
    print(f" Processing {min(i+batch_size, len(chunks))}/{len(chunks)} chunks")

print("\n All chunks uploaded successfully")

# Report what the index now holds.
stats = index.describe_index_stats()
print(f"\n Index statistics:")
print(f"   Total vectors: {stats['total_vector_count']}")
print(f"   Index dimension: {stats['dimension']}")

 Processing 100/368 chunks
 Processing 200/368 chunks
 Processing 300/368 chunks
 Processing 368/368 chunks

 All chunks uploaded successfully

 Index statistics:
   Total vectors: 368
   Index dimension: 3072


In [53]:
# Chat model that generates the final answer from the retrieved context.
llm = ChatGoogleGenerativeAI(
    google_api_key=GOOGLE_API_KEY,
    model="gemini-3-flash-preview",
    temperature=0.3,
)

In [54]:
# Default number of chunks retrieved from the index for each question.
TOP_K = 8

# Prompt template for the answering model. It is filled with str.format using
# two placeholders: {context} (the retrieved chunk text) and {question}.
SYSTEM_PROMPT = """
You are a helpful student assistant for USTP (University of Science and Technology of Southern Philippines). Your role is to answer student questions based ONLY on the information provided from the official student handbook.

STRICT RULES:
1. Only use information from the Context provided below. DO NOT make up or infer information not explicitly stated.
2. If the Context does not contain enough information to answer the question, honestly say "I don't have that information in the handbook" or "The handbook doesn't specify that."
3. Always cite the relevant section when possible (e.g., "According to the handbook...")
4. Be friendly and helpful, but stay factually accurate to the handbook content.
5. If a question is ambiguous, ask for clarification rather than guessing.

Context from the student handbook:
{context}

---
Student Question: {question}

Answer:
"""

In [55]:
def query_rag(question: str, top_k:int = TOP_K):
    """Answer a question from the handbook with retrieve-then-generate.

    Embeds the question, fetches the ``top_k`` closest chunks from the Pinecone
    index, fills ``SYSTEM_PROMPT`` with them and sends the prompt to the chat
    model. Uses the notebook-level ``embeddings``, ``index``, ``llm`` and
    ``SYSTEM_PROMPT`` objects, and prints one line per retrieved chunk.

    Args:
        question: The question to answer.
        top_k: How many chunks to retrieve from the index.

    Returns:
        A dict with these keys:
            'answer': ``response.content`` exactly as the model returned it
                (a string, or a list of content blocks for some models).
            'answer_text': the same answer flattened to one plain string.
            'sources': one ``{'page', 'score', 'text_preview'}`` dict per match.
            'context_used': the joined chunk text that was put in the prompt.
    """
    #generate question embedding
    question_embedding = embeddings.embed_query(question)

    #query pinecone
    results = index.query(
        vector=question_embedding,
        top_k=top_k,
        include_metadata=True
    )

    #extract retrieved chunks
    retrieved_chunks = []
    sources = []

    for i, match in enumerate(results['matches']):
        chunk_text = match['metadata']['text']
        page = match['metadata'].get('page', 'unknown')
        score = match['score']

        retrieved_chunks.append(chunk_text)
        sources.append({
            'page': page,
            'score': score,
            'text_preview': chunk_text[:200] + "..."
        })

        print(f"   ✓ Chunk {i+1}: Page {page}, Similarity: {score:.3f}")

    #combine chunks to context
    context = "\n\n---\n\n".join(retrieved_chunks)

    # create full prompt
    prompt = SYSTEM_PROMPT.format(context=context, question=question)

    #get llm answer
    response = llm.invoke(prompt)
    answer = response.content

    # The content is either a plain string or a list of blocks (strings or
    # dicts). Flatten the text-bearing blocks so callers do not need to know
    # which shape the model produced; non-text blocks are skipped.
    if isinstance(answer, str):
        answer_text = answer
    else:
        text_parts = []
        for block in answer:
            if isinstance(block, str):
                text_parts.append(block)
            elif isinstance(block, dict) and block.get('type', 'text') == 'text':
                text_parts.append(block.get('text', ''))
        answer_text = "".join(text_parts)

    return {
        'answer': answer,            # raw content, unchanged for existing callers
        'answer_text': answer_text,
        'sources': sources,
        'context_used': context
    }


In [78]:
# End-to-end smoke test with a real, informally worded student question.
test_question = "If sulod akong gwa sa required grade for deans lister, automatic na ba na nga listed ko as one of the deans lister? And mag announce nalang ba dayon ang school with list of names sa mga deans lister? Or ang student mismo mo undergo ug process and magpasa ug papers para sa lista siya as one of the deans lister? I know this might be a dumb question but Im genuinely serious on asking this, I hope to read some enlightening comments. Thank you."
result = query_rag(test_question)

# result['answer'] is the model's raw content: either a plain string or a list
# of content blocks. Join every text block instead of indexing [0]['text'], so
# this neither fails on a string / non-text first block nor drops later blocks.
answer = result['answer']
if isinstance(answer, str):
    answer_text = answer
else:
    answer_text = "".join(
        block if isinstance(block, str) else block.get('text', '')
        for block in answer
        if isinstance(block, str)
        or (isinstance(block, dict) and block.get('type', 'text') == 'text')
    )
print(answer_text)

   ✓ Chunk 1: Page 73, Similarity: 0.722
   ✓ Chunk 2: Page 75, Similarity: 0.666
   ✓ Chunk 3: Page 75, Similarity: 0.663
   ✓ Chunk 4: Page 74, Similarity: 0.643
   ✓ Chunk 5: Page 64, Similarity: 0.638
   ✓ Chunk 6: Page 71, Similarity: 0.638
   ✓ Chunk 7: Page 69, Similarity: 0.637
   ✓ Chunk 8: Page 64, Similarity: 0.636
Hello! That is a very valid question, and it's great that you're aiming for the Dean's List. Based on the **USTP Student Handbook 2023 Edition**, here is the information regarding your concern:

According to the handbook, the process for generating the Dean's List is handled by the university administration rather than through a student application. 

**How the list is generated:**
*   **Administrative Action:** According to page 62, "The Dean shall issue a letter requesting the Registrar to generate the list of students with a GPA of 1.75 or above." 
*   **Automatic Recognition:** The handbook does not state that a student needs to submit papers or undergo a spec