In [1]:
from langchain_community.document_loaders import PyMuPDFLoader 
import pprint
from markitdown import MarkItDown 
from langchain_community.document_loaders import TextLoader 
from langchain_text_splitters import MarkdownTextSplitter   
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma 
from langchain_mistralai import ChatMistralAI 
from langchain_core.messages import HumanMessage 
from langchain_core.prompts import ChatPromptTemplate

In [2]:
# PyMuPDFLoader library helps load and read PDF files page by page.
# Used to extract the text from each page of your PDF document.
loader = PyMuPDFLoader(
    "student_hand_book_2024_25_10.pdf",
    mode="page",
)
docs = loader.load()
print(len(docs))

143


In [3]:
# Pretty Print (pprint) makes printed output look neat and readable.
# Purpose: Used to display document content and results in a clean format.
pprint.pp(docs[0].page_content)

'1 | P a g e'


In [4]:
for i, doc in enumerate(docs):
    print(f"\n===== PAGE {i+1} =====")
    print(doc.page_content)


===== PAGE 1 =====
1 | P a g e

===== PAGE 2 =====
2 | P a g e  
 
Table of Contents 
Namal University at a Glance .................................................................................................. 11 
Background ..................................................................................................................... 11 
Namal Knowledge City: An Inspiration, A Dream ........................................................... 11 
Board of Governors ........................................................................................................ 12 
Vision and Mission of the Namal University: Not Just Statements ................................ 13 
Namal Core Values .......................................................................................................... 13 
Academic Conduct expected from the Namalites .................................................................. 13 
Academic Calendar 2024-25.......................................

In [5]:
with open("pdf_content.txt", "w", encoding="utf-8") as f:
    for doc in docs:
        f.write(doc.page_content + "\n")

MarkItDown
Converts documents (like PDFs or text files) into Markdown format.
Used to convert the extracted PDF text (pdf_content.txt) into a Markdown file.

In [6]:
markitdown = MarkItDown()
result = markitdown.convert("pdf_content.txt")

In [7]:
# Access the text part
markdown_text = result.text_content

In [8]:
# Now write it to a file
with open("pdf_content.md", "w", encoding="utf-8") as f:
    f.write(markdown_text)

TextLoader
Loads text files so they can be processed like documents.
Used to load the converted Markdown file for further splitting.

In [None]:
loader = TextLoader("pdf_content.md", encoding="utf-8")
docs = loader.load()

MarkdownTextSplitter

Splits large Markdown documents into smaller chunks of text.
It helps divide the document into parts that can be embedded easily and stored in a vector database.

In [9]:
splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=100)
split_docs = splitter.split_documents(docs)
print(f"Total chunks created: {len(split_docs)}")
pprint.pp(split_docs[2].page_content)

Total chunks created: 294
('Academic Calendar '
 '2024-25.................................................................................................... '
 '14 \n'
 'Academic Departments '
 '........................................................................................................... '
 '16 \n'
 '4. \n'
 'Department of Computer Science '
 '................................................................................... '
 '16 \n'
 'Programme Offered: '
 '....................................................................................................... '
 '16 \n'
 'Scheme of '
 'Studies........................................................................................................... '
 '17 \n'
 'Faculty profile '
 '................................................................................................................. '
 '23 \n'
 'Department of Electrical Engineering '
 '........................................................................

HuggingFaceEmbeddings
Converts text into numerical vectors (embeddings) for similarity search.
Used to turn each chunk of the document into embeddings for storage and retrieval.

In [10]:
# Load BGE-base English model
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")


In [11]:
texts = [doc.page_content for doc in split_docs]
vectors = embeddings.embed_documents(texts)
print(f"Total chunks: {len(vectors)}")
print(f"Embedding dimension: {len(vectors[0])}")
print(f"First chunk sample:\n{texts[0][:200]}")
print(f"First vector sample:\n{vectors[0][:5]}")

Total chunks: 294
Embedding dimension: 768
First chunk sample:
1 | P a g e
First vector sample:
[0.0012301248498260975, 0.03966820612549782, -0.0008271836559288204, 0.0033240539487451315, 0.046648215502500534]


Chroma

A local database that stores and retrieves text embeddings efficiently.
Used to create and save a searchable database of the document chunks.

In [12]:
db = Chroma.from_texts(
    texts=texts,              
    embedding=embeddings,        
    persist_directory="./chroma_db_bge"  
)

In [13]:
db.persist()

  db.persist()


In [14]:
print("✅ ChromaDB created and saved successfully!")
print("Total documents stored:", db._collection.count())

✅ ChromaDB created and saved successfully!
Total documents stored: 553


In [15]:
retriever = db.as_retriever(search_kwargs={"k": 3})
results = retriever.invoke("What is this document about?")
for i, doc in enumerate(results):
    print(f"\n--- Chunk {i+1} ---\n{doc.page_content[:300]}")


--- Chunk 1 ---
Plagiarism .................................................................................................................... 74
Other Forms of Academic Misconduct ....................................................................... 74
Student Protocol during Examinations ......................

--- Chunk 2 ---
Plagiarism .................................................................................................................... 74 
Other Forms of Academic Misconduct ....................................................................... 74 
Student Protocol during Examinations ....................

--- Chunk 3 ---
 To issue the Transcripts, Provisional Certificates, and degrees to the candidates.
 To provide adequate information on all examination matters and to bring to the notice
of the authorities any infringement of the Regulations pertaining to the examination
and ensure that the decision of the BoG an


ChatMistralAI

A powerful AI model that answers questions using given text context.
Used to generate intelligent answers based on the document’s content.

In [16]:
llm = ChatMistralAI(
    model="mistral-large-latest",
    mistral_api_key="ntieqofTshmKXs2Njl9ImaHZDyanjIwS"
)

In [21]:
query = "Can you provide an overview of this student notebook that what topics it covers?"
relevant_docs = retriever.invoke(query)

ChatPromptTemplate

Helps structure and format prompts for the AI model.
Used to create a clear question-answer format for the chatbot.

In [22]:
prompt = ChatPromptTemplate.from_template("""
You are an intelligent assistant.
Use the following context to answer the question.
If the answer is not in the context, say "I don’t know".

Context:
{context}

Question: {question}
""")

In [23]:
context = "\n\n".join([doc.page_content for doc in relevant_docs])
formatted_prompt = prompt.format(context=context, question=query)

HumanMessage

Wraps the user’s message so it can be processed by the AI model.
Used to send the formatted prompt (context + question) to the model.

In [24]:
response = llm.invoke([HumanMessage(content=formatted_prompt)])
print(response.content)

Based on the provided context, this student notebook (or course catalog/excerpt) covers advanced **Electrical and Electronics Engineering (EEN)** courses, organized into three main **specialization areas**:

1. **Wireless Communication & Computer Networks**
   - Wireless Communication (EEN-326)
   - Data Communication (EEN-471)
   - Operating Systems (EEN-473)
   - Network Security (EEN-474)
   - Antenna Theory and Design (EEN-475)
   - Computer Communication Networks (EEN-476)
   - Cloud Computing (EEN-477)

2. **Power (Systems) Engineering**
   - Renewable Energy Systems (EEN-480)
   - High Voltage Engineering (EEN-481)
   - Power System Analysis (EEN-482)
   - Power System Protection (EEN-483)
   - Smart Grid Systems (EEN-484)
   - Power System Operation and Control (EEN-485)
   *(Prerequisite: EEN-222 for all power courses)*

3. **Integrated Circuits and Electronics**
   - VLSI Design (EEN-490)
   - Integrated Circuit Design (EEN-491)
   - Digital System Design (EEN-492)
   - Indus