In [1]:
import os
import time
import numpy as np
import nltk
import polars as pl
from pypdf import PdfReader
from google import genai
from google.genai import types
#from google import generativeai as genai

In [2]:
client = genai.Client(api_key= os.getenv("API_KEY"))

To test API key is working or not

In [3]:
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents = "Hello,who am I talking to?"
)
print(response.text)

I am a large language model, trained by Google.



Extract sentences from PDF

In [4]:
nltk.download('punkt')
def extract_sentences_from_pdf(pdf_path, start=6, end=124):
    """Extract sentences from PDF with metadata (page number & sentence index)."""
    reader = PdfReader(pdf_path)
    sentences_with_meta = []

    for i in range(max(0, start - 1), min(end, len(reader.pages))):
        page_text = reader.pages[i].extract_text() or "" 
        sentences = nltk.sent_tokenize(page_text) # Splits the text into idividual sentences

        for idx, sentence in enumerate(sentences):
            sentences_with_meta.append({"text": sentence, "meta": {"type": "sentence", "page": i + 1, "index": idx}})

    return sentences_with_meta

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Creating Embedding for Text Chunks

In [5]:
def create_embeddings(text, model="models/embedding-001", task_type="SEMANTIC_SIMILARITY"):
    """Generate embeddings for a given text with error handling."""
    try:
        response = client.models.embed_content(
            model=model,
            contents=text,
            config=types.EmbedContentConfig(task_type=task_type)
        )
        return response.embeddings[0].values  # Return vector directly
    except Exception as e:
        print(f"Embedding Error: {e}")
        return np.zeros(768)  # Fallback vector to prevent failures


Cosine Similarity

In [6]:
def cosine_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

VectorStore

In [7]:
class VectorStore:
    """Store embeddings and metadata for retrieval."""
    def __init__(self):
        self.vectors = []
        self.texts = []
        self.metadata = []

    def add(self, text, vector, meta):
        """It allows the class to grow dynamically as new embeddings are added"""
        self.vectors.append(np.array(vector))
        self.texts.append(text)
        self.metadata.append(meta)

    def semantic_search(self, query_vector, k=10):
        """Retrieve the top-k most relevant sentences using semantic search."""
        scores = [(i, cosine_similarity(query_vector, v)) for i, v in enumerate(self.vectors)]
        scores.sort(key=lambda x: x[1], reverse=True)
        return [{"text": self.texts[i], "meta": self.metadata[i]} for i, _ in scores[:k]]

    def save(self, file_path):
        """Save vectors, texts, and metadata."""
        df = pl.DataFrame({"vectors": self.vectors, "texts": self.texts, "metadata": self.metadata})
        df.write_parquet(file_path)

    def load(self, file_path):
        """Load stored embeddings and metadata."""
        df = pl.read_parquet(file_path)
        self.vectors = df["vectors"].to_list()
        self.texts = df["texts"].to_list()
        self.metadata = df["metadata"].to_list()


Generate Answer

In [8]:
def generate_answer(query, matched_sentences):
    """Generate a response using retrieved contextual sentences."""
    if not matched_sentences:
        return "I don't know."

    context = "\n".join([entry["text"] for entry in matched_sentences])
    system_prompt = (
        "You are a helpful assistant. Use the provided context to answer the user's question.\n"
        "If the answer is clearly stated or implied in the context, provide it concisely.\n"
        "If it's not found in the context, say 'I don't know.' Do not make up information."
    )

    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=f"Question: {query}\n\nContext:\n{context}",
            config=types.GenerateContentConfig(system_instruction=system_prompt) #ensure AI follows specific rules when answering
        )
        return response.text.strip()
    except Exception as e:
        print(f"Error generating answer: {e}")
        return "Error: Unable to generate response."


Evaluate Answer

In [9]:
def evaluate_answer(query, ai_answer, ideal_answer):
    system_prompt = (
        "You are an evaluation system.\n"
        "Score the assistant's answer as follows:\n"
        "- 1 if it is correct and complete\n"
        "- 0.5 if it is partially correct\n"
        "- 0 if it is incorrect or missing\n"
        "Also provide a brief justification."
    )
    eval_prompt = f"Question: {query}\nAI Answer: {ai_answer}\nIdeal Answer: {ideal_answer}"
    
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[eval_prompt],
        config=types.GenerateContentConfig(system_instruction=system_prompt, temperature=0.4)
    )
    return response.text.strip()

Vector_store("embeddings.paraquet")

In [10]:
def save_vector_store(store, file_path="embeddings.parquet"):
    """Save embeddings for reuse."""
    store.save(file_path)
    print(f"Saved vector store to {file_path}")


In [11]:
def load_vector_store(file_path="embeddings.parquet"):
    """Load stored embeddings instead of regenerating them."""
    store = VectorStore()
    store.load(file_path)
    print(f"Loaded vector store from {file_path}")
    return store


In [12]:
import polars as pl

# Load the Parquet file
df = pl.read_parquet("embeddings.parquet")

# Display the first few rows
print(df.head())


shape: (5, 3)
┌─────────────────────────────────┬─────────────────────────────────┬──────────────────┐
│ vectors                         ┆ texts                           ┆ metadata         │
│ ---                             ┆ ---                             ┆ ---              │
│ array[f64, 768]                 ┆ str                             ┆ struct[3]        │
╞═════════════════════════════════╪═════════════════════════════════╪══════════════════╡
│ [0.005048, -0.057885, … 0.0119… ┆ Introduction                    ┆ {"sentence",6,0} │
│                                 ┆ Fundamental traff…              ┆                  │
│ [-0.003776, -0.061707, … -0.01… ┆ Be considerate of those using … ┆ {"sentence",6,1} │
│ [0.017079, -0.053181, … 0.0257… ┆ Be especially considerate of c… ┆ {"sentence",6,2} │
│ [-0.009283, -0.038325, … 0.028… ┆ Do not cause any unnecessary d… ┆ {"sentence",6,3} │
│ [0.020128, 0.005779, … -0.0170… ┆ No-one has any rights, only jo… ┆ {"sentence",6,4} │
└──────

In [13]:
if __name__ == "__main__":
    pdf_path = "Driving_theory_book_2025.pdf"

    print("Extracting sentences from PDF...")
    sentences = extract_sentences_from_pdf(pdf_path)

    print("Creating embeddings for sentences...")
    store = VectorStore()

    for i, entry in enumerate(sentences):
        emb = create_embeddings(entry["text"])
        store.add(entry["text"], emb, entry["meta"])
        time.sleep(0.5)  # Reduced delay for faster execution

    save_vector_store(store)

    vector_store = load_vector_store("embeddings.parquet")  # Load stored embeddings
    print("Vector store successfully loaded!")  # Confirm it works
    print(f"Total stored vectors: {len(vector_store.vectors)}") # Check stored embeddings count

    # Sample query
    question = "What is the rule at an uncontrolled intersection?"
    query_vector = create_embeddings(question)

    print("\nLoading saved vector store for search...")
    store = load_vector_store()

    top_sentences = store.semantic_search(query_vector, k=10)

    print("\nTop matched sentences for context:")
    for s in top_sentences:
        print(f"- {s['text']} (Page {s['meta']['page']}, Sentence {s['meta']['index']})")

    answer = generate_answer(question, top_sentences)

    print("\n=== Question ===")
    print(question)
    print("\n=== Answer ===")
    print(answer)

    ideal_answer = "You must yield to traffic coming from the right."

    print("\nEvaluation:")
    print(evaluate_answer(question, answer, ideal_answer))


Extracting sentences from PDF...
Creating embeddings for sentences...
Saved vector store to embeddings.parquet
Loaded vector store from embeddings.parquet
Vector store successfully loaded!
Total stored vectors: 1023

Loading saved vector store for search...
Loaded vector store from embeddings.parquet

Top matched sentences for context:
- This means that the pedestrian crossing is uncontrolled. (Page 47, Sentence 4)
- An uncontrolled pedestrian crossing. (Page 47, Sentence 0)
- Uncontrolled pedestrian crossings
Drivers have an obligation to give way to pedestrians who have stepped
out onto the pedestrian crossing or who are about to do so. (Page 46, Sentence 12)
- It is absent from most intersections where the priority-to-the-right
rule applies. (Page 32, Sentence 2)
- Intersecting traffic have a red light,
but oncoming traffic might have a
green light. (Page 41, Sentence 1)
- If there is no stop line, stop just
before entering the intersecting road. (Page 24, Sentence 5)
- Driver A mus

Text based Chatbot(ASk question)

In [14]:
store = load_vector_store("embeddings.parquet")  # Load previously saved vectors

# Ask one question per cell execution
question = input("Ask a question (or type 'exit' to quit): ")
if question.lower() != "exit":
    query_vector = create_embeddings(question)
    top_sentences = store.semantic_search(query_vector, k=5)
    answer = generate_answer(question, top_sentences)

    print("\nQuestion:\n", question)
    print("\nAnswer:\n", answer)
    print("\nContext Sentences:")
    for s in top_sentences:
        print(f"- {s['text']} (Page {s['meta']['page']}, Sentence {s['meta']['index']})")
        


Loaded vector store from embeddings.parquet

Question:
 What is the rule for uncontrolled pedestrian crossing?

Answer:
 Drivers must yield to pedestrians who have already stepped onto the uncontrolled pedestrian crossing or are about to do so.

Context Sentences:
- This means that the pedestrian crossing is uncontrolled. (Page 47, Sentence 4)
- An uncontrolled pedestrian crossing. (Page 47, Sentence 0)
- Uncontrolled pedestrian crossings
Drivers have an obligation to give way to pedestrians who have stepped
out onto the pedestrian crossing or who are about to do so. (Page 46, Sentence 12)
- Controlled pedestrian crossings
Have functioning traffic signals (or a police officer). (Page 46, Sentence 8)
- Is it prohibited
to overtake the bus in conjunction with the pedestrian
crossing? (Page 105, Sentence 1)


Report

In [15]:
#Critical Reflection

# API Key Usage:
# This chatbot uses an API key stored as an environment variable (api_key= os.getenv("API_KEY")) for authentication.
# This method retrieves the API key from environment variables to authenticate the request securely.
# The API key is required for communicating with external services (e.g., AI models, vector databases).

# Real-World Application:
# This chatbot acts as an AI-powered study assistant specifically for Swedish driving theory learners.
# It retrieves information directly from the "Introduction" section of the 2025 Driving Theory Book,
# ensuring learners get precise and structured explanations about foundational driving concepts.

# Challenges & Opportunities
# Business Perspective:
# Can be integrated into official driving school platforms as a self-study tool.
# Provides automated assistance, reducing the need for manual responses from instructors.
# Could evolve into a paid application for learners preparing for driving exams.

# Ethical Perspective:
# Users must be aware that it is an educational tool, not a legal authority on driving laws.
# Since only the introduction section is covered, learners must verify details from the full book.
# Bias in responses should be monitored to ensure clear, unbiased driving guidance.

# Technical Perspective:
# The chatbot relies on sentence-based chunking, embeddings, and semantic search to retrieve relevant details.
# Vector storage allows efficient searching within the introduction section.
# Performance could improve by expanding coverage beyond the introduction for a full-driving theory assistant.

# Future Possibilities:
# Support for multi-chapter processing, expanding to cover all sections of the driving theory book including image.
# Voice input integration to make it accessible as an interactive AI driving tutor.
# Currently, the chatbot focuses only on introductory driving concepts, but future improvements could transform it into a
# fully functional AI tutor for learners preparing for the driving exam.