### Note!!!

This notebook requires at least 50 GB of RAM and 16 GB of VRAM to run, so make sure you are using Colab Pro.

In [1]:
!pip install -q PymuPDF
!pip install -q python-dotenv
!pip install -q bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m104.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m102.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import fitz
import numpy as np
import json
import re

from openai import OpenAI
from tqdm import tqdm

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def gen_text_chunk(pdf_path, n, overlap):
  """
  Extract all text from a PDF and split it into overlapping character chunks.

  Args:
  pdf_path (str): Path of the PDF file to read.
  n (int): Maximum number of characters per chunk. Must be positive.
  overlap (int): Number of characters shared by consecutive chunks.
      Must satisfy 0 <= overlap < n, otherwise the window would never advance.

  Returns:
  List[str]: Chunks in document order; the trailing chunks may be shorter than n.

  Raises:
  ValueError: If n is not positive or overlap is outside [0, n).
  """
  # Validate before touching the file so bad arguments fail fast and clearly
  # (previously overlap > n silently produced an empty list).
  if n <= 0:
    raise ValueError(f"n must be a positive chunk size, got {n}")
  if not 0 <= overlap < n:
    raise ValueError(f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}")

  # The context manager closes the document even if text extraction fails.
  with fitz.open(pdf_path) as pdf:
    text = "".join(page.get_text("text") for page in pdf)

  # Each window starts `step` characters after the previous one.
  step = n - overlap
  return [text[i: i+n] for i in range(0, len(text), step)]

# Chunk the sample PDF into 1000-character windows; consecutive windows
# share 200 characters (the window start advances by n - overlap).
text_chunks = gen_text_chunk("data/AI_Information.pdf", n=1000, overlap=200)
# Show the first five chunks as a quick sanity check.
text_chunks[:5]

['Understanding Artificial Intelligence \nChapter 1: Introduction to Artificial Intelligence \nArtificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot \nto perform tasks commonly associated with intelligent beings. The term is frequently applied to \nthe project of developing systems endowed with the intellectual processes characteristic of \nhumans, such as the ability to reason, discover meaning, generalize, or learn from past \nexperience. Over the past few decades, advancements in computing power and data availability \nhave significantly accelerated the development and deployment of AI. \nHistorical Context \nThe idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. \nHowever, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop \nin 1956 is widely considered the birthplace of AI. Early AI research focused on problem-solving \nand symbolic methods. The 1980s saw

In [None]:
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Generator: tokenizer + causal LM used for question generation.
gen_tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct")
# torch_dtype="auto" / device_map="auto" leave dtype selection and device
# placement to the library instead of fixing them here.
gen_model = AutoModelForCausalLM.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct",
    torch_dtype="auto",
    device_map="auto")


# Embedder: encoder model (moved explicitly to `device`) and its tokenizer.
embed_model = AutoModel.from_pretrained("BAAI/bge-base-en").to(device)
embed_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en")

In [25]:
def generate_questions(chunk, num_questions=5, model="unsloth/Meta-Llama-3.1-8B-Instruct"):
    """
    Generates relevant questions that can be answered from the given text chunk.

    Generation runs on the module-level `gen_tokenizer` / `gen_model` pair and
    moves the inputs to the module-level `device`. The raw model output is
    printed between separator lines before it is parsed.

    Args:
    chunk (str): The text chunk to generate questions from.
    num_questions (int): Number of questions requested in the prompt.
    model (str): Kept for interface compatibility only. It is NOT used:
        generation always runs on the already-loaded `gen_model`.

    Returns:
    List[str]: Lines of the model output that end with '?', with any leading
    list marker ("1.", "1)", "-", "*", "•") removed. Sampling is enabled, so
    the result is not deterministic.
    """
    # Define the system prompt to guide the AI's behavior
    system_prompt = "You are an expert at generating relevant questions from text. Create concise questions that can be answered using only the provided text. Focus on key information and concepts."

    # Define the user prompt with the text chunk and the number of questions to generate
    user_prompt = f"""
    Based on the following text, generate {num_questions} different questions that can be answered using only this text:

    {chunk}

    Format your response as a numbered list of questions ONLY, with no additional text.
    """

    # Render the conversation with the model's chat template (as plain text)
    text = gen_tokenizer.apply_chat_template(
        conversation=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        tokenize=False,
        add_generation_prompt=True
    )

    # The rendered template already carries the special tokens it needs, so
    # they must not be added a second time when tokenizing the rendered text.
    model_inputs = gen_tokenizer(
        [text],
        return_tensors="pt",
        add_special_tokens=False
    ).to(device)

    # Output length is governed by the model's own generation config.
    generated_ids = gen_model.generate(
        **model_inputs,
        do_sample=True
    )

    # Drop the prompt tokens so only the newly generated continuation is decoded
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = gen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("===========================================")
    print(f"questions: \n{response}")
    print("===========================================")

    return _parse_question_lines(response)


def _parse_question_lines(response):
    """Return the lines of `response` that end with '?', minus any list marker."""
    questions = []
    for line in response.split('\n'):
        # Strip "1." / "1)" numbering or a "-", "*", "•" bullet followed by whitespace
        cleaned_line = re.sub(r'^(?:\d+[.)]\s*|[-*•]\s+)', '', line.strip())
        if cleaned_line and cleaned_line.endswith('?'):
            questions.append(cleaned_line)
    return questions


In [27]:
# Smoke test: request 3 questions for the third chunk and display the parsed list.
generate_questions(text_chunks[2], 3)

questions: 
1. What is the primary focus of machine learning in artificial intelligence?
2. What type of data is used to train supervised learning algorithms?
3. What is an example of a common technique used in unsupervised learning?


['What is the primary focus of machine learning in artificial intelligence?',
 'What type of data is used to train supervised learning algorithms?',
 'What is an example of a common technique used in unsupervised learning?']

In [None]:
def create_embeddings(text):
    """
    Create embeddings for text using the loaded embedding model.

    Uses the module-level `embed_tokenizer` / `embed_model` pair and moves the
    tokenized batch to the module-level `device`.

    Args:
        text: str or list of str - input text(s) to embed

    Returns:
        list[np.ndarray]: list of L2-normalized embeddings, each of shape (dim,).
        A single string still yields a one-element list; an empty list yields [].

    Raises:
        Any exception raised by the tokenizer or the model propagates to the
        caller. Returning None here would only surface later as a confusing
        "NoneType is not subscriptable" error, because callers index the result.
    """
    # Always work on a batch, even for a single string
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        return []

    # Inputs longer than 512 tokens are truncated
    inputs = embed_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    # Inference only: no gradient bookkeeping needed
    with torch.no_grad():
        output = embed_model(**inputs)
        # Use CLS token embedding [CLS] at position 0
        cls_emb = output.last_hidden_state[:, 0, :]
        # L2 normalize for cosine similarity
        emb_normalized = F.normalize(cls_emb, p=2, dim=1)

    # Convert to list of np.ndarray, each of shape (dim,)
    return [emb.cpu().numpy() for emb in emb_normalized]

In [None]:
class SimpleVectorStore:
    """
    A simple vector store implementation using NumPy.

    Items are kept in three parallel lists (vectors, texts, metadata) that
    share the same index. Search is a brute-force cosine similarity scan.
    """
    def __init__(self):
        """
        Initialize the vector store.
        """
        self.vectors = []
        self.texts = []
        self.metadata = []

    def add_item(self, text, embedding, metadata=None):
        """
        Add an item to the vector store.

        Args:
        text (str): The original text.
        embedding (array-like): The embedding vector, shaped (dim,) or (1, dim).
        metadata (dict, optional): Additional metadata. Defaults to an empty dict.
        """
        self.vectors.append(embedding)
        self.texts.append(text)
        self.metadata.append(metadata or {})

    def similarity_search(self, query_embedding, k=5):
        """
        Find the most similar items to a query embedding.

        Args:
        query_embedding (array-like): Query embedding, shaped (dim,) or (1, dim).
        k (int): Number of results to return.

        Returns:
        List[Dict]: Top k most similar items, best first. Each dict has the keys
        "text", "metadata" and "similarity"; the similarity is a plain Python
        float so it can be formatted (e.g. with ':.4f') and serialized directly.
        """
        if not self.vectors:
            return []

        # Flatten everything to 1-D so (dim,) and (1, dim) inputs behave the same
        query_vector = np.asarray(query_embedding, dtype=np.float64).reshape(-1)
        matrix = np.stack(
            [np.asarray(vector, dtype=np.float64).reshape(-1) for vector in self.vectors]
        )

        # Cosine similarity of every stored vector against the query in one pass
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
        norms[norms == 0] = 1.0  # a zero vector scores 0 instead of dividing by zero
        scores = (matrix @ query_vector) / norms

        # Stable sort keeps insertion order among ties, best score first
        order = np.argsort(-scores, kind="stable")[:max(k, 0)]

        return [
            {
                "text": self.texts[int(idx)],
                "metadata": self.metadata[int(idx)],
                "similarity": float(scores[idx])
            }
            for idx in order
        ]


In [None]:
# Build the index: every chunk is stored once as itself and once per question
# generated from it, so a query can match either the text or a question about it.
vector_store = SimpleVectorStore()

print("Processing chunks and generating questions...")
for i, chunk in enumerate(tqdm(text_chunks, desc="Processing Chunks")):
    # Create embedding for the chunk itself
    # ([0] takes the only vector of the returned list; reshape makes it a (1, dim) row)
    chunk_embedding = create_embeddings(chunk)[0].reshape(1, -1)

    # Add the chunk to the vector store
    vector_store.add_item(
        text=chunk,
        embedding=chunk_embedding,
        metadata={"type": "chunk", "index": i}
    )

    # Generate questions for this chunk
    questions = generate_questions(chunk, num_questions=5)

    # Create embeddings for each question and add to vector store
    for j, question in enumerate(questions):
        question_embedding = create_embeddings(question)[0].reshape(1, -1)

        # Add the question to the vector store
        # (the source chunk travels in the metadata so a question hit can be
        # mapped back to the text it was generated from)
        vector_store.add_item(
            text=question,
            embedding=question_embedding,
            metadata={"type": "question", "chunk_index": i, "original_chunk": chunk}
        )


Processing chunks and generating questions...


Processing Chunks:   2%|▏         | 1/42 [00:03<02:31,  3.69s/it]

questions: 
1. What is the primary ability of a digital computer or computer-controlled robot in the context of AI?
2. What intellectual processes are characteristic of humans that AI systems aim to replicate?
3. What event in 1956 is widely considered the birthplace of AI?
4. What area of focus did early AI research primarily concentrate on?
5. What period saw a rise in the use of new methods in AI research?


Processing Chunks:   5%|▍         | 2/42 [00:07<02:25,  3.63s/it]

questions: 
1. When is the Dartmouth Workshop considered the birthplace of AI?
2. What type of methods were early AI research focused on?
3. What advancements were seen in the 1990s and 2000s in AI?
4. What has recent breakthroughs in deep learning revolutionized in the field of AI?
5. What are some examples of AI systems that are increasingly prevalent in everyday life?


Processing Chunks:   7%|▋         | 3/42 [00:10<02:15,  3.48s/it]

questions: 
1. What is a key characteristic of machine learning algorithms?
2. What type of data is used to train supervised learning algorithms?
3. What is an example of supervised learning in the field of image classification?
4. What is a common technique used in unsupervised learning to group similar data points?
5. What type of data is used to train unsupervised learning algorithms?


Processing Chunks:  10%|▉         | 4/42 [00:14<02:13,  3.51s/it]

questions: 
1. What is one technique used in unsupervised machine learning to group similar data points?
2. What type of feedback does an agent in reinforcement learning receive in the form of?
3. What is the primary goal of reinforcement learning in an environment?
4. What is the main difference between deep learning and other machine learning techniques?
5. What is one area where deep learning has achieved significant breakthroughs?


Processing Chunks:  12%|█▏        | 5/42 [00:18<02:14,  3.65s/it]

questions: 
1. What is a key application of Convolutional Neural Networks (CNNs)?
2. What type of data do Recurrent Neural Networks (RNNs) process?
3. What is a key characteristic of Convolutional Neural Networks (CNNs)?
4. What is the primary focus of Natural Language Processing (NLP)?
5. What is one example of a task that Recurrent Neural Networks (RNNs) are suitable for?


Processing Chunks:  14%|█▍        | 6/42 [00:21<02:03,  3.44s/it]

questions: 
1. What is the main focus of Natural Language Processing (NLP)?
2. What are some tasks involved in Computer Vision?
3. What are some applications of AI in the healthcare industry?
4. What is one way AI-powered tools can assist in treatment in the healthcare industry?
5. What is one example of an application of Computer Vision?


Processing Chunks:  17%|█▋        | 7/42 [00:24<01:54,  3.29s/it]

questions: 
1. What are some examples of AI applications in medical diagnosis?
2. How do AI algorithms assist in financial processes?
3. What is one way AI is used in retail to improve the shopping experience?
4. What is the primary function of autonomous vehicles in transportation?
5. What is one type of AI-powered tool used in medical diagnosis?


Processing Chunks:  19%|█▉        | 8/42 [00:26<01:47,  3.15s/it]

questions: 
1. What are some uses of AI in the entertainment industry?
2. How do AI-powered systems improve the shopping experience?
3. What are some benefits of using AI in manufacturing?
4. What type of learning platforms is AI enhancing in the education sector?
5. How do AI algorithms analyze user preferences in the entertainment industry?


Processing Chunks:  21%|██▏       | 9/42 [00:30<01:47,  3.27s/it]

questions: 
1. What are some tasks that AI-powered systems can automate in cybersecurity?
2. What is a challenge in developing AI systems?
3. What is a concern about the fairness of AI systems?
4. What is a characteristic of some AI systems that makes it difficult to understand how they make decisions?
5. What is a result of AI systems inheriting biases present in the data they are trained on?


Processing Chunks:  24%|██▍       | 10/42 [00:33<01:41,  3.16s/it]

questions: 
1. What is a major issue with many AI systems?
2. What is crucial for building trust in AI systems?
3. What is a concern about AI systems relying on large amounts of data?
4. What is a challenge related to the automation capabilities of AI?
5. What is a potential issue as AI systems become more autonomous?


Processing Chunks:  26%|██▌       | 11/42 [00:36<01:35,  3.08s/it]

questions: 
1. What is crucial for AI development and deployment?
2. What are the significant concerns related to the use of AI in autonomous weapons systems?
3. What is the primary goal of Explainable AI (XAI)?
4. What is AI at the Edge?
5. What is the result of using AI at the Edge?


Processing Chunks:  29%|██▊       | 12/42 [00:39<01:36,  3.21s/it]

questions: 
1. What is the main advantage of processing data locally on devices using AI at the edge?
2. What are some areas where quantum computing has the potential to accelerate AI algorithms?
3. What is the future of human-AI collaboration likely to involve?
4. What challenges is AI being used to address in the field of healthcare?
5. What is a potential benefit of using AI at the edge?


Processing Chunks:  31%|███       | 13/42 [00:43<01:36,  3.32s/it]

questions: 
1. What are some of the social and environmental challenges that AI is being used to address?
2. What is the main goal of AI for social good initiatives?
3. What are some of the key areas of focus for regulation and governance in AI development?
4. What is essential for realizing the full potential of AI while mitigating its risks?
5. What will be important for establishing standards in AI development?


Processing Chunks:  33%|███▎      | 14/42 [00:46<01:28,  3.15s/it]

questions: 
1. What are the essential components for realizing the full potential of AI?
2. In what fields are AI-powered robots used?
3. What tasks do industrial robots perform in manufacturing?
4. What type of robots assist humans in tasks such as cleaning and healthcare?
5. What does AI enable in service robots?


Processing Chunks:  36%|███▌      | 15/42 [00:48<01:23,  3.07s/it]

questions: 
1. What tasks do service robots assist humans in?
2. What is a key benefit of AI-powered surgical robots?
3. What environments are exploration robots designed to operate in?
4. What approach to robot learning involves training robots by observing human demonstrations?
5. What type of learning allows robots to learn complex behaviors without explicit programming?


Processing Chunks:  38%|███▊      | 16/42 [00:51<01:19,  3.06s/it]

questions: 
1. What is the primary method of training robots in the approach mentioned in the text?
2. What is the purpose of reinforcement learning in robot training?
3. What is SLAM technology used for in robots?
4. What abilities does computer vision provide to robots?
5. What type of navigation is enabled by SLAM technology?


Processing Chunks:  40%|████      | 17/42 [00:55<01:16,  3.07s/it]

questions: 
1. What are some abilities of AI that are mentioned in the text?
2. What are the benefits of AI in transforming business operations?
3. How does AI enhance Customer Relationship Management (CRM) systems?
4. What are some tasks that AI-powered systems can perform in supply chain operations?
5. What is one way AI improves supply chain operations?


Processing Chunks:  43%|████▎     | 18/42 [00:58<01:14,  3.09s/it]

questions: 
1. What are three main areas where AI is used in supply chain operations?
2. What are the benefits of using AI in HR for talent acquisition?
3. How does AI enhance marketing and sales efforts?
4. What is one way AI-powered systems analyze large datasets in financial services?
5. What is one function of AI in automating financial processes?


Processing Chunks:  45%|████▌     | 19/42 [01:01<01:11,  3.12s/it]

questions: 
1. What are some areas where AI is used in the financial industry?
2. What are the potential impacts of AI on the workforce?
3. What type of tasks are more likely to be automated by AI?
4. What are reskilling and upskilling initiatives used for?
5. How do AI systems collaborate with humans in the future of work?


Processing Chunks:  48%|████▊     | 20/42 [01:04<01:09,  3.16s/it]

questions: 
1. What are some benefits of collaboration between humans and AI systems?
2. What are some new job roles created by the development and deployment of AI?
3. What are some ethical considerations in using AI in the workplace?
4. What are some areas where AI-powered systems can be used for creativity and innovation?
5. What types of original works can AI algorithms create?


Processing Chunks:  50%|█████     | 21/42 [01:07<01:06,  3.18s/it]

questions: 
1. What are some examples of original works of art created by AI algorithms?
2. How do AI-powered tools assist musicians in the creative process?
3. What are some tasks that AI-powered writing tools can assist writers with?
4. What is one way AI accelerates innovation?
5. What are some areas where AI-powered tools are used in research and development?


Processing Chunks:  52%|█████▏    | 22/42 [01:10<01:00,  3.04s/it]

questions: 
1. What are AI-powered tools used for in various industries?
2. How does AI enable personalized learning experiences?
3. What do AI-powered assessments adjust based on student performance?
4. What can AI-powered virtual tutors and learning assistants do for students?
5. What can AI-powered assessments identify in students?


Processing Chunks:  55%|█████▍    | 23/42 [01:13<00:56,  2.99s/it]

questions: 
1. What role do tutors play in the context of education?
2. What is the primary function of AI in the grading and feedback process?
3. What is the purpose of Educational Data Mining?
4. How does AI contribute to medical diagnosis and treatment?
5. What benefits do AI-powered tools provide in medical care?


Processing Chunks:  57%|█████▋    | 24/42 [01:16<00:54,  3.05s/it]

questions: 
1. What are some applications of AI in healthcare?
2. How does AI impact the time and cost of bringing new treatments to market?
3. What is a benefit of personalized medicine enabled by AI?
4. What is a result of AI-powered robotic surgery systems in complex procedures?
5. What type of data is analyzed by AI in drug discovery and development?


Processing Chunks:  60%|█████▉    | 25/42 [01:19<00:49,  2.92s/it]

questions: 
1. What benefits do AI-powered systems bring to patient outcomes?
2. How does AI improve cybersecurity?
3. What is the main function of AI-powered anomaly detection systems?
4. What is one way AI is used in fraud detection?
5. What is the result of AI streamlining healthcare administration?


Processing Chunks:  62%|██████▏   | 26/42 [01:21<00:45,  2.82s/it]

questions: 
1. What is AI used for in fraud detection?
2. What is a benefit of AI-powered vulnerability management tools?
3. What is an advantage of AI-powered systems in incident response?
4. What societal challenges can AI potentially address?
5. What is a result of AI-powered vulnerability scanning?


Processing Chunks:  64%|██████▍   | 27/42 [01:24<00:43,  2.90s/it]

questions: 
1. What are some societal challenges that AI has the potential to address?
2. What are some key benefits of AI-powered solutions in addressing societal challenges?
3. What are the main goals of AI for social good initiatives?
4. What are some ethical considerations for ensuring AI's positive social impact?
5. What is necessary for building public trust in AI?


Processing Chunks:  67%|██████▋   | 28/42 [01:27<00:40,  2.89s/it]

questions: 
1. What is essential for the widespread adoption and positive social impact of AI?
2. What does global collaboration in AI involve?
3. What are the benefits of AI in urban planning and management?
4. How do AI-powered smart transportation systems use data?
5. What is one way AI enhances quality of life in cities?


Processing Chunks:  69%|██████▉   | 29/42 [01:31<00:39,  3.00s/it]

questions: 
1. What are the main goals of AI-powered smart transportation systems?
2. How do AI-powered systems enhance grid stability in smart cities?
3. What is one way AI-powered systems improve public safety and security in smart cities?
4. What are AI-powered environmental monitoring systems able to track?
5. What types of energy sources are supported by AI-powered energy management systems?


Processing Chunks:  71%|███████▏  | 30/42 [01:34<00:35,  2.99s/it]

questions: 
1. What is the primary function of AI-powered environmental monitoring systems?
2. What are the goals of research in Explainable AI (XAI)?
3. What is the focus of research in deep learning?
4. What is the intersection of AI being explored in research?
5. What is the outcome of AI-powered environmental monitoring systems?


Processing Chunks:  74%|███████▍  | 31/42 [01:37<00:33,  3.01s/it]

questions: 
1. What area of research combines understanding the human brain with AI development?
2. What is the purpose of human-centered AI?
3. What type of AI models are capable of creating original content, including images, text, and music?
4. What is one method for verifying AI behavior?
5. What is a benefit of human-centered AI?


Processing Chunks:  76%|███████▌  | 32/42 [01:40<00:30,  3.01s/it]

questions: 
1. What types of content can generative AI models create?
2. What AI models are mentioned as capable of creating original content?
3. How is AI used in the music and sound design industry?
4. What tasks can AI tools assist with in the creative process?
5. What type of experiences can AI-powered tools create in music?


Processing Chunks:  79%|███████▊  | 33/42 [01:43<00:28,  3.13s/it]

questions: 
1. What are some tasks that AI-powered tools assist with in the design process?
2. What type of media is enhanced by AI-powered characters, dynamic environments, and personalized content?
3. How does AI help with climate change mitigation?
4. What are some areas where AI is used in the field of agriculture?
5. What are some benefits of using AI in visual arts and design?


Processing Chunks:  81%|████████  | 34/42 [01:46<00:24,  3.05s/it]

questions: 
1. What are some of the applications of AI in environmental conservation?
2. How do AI-powered systems aid in precision agriculture?
3. What data sources are used in AI-powered wildlife conservation?
4. What is one way AI enhances disaster response efforts?
5. What is the primary function of AI-powered environmental monitoring systems?


Processing Chunks:  83%|████████▎ | 35/42 [01:49<00:20,  3.00s/it]

questions: 
1. What are the main functions of AI systems in disaster response?
2. What are the key considerations in developing AI strategies and policy frameworks?
3. What are some of the issues governments are considering when regulating AI?
4. How do AI systems enhance disaster response efforts?
5. What is a key challenge in regulating AI?


Processing Chunks:  86%|████████▌ | 36/42 [01:52<00:17,  2.92s/it]

questions: 
1. What is a key challenge in AI development?
2. Who plays a crucial role in funding AI research and development?
3. What is the purpose of international cooperation in AI development?
4. What is a crucial step for building trust in AI development?
5. What drives innovation in AI research and development?


Processing Chunks:  88%|████████▊ | 37/42 [01:55<00:14,  2.99s/it]

questions: 
1. What are the key principles of Ethical AI?
2. What can AI systems inherit from the data they are trained on?
3. What is the purpose of Explainable AI (XAI) techniques?
4. What are the essential factors for addressing bias in AI systems?
5. What is the primary goal of transparency and explainability in AI systems?


Processing Chunks:  90%|█████████ | 38/42 [01:58<00:11,  2.97s/it]

questions: 
1. What is the main goal of Explainable AI (XAI) techniques?
2. What are the concerns related to AI systems?
3. What are two essential aspects for ensuring responsible data handling in AI?
4. Who are the key stakeholders responsible for AI systems?
5. What is crucial for building trust in AI systems?


Processing Chunks:  93%|█████████▎| 39/42 [02:01<00:08,  2.98s/it]

questions: 
1. What is essential for building trust in AI systems?
2. What is the purpose of testing and validating AI models?
3. What allows users to enhance trust in AI systems?
4. What is crucial for building trust in AI systems during the design and development phase?
5. What allows users to customize AI settings and understand their data usage?


Processing Chunks:  95%|█████████▌| 40/42 [02:03<00:05,  2.93s/it]

questions: 
1. What are the key components of building trust in AI?
2. What is the purpose of public engagement and education in AI?
3. What are the essential elements for advancing AI capabilities?
4. What is the goal of responsible development and deployment of AI?
5. What is necessary for addressing the challenges of AI?


Processing Chunks:  98%|█████████▊| 41/42 [02:07<00:03,  3.02s/it]

questions: 
1. What are the key principles involved in responsible AI development and deployment?
2. Why is global collaboration and cooperation important for addressing AI-related challenges?
3. What is required to prepare the workforce for the future of AI?
4. What type of education is promoted to equip individuals with AI-related skills?
5. What approach focuses on developing AI systems with human values in mind?


Processing Chunks: 100%|██████████| 42/42 [02:10<00:00,  3.11s/it]

questions: 
1. What is a key goal of a human-centered approach to AI?
2. What are the primary considerations when developing AI systems using a human-centered approach?
3. What are the benefits of embracing a human-centered approach to AI development?
4. What is the primary focus of a human-centered approach to AI development?
5. What is required for the path forward in AI development and deployment?





In [None]:
def semantic_search(query, vector_store, k=5):
    """
    Embed a query and look up its nearest neighbours in a vector store.

    Args:
    query (str): The search query.
    vector_store (SimpleVectorStore): Store whose `similarity_search` is called.
    k (int): Number of results to return.

    Returns:
    List[Dict]: Whatever `vector_store.similarity_search` returns for the top k items.
    """
    # create_embeddings returns a list; the query is its first (only) entry
    first_vector = create_embeddings(query)[0]

    # The store is queried with a (1, dim) row vector
    return vector_store.similarity_search(first_vector.reshape(1, -1), k=k)


In [None]:
# Load the validation data from a JSON file
# (explicit UTF-8: the default encoding of open() depends on the platform locale)
with open('data/val.json', encoding='utf-8') as f:
    data = json.load(f)

# Extract the first query from the validation data
query = data[0]['question']

# Perform semantic search to find relevant content
search_results = semantic_search(query, vector_store, k=5)

print("Query:", query)
print("\nSearch Results:")

# Organize results by type
chunk_results = []
question_results = []

for result in search_results:
    if result["metadata"]["type"] == "chunk":
        chunk_results.append(result)
    else:
        question_results.append(result)

# Print chunk results first (only the first 300 characters of each chunk)
print("\nRelevant Document Chunks:")
for i, result in enumerate(chunk_results):
    print(f"Context {i + 1} (similarity: {result['similarity']:.4f}):")
    print(result["text"][:300] + "...")
    print("=====================================")

# Then print question matches, with the index of the chunk each came from
print("\nMatched Questions:")
for i, result in enumerate(question_results):
    print(f"Question {i + 1} (similarity: {result['similarity']:.4f}):")
    print(result["text"])
    chunk_idx = result["metadata"]["chunk_index"]
    print(f"From chunk {chunk_idx}")
    print("=====================================")

Query: What is 'Explainable AI' and why is it considered important?

Search Results:

Relevant Document Chunks:

Matched Questions:
Question 1 (similarity: 0.9364):
What is the purpose of Explainable AI (XAI) techniques?
From chunk 36
Question 2 (similarity: 0.9232):
What is the primary goal of Explainable AI (XAI)?
From chunk 10
Question 3 (similarity: 0.9223):
What is the main goal of Explainable AI (XAI) techniques?
From chunk 37
Question 4 (similarity: 0.9209):
What is the primary goal of transparency and explainability in AI systems?
From chunk 36
Question 5 (similarity: 0.9116):
What are the goals of research in Explainable AI (XAI)?
From chunk 29


In [47]:
def prepare_context(search_results):
    """
    Build one context string from search hits, listing each source chunk once.

    Direct chunk hits come first, in result order. Question hits follow and
    contribute the chunk they were generated from, unless that chunk index
    has already been included.

    Args:
    search_results (List[Dict]): Results from semantic search.

    Returns:
    str: Context sections joined by blank lines.
    """
    seen_indices = set()
    sections = []

    direct_hits = [hit for hit in search_results if hit["metadata"]["type"] == "chunk"]
    question_hits = [hit for hit in search_results if hit["metadata"]["type"] == "question"]

    # Chunks that matched the query themselves
    for hit in direct_hits:
        meta = hit["metadata"]
        seen_indices.add(meta["index"])
        sections.append(f"Chunk {meta['index']}:\n{hit['text']}")

    # Chunks reached through one of their generated questions
    for hit in question_hits:
        meta = hit["metadata"]
        source_idx = meta["chunk_index"]
        if source_idx in seen_indices:
            continue
        seen_indices.add(source_idx)
        sections.append(
            f"Chunk {source_idx} (referenced by question '{hit['text']}'):\n{meta['original_chunk']}"
        )

    return "\n\n".join(sections)


In [None]:
# use api
# Read settings from conf.env into the process environment, then build the
# OpenAI client from GEMINI_BASE_URL / GEMINI_API_KEY. os.getenv returns None
# for a variable that is not set, so both must be present in conf.env or the
# environment.
from dotenv  import load_dotenv
load_dotenv("conf.env")

client = OpenAI(
    base_url = os.getenv("GEMINI_BASE_URL"),
    api_key = os.getenv("GEMINI_API_KEY")
)

In [49]:
def generate_response(query, context, model="gemini-2.5-flash"):
    """
    Asks the chat model to answer a query using only the supplied context.

    Args:
    query (str): User's question.
    context (str): Retrieved context text the answer must be grounded in.
    model (str): Name of the chat model to call.

    Returns:
    str: Message content of the first choice returned by the model.
    """
    # The question and its context travel together in the user turn
    grounded_question = f"""
        Context:
        {context}

        Question: {query}

        Please answer the question based only on the context provided above. Be concise and accurate.
    """

    # The system turn restricts the model to the supplied context
    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'",
        },
        {"role": "user", "content": grounded_question},
    ]

    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=messages,
    )
    return completion.choices[0].message.content


In [50]:
# Build the grounding context from the retrieved results, then ask the model
context = prepare_context(search_results)
response_text = generate_response(query, context)

# Show the question alongside the generated answer
print(f"\nQuery: {query}")
print("\nResponse:", response_text, sep="\n")


Query: What is 'Explainable AI' and why is it considered important?

Response:
Explainable AI (XAI) aims to make AI systems and their decisions more transparent and understandable, providing insights into how AI models make decisions. It is considered important for building trust in AI systems, enabling users to assess their fairness, accuracy, and reliability, and enhancing accountability.


In [53]:
def evaluate_response(query, response, reference_answer, model="gemini-2.5-flash"):
    """
    Asks the chat model to grade an AI response against a reference answer.

    Args:
    query (str): The user's question.
    response (str): The AI-generated response being graded.
    reference_answer (str): The reference/ideal answer to compare with.
    model (str): Name of the chat model used as the grader.

    Returns:
    str: Message content of the first choice returned by the grader
        (the prompt asks for a score with justification).
    """
    # Grading rubric, sent as the system turn
    rubric = """You are an intelligent evaluation system tasked with assessing AI responses.

        Compare the AI assistant's response to the true/reference answer, and evaluate based on:
        1. Factual correctness - Does the response contain accurate information?
        2. Completeness - Does it cover all important aspects from the reference?
        3. Relevance - Does it directly address the question?

        Assign a score from 0 to 1:
        - 1.0: Perfect match in content and meaning
        - 0.8: Very good, with minor omissions/differences
        - 0.6: Good, covers main points but misses some details
        - 0.4: Partial answer with significant omissions
        - 0.2: Minimal relevant information
        - 0.0: Incorrect or irrelevant

        Provide your score with justification.
    """

    # The concrete case to grade, sent as the user turn
    case = f"""
        User Query: {query}

        AI Response:
        {response}

        Reference Answer:
        {reference_answer}

        Please evaluate the AI response against the reference answer.
    """

    messages = [
        {"role": "system", "content": rubric},
        {"role": "user", "content": case},
    ]

    verdict = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=messages,
    )
    return verdict.choices[0].message.content


In [54]:
# Grade the generated answer against the ideal answer of the first validation item
reference_answer = data[0]['ideal_answer']
evaluation = evaluate_response(query, response_text, reference_answer)

print("\nEvaluation:", evaluation, sep="\n")


Evaluation:
Score: 1.0

Justification: The AI response perfectly captures the definition of Explainable AI (XAI) and accurately lists the key reasons for its importance (transparency, understanding, trust, fairness, accountability), aligning almost identically with the reference answer. The slight expansion on "enabling users to assess their fairness, accuracy, and reliability" is a valid and relevant elaboration that enhances the answer without introducing any inaccuracies or deviating from the core message of the reference.


In [None]:
# Embedding-based sanity check: cosine similarity between the generated answer
# and the ideal answer of the first validation item.
# The generated answer is stored in `response_text` (assigned where
# generate_response is called); `ai_response_text` is not assigned in the
# visible notebook flow, so relying on it breaks a fresh Restart & Run All.
# cosine_similarity expects 2-D inputs, hence reshape(1, -1) on each embedding.
# NOTE(review): assumes create_embeddings returns an indexable batch of vectors — confirm.
ai_res = create_embeddings(response_text)[0].reshape(1, -1)
ideal_res = create_embeddings(data[0]['ideal_answer'])[0].reshape(1, -1)

print(cosine_similarity(ai_res, ideal_res))

[[0.99685514]]
