### NOTE!!!

This notebook uses open-source embedding and generative models, requiring atleast 8GB VRAM, because of cost. Consider using Colab if your GPU can't handle it.

In [11]:
!pip install PymuPDF
!pip install python-dotenv
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [1]:
import os
import fitz
from dotenv import load_dotenv
import numpy as np
import json

from openai import OpenAI
from tqdm import tqdm

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def extract_text_from_pdf(pdf_path):
    pdf = fitz.open(pdf_path)
    all_text = ""

    for page in pdf:
        all_text += page.get_text("text")

    return all_text

extracted_text = extract_text_from_pdf("AI_Information.pdf")
extracted_text

'Understanding Artificial Intelligence \nChapter 1: Introduction to Artificial Intelligence \nArtificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot \nto perform tasks commonly associated with intelligent beings. The term is frequently applied to \nthe project of developing systems endowed with the intellectual processes characteristic of \nhumans, such as the ability to reason, discover meaning, generalize, or learn from past \nexperience. Over the past few decades, advancements in computing power and data availability \nhave significantly accelerated the development and deployment of AI. \nHistorical Context \nThe idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. \nHowever, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop \nin 1956 is widely considered the birthplace of AI. Early AI research focused on problem-solving \nand symbolic methods. The 1980s saw 

In [8]:
device = "cuda"
gen_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen1.5-1.8B-Chat",
    torch_dtype="auto",
    device_map="auto"
    )
gen_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Qwen/Qwen1.5-1.8B-Chat"
    )
embed_model = AutoModel.from_pretrained("BAAI/bge-base-en")
embed_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en")

In [22]:
def generate_chunk_header(chunk, model="Qwen/Qwen1.5-1.8B-Chat"):
    """
    Generates a title/header for a given text chunk using an LLM.

    Args:
    chunk (str): The text chunk to summarize as a header.
    model (str): The model to be used for generating the header. Default is Qwen/Qwen1.5-1.8B-Chat.

    Returns:
    str: Generated header/title.
    """
    # Define the system prompt to guide the AI's behavior
    system_prompt = """You are a content summarization expert. Your task is to generate an optimal title that encapsulates the essence of the provided text while maintaining clarity and relevance. Requirements:
- Length: 5-12 words
- Style: Clear, professional, searchable
- Content: Must represent the core message accurately
- Mandatory Format: Respond with title only, no prefix, suffix, or additional text"""

    # Generate a response from the AI model based on the system prompt and text chunk
    text = gen_tokenizer.apply_chat_template(
        conversation=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": chunk}
        ],
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = gen_tokenizer([text], return_tensors="pt").to(device)

    generated_ids = gen_model.generate(
        **model_inputs,
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True
    )

    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = gen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("===========================================")
    print(f"header: {response}")
    print("===========================================")

    # Return the generated header/title, stripping any leading/trailing whitespace
    return response.strip()


In [28]:
generate_chunk_header(extracted_text[:500])

header: "Mastering AI: A Comprehensive Guide to Intelligent Computing"


'"Mastering AI: A Comprehensive Guide to Intelligent Computing"'

In [29]:
def chunk_text_with_headers(text, n, overlap):
    """
    Chunks text into smaller segments and generates headers.

    Args:
    text (str): The full text to be chunked.
    n (int): The chunk size in characters.
    overlap (int): Overlapping characters between chunks.

    Returns:
    List[dict]: A list of dictionaries with 'header' and 'text' keys.
    """
    chunks = []  # Initialize an empty list to store chunks

    # Iterate through the text with the specified chunk size and overlap
    for i in range(0, len(text), n - overlap):
        chunk = text[i:i + n]  # Extract a chunk of text
        print(chunk)
        header = generate_chunk_header(chunk)  # Generate a header for the chunk using LLM
        chunks.append({"header": header, "text": chunk})  # Append the header and chunk to the list

    return chunks  # Return the list of chunks with headers


In [30]:
text_chunks = chunk_text_with_headers(extracted_text, 1000, 200)

# Print a sample chunk with its generated header
print("Sample Chunk:")
print("Header:", text_chunks[0]['header'])
print("Content:", text_chunks[0]['text'])

Understanding Artificial Intelligence 
Chapter 1: Introduction to Artificial Intelligence 
Artificial intelligence (AI) refers to the ability of a digital computer or computer-controlled robot 
to perform tasks commonly associated with intelligent beings. The term is frequently applied to 
the project of developing systems endowed with the intellectual processes characteristic of 
humans, such as the ability to reason, discover meaning, generalize, or learn from past 
experience. Over the past few decades, advancements in computing power and data availability 
have significantly accelerated the development and deployment of AI. 
Historical Context 
The idea of artificial intelligence has existed for centuries, often depicted in myths and fiction. 
However, the formal field of AI research began in the mid-20th century. The Dartmouth Workshop 
in 1956 is widely considered the birthplace of AI. Early AI research focused on problem-solving 
and symbolic methods. The 1980s saw a rise in exp

In [41]:
[text_chunks[i]['header'] for i in range(len(text_chunks))]

['"Introduction to Artificial Intelligence: A Historical Overview and Modern Applications"',
 '"Revolutionizing AI: A Journey from Symbolic Methods to Deep Learning"',
 '"AI\'s Ethical Implications: Bias, Job Displacement, Supervised & Unsupervised Learning"',
 '"Discovering Patterns and Structures in Unlabeled Data with Reinforcement Learning and Deep Learning Techniques"',
 '"Revolutionizing Brain Structure and Function: Deep Learning\'s Impact on Image Recognition, Natural Language Processing, and Speech Recognition"',
 '"Revolutionizing Healthcare with Artificial Intelligence: Applications and Impact"',
 '"Revolutionizing Industries with AI: Applications in Medical Diagnosis, Drug Discovery, Finance, Transportation, and Manufacturing"',
 '"Revolutionizing Customer Experience: AI-Powered Systems in Storing, Manufacturing, Education, Entertainment, and Cybersecurity"',
 '"Unlocking the Black Box: Ethical and Societal Implications of AI\'s Impact on Bias, Fairness, Transparency, and E

In [42]:
embed_model.to(device)
def embed(text):
    if isinstance(text, str): text = [text] # if single string => convert into a list of one element
    inputs = embed_tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device) # tokenize input
    with torch.no_grad():
        output = embed_model(**inputs) # running model
        embedding = F.normalize(output.last_hidden_state[:, 0, :], p=2, dim=1) # normalize vector to L2
    return embedding.cpu().numpy() # pass to cpu with numpy array (n, dim)

In [43]:
# Generate embeddings for each chunk
embeddings = []  # Initialize an empty list to store embeddings

# Iterate through each text chunk with a progress bar
for chunk in tqdm(text_chunks, desc="Generating embeddings"):
    # Create an embedding for the chunk's text
    text_embedding = embed(chunk["text"])
    # Create an embedding for the chunk's header
    header_embedding = embed(chunk["header"])
    # Append the chunk's header, text, and their embeddings to the list
    embeddings.append({"header": chunk["header"], "text": chunk["text"], "embedding": text_embedding, "header_embedding": header_embedding})

Generating embeddings: 100%|██████████| 42/42 [00:02<00:00, 17.70it/s]


In [56]:
def semantic_search(query, chunks, k=5):
    """
    Searches for the most relevant chunks based on a query.

    Args:
    query (str): User query.
    chunks (List[dict]): List of text chunks with embeddings.
    k (int): Number of top results.

    Returns:
    List[dict]: Top-k most relevant chunks.
    """
    # Create an embedding for the query
    query_embedding = embed(query) # (1, dim)
    similarities = []  # Initialize an empty list to store similarities
    # similarities = [(chunk, ((cosine_similarity(query_embedding, chunk["embedding"].reshape(1, -1))) + (cosine_similarity(query_embedding, chunk["header_embedding"].reshape(1, -1))))/2)
    #                 for chunk in chunks]  # Initialize a list to store similarity scores

    # Iterate through each chunk to calculate similarity scores
    for chunk in chunks:
        # Compute cosine similarity between query embedding and chunk text embedding
        sim_text = cosine_similarity(query_embedding, chunk["embedding"].reshape(1, -1))
        # Compute cosine similarity between query embedding and chunk header embedding
        sim_header = cosine_similarity(query_embedding, chunk["header_embedding"].reshape(1, -1))
        # Calculate the average similarity score
        avg_similarity = (sim_text + sim_header) / 2
        # Append the chunk and its average similarity score to the list
        similarities.append((chunk, avg_similarity))

    # Sort the chunks based on similarity scores in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    # Return the top-k most relevant chunks
    return [x[0] for x in similarities[:k]]


In [57]:
# Load validation data
with open('val.json') as f:
    data = json.load(f)

query = data[0]['question']

# Retrieve the top 2 most relevant text chunks
top_chunks = semantic_search(query, embeddings, k=2)

# Print the results
print("Query:", query)
for i, chunk in enumerate(top_chunks):
    print(f"Header {i+1}: {chunk['header']}")
    print(f"Content:\n{chunk['text']}\n")

Query: What is 'Explainable AI' and why is it considered important?
Header 1: "Building Trust in Explainable AI Techniques: A Comprehensive Guide"
Content:
systems. Explainable AI (XAI) 
techniques aim to make AI decisions more understandable, enabling users to assess their 
fairness and accuracy. 
Privacy and Data Protection 
AI systems often rely on large amounts of data, raising concerns about privacy and data 
protection. Ensuring responsible data handling, implementing privacy-preserving techniques, 
and complying with data protection regulations are crucial. 
Accountability and Responsibility 
Establishing accountability and responsibility for AI systems is essential for addressing potential 
harms and ensuring ethical behavior. This includes defining roles and responsibilities for 
developers, deployers, and users of AI systems. 
Chapter 20: Building Trust in AI 
Transparency and Explainability 
Transparency and explainability are key to building trust in AI. Making AI systems u

In [None]:
# Define the system prompt for the AI assistant
client = OpenAI(
    api_key=os.getenv("GEMINI_API_KEY"),
    base_url=os.getenv("GEMINI_BASE_URL"),
)
system_prompt = "You are an AI assistant that strictly answers based on the given context. If the answer cannot be derived directly from the provided context, respond with: 'I do not have enough information to answer that.'"

def generate_response(system_prompt, user_message, model=os.getenv("GEMINI_GEN_MODEL")):
    """
    Generates a response from the AI model based on the system prompt and user message.

    Args:
    system_prompt (str): The system prompt to guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): The model to be used for generating the response. Default is "meta-llama/Llama-2-7B-chat-hf".

    Returns:
    dict: The response from the AI model.
    """
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
    )
    return response

# Create the user prompt based on the top chunks
user_prompt = "\n".join([f"Header: {chunk['header']}\nContent:\n{chunk['text']}" for chunk in top_chunks])
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Generate AI response
ai_response = generate_response(system_prompt, user_prompt)

In [62]:
ai_response_text = ai_response.choices[0].message.content
ai_response_text

'Explainable AI (XAI) techniques aim to make AI decisions more understandable, enabling users to assess their fairness and accuracy. It is considered important because making AI systems understandable and providing insights into their decision-making processes helps users assess their reliability and fairness.'

In [60]:
# Define evaluation system prompt
evaluate_system_prompt = """You are an intelligent evaluation system.
Assess the AI assistant's response based on the provided context.
- Assign a score of 1 if the response is very close to the true answer.
- Assign a score of 0.5 if the response is partially correct.
- Assign a score of 0 if the response is incorrect.
Return only the score (0, 0.5, or 1)."""

# Extract the ground truth answer from validation data
true_answer = data[0]['ideal_answer']

# Construct evaluation prompt
evaluation_prompt = f"""
User Query: {query}
AI Response: {ai_response}
True Answer: {true_answer}
{evaluate_system_prompt}
"""

# Generate evaluation score
evaluation_response = generate_response(evaluate_system_prompt, evaluation_prompt)

# Print the evaluation score
print("Evaluation Score:", evaluation_response.choices[0].message.content)

Evaluation Score: 1


In [61]:
ai_res = embed(ai_response_text)
ideal_res = embed(data[0]['ideal_answer'])

print(cosine_similarity(ai_res, ideal_res))

[[0.97825384]]
