# SimpleRAG with OpenAI and Python

In [2]:
import pdfplumber
import re
import os
import numpy as np
import json
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()


True

In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts the text of every page of a PDF file and returns it as one string.

    Page texts are concatenated in page order; no separator is inserted
    between pages.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Extracted text from the PDF. Pages for which no text is returned
    contribute nothing to the result.
    """
    # The context manager closes the underlying file even if extraction
    # fails part-way through (the handle was previously never closed).
    with pdfplumber.open(pdf_path) as pdf:
        # Guard against a page yielding None/empty text (e.g. a page without
        # a text layer), which would otherwise break string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    return "".join(page_texts)

In [4]:
def chunk_text(text, n, overlap):
    """
    Chunks the given text into segments of n characters with overlap.

    Consecutive chunks start (n - overlap) characters apart, so the last
    `overlap` characters of one chunk are repeated at the start of the next.
    The final chunk may be shorter than n.

    Args:
    text (str): The text to be chunked.
    n (int): The number of characters in each chunk. Must be positive.
    overlap (int): The number of overlapping characters between chunks.
        Must satisfy 0 <= overlap < n.

    Returns:
    List[str]: A list of text chunks (empty if `text` is empty).

    Raises:
    ValueError: If n is not positive, or overlap is negative or not
        smaller than n.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    # overlap >= n would make the step zero or negative: range() either raises
    # an opaque error or yields nothing, silently returning no chunks.
    # A negative overlap would leave gaps of text that appear in no chunk.
    if overlap < 0 or overlap >= n:
        raise ValueError(f"overlap must satisfy 0 <= overlap < n, got overlap={overlap}, n={n}")

    step = n - overlap
    return [text[start:start + n] for start in range(0, len(text), step)]

In [5]:
# Build the OpenAI client; both settings are read from environment variables
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # API key (None if the variable is unset)
    base_url=os.environ.get("OPENAI_API_BASE"),  # Base URL (None if the variable is unset)
)

In [None]:
# Define the path to the PDF file
pdf_path = r"C:\DataScience - LLM\Story_of_a_Lifetime.pdf"  # Path to the PDF file

# Check if the PDF file exists, so a wrong path fails with a clear message
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

# Extract text from the PDF file
extracted_text = extract_text_from_pdf(pdf_path)

# Chunk the extracted text into segments of 1000 characters with an overlap of 200 characters
text_chunks = chunk_text(extracted_text, 1000, 200)

# Print the number of text chunks created
print("Number of text chunks:", len(text_chunks))

# Print the first text chunk (guarded: a PDF with no extractable text yields no chunks)
print("\nFirst text chunk:")
if text_chunks:
    print(text_chunks[0])
else:
    print("(no text could be extracted from the PDF)")

Number of text chunks: 5

First text chunk:
Chapter 1
It is said that “In the land of Lama don’t be Gama”, when I heard the saying for
the 1st time, one place came to my mind is that of Leh Ladakh. Lush Green
Meadows, barren mountains, blueish lakes from Fables, Gompas dotting the
landscape for miles. For whole year COVID-19 swept across the globe wiping away
almost half the population, millions jobless and also similar holocaust engulfed
India as well. I work at a reputed MNC as HR and settled in Kolkata with my wife
and Dad. As COVID swept across India our company announced Work from Home
there by retaining our most of our JOBs and employees, people at IT were really
blessed to be a part of this, most of us had our jobs!! Like others we were also
scared to go out of the house amidst the crisis. Until we got ourselves vaccinated
with 2nd Dose, till that we literally locked ourselves in closed doors as it was
recommended by Govt. Myself including my family started feeling claustrophobi

In [10]:
def create_ollama_embeddings(text, model="llama3.2", timeout=120):
    """
    Creates embeddings for the given text using Ollama's local models.

    One HTTP request per input text is sent to the embeddings endpoint at
    http://localhost:11434. The returned list is positionally aligned with
    the input: element i is the embedding of input text i. Because callers
    map embedding positions back to text chunks, a failed request raises
    instead of being skipped (skipping would shift every later embedding
    onto the wrong chunk).

    Args:
    text (str or List[str]): The input text(s) for which embeddings are to be created.
    model (str): The Ollama model to use. Default is "llama3.2".
    timeout (float): Seconds to wait for each HTTP request. Default is 120.

    Returns:
    list: List of embeddings for the input text(s), one per input, in input order.

    Raises:
    RuntimeError: If a request cannot be completed or the API answers with
        a status code other than 200.
    """
    # Kept as a function-local import, as in the original cell.
    import requests

    # Convert single string to list for consistent processing
    if isinstance(text, str):
        text = [text]

    embeddings = []

    # Process each text chunk
    for position, chunk in enumerate(text):
        try:
            response = requests.post(
                "http://localhost:11434/api/embeddings",
                json={"model": model, "prompt": chunk},
                timeout=timeout,  # without a timeout a stalled server blocks forever
            )
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"Error making request for text #{position}: {e}") from e

        if response.status_code != 200:
            raise RuntimeError(
                f"Error: API returned status code {response.status_code} for text #{position}"
            )

        embeddings.append(response.json()['embedding'])

    return embeddings

# Example usage with a different model:
# response = create_ollama_embeddings(text_chunks, model="llama2")

response = create_ollama_embeddings(text_chunks, model="llama3.2")

# semantic_search maps embedding positions back to text_chunks positions,
# so there must be exactly one embedding per chunk.
assert len(response) == len(text_chunks), (
    f"Expected {len(text_chunks)} embeddings, got {len(response)}"
)

In [28]:
# Convert the embeddings to a NumPy array for easier manipulation.
# np.asarray leaves an existing array as it is, so re-running this cell is harmless.
response = np.asarray(response)
# Print the type of the response object
print(type(response))

<class 'numpy.ndarray'>


In [27]:
def cosine_similarity(vec1, vec2):
    """
    Calculates the cosine similarity between two vectors.

    Args:
    vec1 (np.ndarray): The first vector.
    vec2 (np.ndarray): The second vector.

    Returns:
    float: The cosine similarity between the two vectors, or 0.0 when
    either vector has zero length (the similarity is undefined there and
    would otherwise come out as NaN).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-norm vector would cause a 0/0 division; report "no similarity" instead.
    if denominator == 0:
        return 0.0

    # Dot product of the two vectors divided by the product of their norms
    return np.dot(vec1, vec2) / denominator

In [29]:
def semantic_search(query, text_chunks, embeddings, k=5, model="llama3.2"):
    """
    Performs semantic search on the text chunks using the given query and embeddings.

    The query is embedded with `create_ollama_embeddings`, compared against
    every chunk embedding with `cosine_similarity`, and the chunks with the
    highest scores are returned, best match first.

    Args:
    query (str): The query for the semantic search.
    text_chunks (List[str]): A list of text chunks to search through.
    embeddings (List[List[float]]): A list of embeddings for the text chunks;
        embeddings[i] must belong to text_chunks[i].
    k (int): The number of top relevant text chunks to return. Default is 5.
    model (str): The model used to embed the query. It should be the model
        the chunk embeddings were created with. Default is "llama3.2".

    Returns:
    List[str]: A list of the top k most relevant text chunks based on the query
    (fewer if there are fewer than k chunks; empty if k <= 0).

    Raises:
    ValueError: If text_chunks and embeddings differ in length.
    RuntimeError: If no embedding could be created for the query.
    """
    # Positions in `embeddings` are mapped back onto `text_chunks`, so a
    # length mismatch would return the wrong text or fail with an IndexError.
    if len(text_chunks) != len(embeddings):
        raise ValueError(
            f"text_chunks and embeddings must have the same length, "
            f"got {len(text_chunks)} and {len(embeddings)}"
        )
    if k <= 0 or len(text_chunks) == 0:
        return []

    # Create an embedding for the query
    query_embeddings = create_ollama_embeddings(query, model=model)
    if len(query_embeddings) == 0:
        raise RuntimeError("Could not create an embedding for the query")
    # Converted once here instead of once per chunk inside the loop
    query_vector = np.asarray(query_embeddings[0])

    # Similarity between the query and each chunk embedding, in chunk order
    scores = [
        cosine_similarity(query_vector, np.asarray(chunk_embedding))
        for chunk_embedding in embeddings
    ]

    # Chunk indices ordered by descending similarity (stable for equal scores)
    ranked_indices = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Return the top k most relevant text chunks
    return [text_chunks[index] for index in ranked_indices[:k]]


In [36]:
# Load the validation data from a JSON file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('val.json', encoding='utf-8') as f:
    data = json.load(f)

# The first question of the validation data can be used as the query instead:
# query = data[0]['question']
query = "What is the author talking about the place?"

In [37]:
# `response` holds the chunk embeddings computed earlier; ask for the 2 best matches
top_chunks = semantic_search(query, text_chunks, embeddings=response, k=2)

# Show the query that was searched for
print("Query:", query)

# Show each retrieved chunk, numbered from 1 and followed by a separator line
for i, chunk in enumerate(top_chunks):
    print(
        f"Context {i + 1}:\n{chunk}\n====================================="
    )

Query: What is the author talking about the place?
Context 1:
from the Siachen
Glacier in the Karakoram range to the north to the main Great Himalayas to the
south. The eastern end, consisting of the uninhabited Aksai Chin plains, is claimedby the Indian Government as part of Ladakh, and has been under Chinese control
since 1962.
Context 2:
Chapter 1
It is said that “In the land of Lama don’t be Gama”, when I heard the saying for
the 1st time, one place came to my mind is that of Leh Ladakh. Lush Green
Meadows, barren mountains, blueish lakes from Fables, Gompas dotting the
landscape for miles. For whole year COVID-19 swept across the globe wiping away
almost half the population, millions jobless and also similar holocaust engulfed
India as well. I work at a reputed MNC as HR and settled in Kolkata with my wife
and Dad. As COVID swept across India our company announced Work from Home
there by retaining our most of our JOBs and employees, people at IT were really
blessed to be a part of

In [38]:
# System prompt: restrict the assistant to the supplied context and give it
# a fixed fallback sentence for questions the context cannot answer
system_prompt = (
    "You are an AI assistant that strictly answers based on the given context. "
    "If the answer cannot be derived directly from the provided context, "
    "respond with: 'I do not have enough information to answer that.'"
)

def generate_response(system_prompt, user_message, model="llama3.2"):
    """
    Sends a system prompt and a user message to the chat model and returns its reply.

    The request is made through the module-level `client` with temperature 0.

    Args:
    system_prompt (str): Instructions that guide the AI's behavior.
    user_message (str): The user's message or query.
    model (str): Name of the model that generates the response. Default is "llama3.2".

    Returns:
    The object returned by `client.chat.completions.create`, unmodified.
    """
    # Two-turn conversation: the system instructions first, then the user's message
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]
    return client.chat.completions.create(
        model=model,
        temperature=0,
        messages=conversation,
    )

# Retrieve the 2 best-matching chunks for the query (same search as in the previous cell)
top_chunks = semantic_search(query, text_chunks, embeddings=response, k=2)

# Assemble the user prompt: one numbered, separator-terminated section per chunk ...
user_prompt = "\n".join(
    f"Context {i + 1}:\n{chunk}\n=====================================\n"
    for i, chunk in enumerate(top_chunks)
)
# ... followed by the question itself
user_prompt = f"{user_prompt}\nQuestion: {query}"

# Ask the model to answer from the assembled context
ai_response = generate_response(system_prompt=system_prompt, user_message=user_prompt)

In [None]:
# Print the AI's response.
# The message content is optional in the OpenAI SDK's types, so fall back to
# an empty string before stripping instead of silencing the type checker.
print("AI Response:", (ai_response.choices[0].message.content or "").strip())

AI Response: I do not have enough information to answer that. The provided text does not mention a book, but rather two separate contexts that appear to be unrelated. Context 1 discusses the geographical location of Ladakh, while Context 2 describes the author's personal experience during the COVID-19 pandemic. Without more context or information about the book being referred to, it is impossible to determine its main theme.
