# Import Required Libraries

In [1]:
!pip install pdfplumber sentence-transformers faiss-cpu fastapi chromadb uvicorn nest_asyncio

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting fastapi
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting chromadb
  Downloading chromadb-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting uvicorn
  Downloading uvicorn-0.31.1-py3-none-any.whl.metadata (6.6 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading 

In [2]:
import pdfplumber
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from fastapi import FastAPI, Request
import uvicorn
import json

  from tqdm.autonotebook import tqdm, trange


# Step 1: Data Collection & Ingestion (Extracting text from the PDF)

In [3]:
# Define text cleaning function
def clean_extracted_text(text):

    # Remove artifacts like (cid:145) and other non-text elements
    cleaned_text = re.sub(r'\(cid:\d+\)', '', text)

    # Preserve paragraphs by replacing multiple newlines (\n\n) with a special marker <PARAGRAPH>
    cleaned_text = re.sub(r'\n\s*\n', ' <PARAGRAPH> ', cleaned_text)

    # Remove remaining single newlines that break sentences
    cleaned_text = re.sub(r'\n+', ' ', cleaned_text)

    # Replace the <PARAGRAPH> marker with double newlines to restore paragraph breaks
    cleaned_text = re.sub(r'<PARAGRAPH>', '\n\n', cleaned_text)

    # Remove non-ASCII characters that might cause issues (leave only printable characters)
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', cleaned_text)

    # Remove extra spaces between characters like "C H A P T E R  O N E"
    cleaned_text = re.sub(r'\b([A-Z])\s+', r'\1', cleaned_text)

    # Normalize multiple spaces to a single space
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Strip leading/trailing whitespace
    cleaned_text = cleaned_text.strip()

    return cleaned_text

# Extract text from PDF with cleaning
def extract_text(pdf_path):

    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                # Clean each page's extracted text
                cleaned_page_text = clean_extracted_text(page_text)
                full_text += cleaned_page_text + " "  # Concatenate with space between pages
    return full_text

# PDF extraction
book_text = extract_text("/content/drive/MyDrive/Harry Potter and the Prisoner of Azkaban.pdf")
print(book_text[2000:3000])

.S.A. 12 First American edition, October 1999 Contents ONE Owl Post 1 TWO Aunt Marges Big Mistake 16 THREE The Knight Bus 31 FOUR The Leaky Cauldron 49 FIVE The Dementor 69 SIX Talons and Tea Leaves 96 SEVEN The Boggart in the Wardrobe 123 EIGHT Flight of the Fat Lady 141 vii Contents NINE Grim Defeat 162 TEN The Marauders Map 183 ELEVEN The Firebolt 211 TWELVE The Patronus 233 THIRTEEN Gryffindor Versus Ravenclaw 252 FOURTEEN Snapes Grudge 269 FIFTEEN The Quidditch Final 291 SIXTEEN Professor Trelawneys Prediction 314 SEVENTEEN Cat, Rat, and Dog 332 viii Contents EIGHTEEN Moony, Wormtail, Padfoot, and Prongs 349 NINETEEN The Servant of Lord Voldemort 358 TWENTY The Dementors Kiss 378 TWENTY-ONE Hermiones Secret 386 TWENTY-TWO Owl Post Again 416 ix Harry Potter and the Prisoner of Azkaban CHAPTERONEOWL POST Harry Potter was a highly unusual boy in many ways. For one thing, he hated the summer holidays more than any other time of year. For another, he really wanted to do his home- work 

# Step 2: Data Chunking & Preprocessing (Splitting into chapters and chunks)

In [4]:
def split_into_chapters(text):
    # Handle spaced-out letters like "C H A P T E R"
    text = re.sub(r'\b(C)\s+(H)\s+(A)\s+(P)\s+(T)\s+(E)\s+(R)', r'CHAPTER', text)

    # Regex to match chapter titles, considering formats like:
    # - "CHAPTER ONE", "CHAPTER 1", "CHAPTER  O N E"
    chapter_pattern = r'(CHAPTER\s+\w+)'  # Matches "CHAPTER" followed by words/numbers

    # Split the text based on the identified chapter markers
    chapters = re.split(chapter_pattern, text)

    structured_data = []

    # Loop through the list and combine chapter titles with their corresponding content
    for i in range(1, len(chapters), 2):
        chapter_title = chapters[i].strip()  # Chapter title (e.g., "CHAPTER ONE")
        chapter_text = chapters[i + 1].strip() if i + 1 < len(chapters) else ''  # Chapter text
        structured_data.append({
            "chapter": chapter_title,
            "text": chapter_text
        })

    return structured_data

# Apply chapter splitting and print some sample results for verification
structured_data = split_into_chapters(book_text)

# Split into chunks with overlap
def chunk_text_with_overlap(text, chunk_size=100, overlap=20):

    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks

# Apply chapter splitting and chunking
structured_data = split_into_chapters(book_text)

data_chunks = []
for chapter in structured_data:
    chunks = chunk_text_with_overlap(chapter['text'], chunk_size=100, overlap=20)
    for chunk in chunks:
        data_chunks.append({
            "chapter": chapter['chapter'],
            "chunk": chunk,
            "metadata": {
                "chapter_title": chapter['chapter'],
                "text_length": len(chunk.split())
            }
        })

# Step 5: Embedding Generation (Using a pre-trained model to generate embeddings)

In [5]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each chunk
embeddings = model.encode([chunk['chunk'] for chunk in data_chunks])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Step 4: Vector Database Integration (Storing embeddings in FAISS)

In [6]:
# Create FAISS index
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)

# Convert embeddings into a numpy array and add to the index
embedding_array = np.array(embeddings).astype('float32')
index.add(embedding_array)

# Step 5: Query Handling & Retrieval (Using the vector database to handle queries)

In [7]:
def retrieve_relevant_chunks(query, top_n=5):

    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_n)
    results = [data_chunks[idx] for idx in indices[0]]
    return results

In [8]:
# Example query
query = "What is the significance of the Marauder’s Map?"
relevant_chunks = retrieve_relevant_chunks(query)
for result in relevant_chunks:
    print(f"Chapter: {result['chapter']}, Chunk: {result['chunk'][:100]}...")

Chapter: CHAPTER TEN, Chunk: web from the point that Georges wand had touched. They joined each other, they crisscrossed, they fa...
Chapter: CHAPTER TWENTY, Chunk: Ibrought this from the Shrieking Shack last night, he said, handing Harry back the Invisibility Cloa...
Chapter: CHAPTER TEN, Chunk: Harry! squealed Hermione. What are you doing here? How how did you ? Wow! said Ron, looking very imp...
Chapter: CHAPTER TEN, Chunk: Quickly and silently, Harry dodged out from his hiding place and climbed the stairs; looking back, h...
Chapter: CHAPTER NINE, Chunk: it upside down, and tipped a dozen bits of splintered wood and twig onto the bed, the only remains o...


# Step 6: Contextual Response Generation (Using retrieved chunks to generate a response)

In [12]:
def generate_contextual_response(query, relevant_chunks):

    # Generates a response to a query using the most relevant text chunks.
    context = " ".join([chunk['chunk'] for chunk in relevant_chunks])
    response = f"According to the book, {context}."
    return response

# Generating a response for example query
response = generate_contextual_response(query, relevant_chunks)
print(response)

According to the book, web from the point that Georges wand had touched. They joined each other, they crisscrossed, they fanned into every corner of the parch- ment; then words began to blossom across the top, great, curly green words, that proclaimed: Messrs. Moony, Wormtail, Padfoot, and Prongs Purveyors of Aids to Magical Mischief-Makers are proud to present THE MARAUDERS MAP 192 THE MARAUDERS MAP It was a map showing every detail of the Hogwarts castle and grounds. But the truly remarkable thing were the tiny ink dots moving around it, each labeled with a name in minuscule writing. Astounded, Harry bent over it. Ibrought this from the Shrieking Shack last night, he said, handing Harry back the Invisibility Cloak. And . . . He hes- itated, then held out the Marauders Map too. Iam no longer your teacher, so Idont feel guilty about giving you back this as well. Its no use to me, and Idaresay you, Ron, and Hermione will find uses for it. Harry took the map and grinned. You told me Moon

# Step 7: Serving via FastAPI (Building the RAG system with an API endpoint)

In [13]:
# Necessary imports
import uvicorn
from fastapi import FastAPI, Request
import nest_asyncio

app = FastAPI()

@app.post("/query")
async def query_rag(request: Request):
    # FastAPI endpoint to handle user queries and return responses with relevant chunks.

    data = await request.json()
    user_query = data['question']

    return {"response": f"You asked: {user_query}"}

nest_asyncio.apply()

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


INFO:     Started server process [701]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [701]


In [15]:
import requests
import json

url = "http://127.0.0.1:8000/query"
headers = {"Content-Type": "application/json"}
data = {
    "question": "What is the Marauder’s Map?"
}

# Convert the data to a JSON string and make the POST request
response = requests.post(url, headers=headers, data=json.dumps(data))

print(response.text)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /query (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7902f9451450>: Failed to establish a new connection: [Errno 111] Connection refused'))