<a href="https://colab.research.google.com/github/samiralfayed/RAG-Python-Assignment/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/rag-api-local

/content/drive/MyDrive/rag-api-local


Install Dependencies

In [None]:
!pip install faiss-cpu pdfplumber python-docx pytesseract easyocr \
chromadb fastapi uvicorn nest-asyncio python-multipart transformers sentence-transformers > /dev/null
!sudo apt-get update -qq
!sudo apt-get install -y tesseract-ocr
!pip install pyngrok > /dev/null

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


 Import Libraries

In [None]:
import os
import pytesseract
import pdfplumber
import docx
import faiss
import easyocr
import numpy as np
import nest_asyncio
import uvicorn
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from pyngrok import ngrok
from typing import Optional

from transformers import pipeline
from sentence_transformers import SentenceTransformer

Initialize Embeddings and LLM

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)
chunks_db = []

qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

Device set to use cpu


In [None]:
PDF_PATH = "/content/drive/MyDrive/rag-api-local/THE_DECLARATION_OF_INDEPENDENCE_1776.pdf"

In [None]:
def extract_text_from_path(PDF_PATH):
    ext = PDF_PATH.split(".")[-1].lower()
    text = ""
    if ext == "pdf":
        with pdfplumber.open(PDF_PATH) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
    elif ext == "docx":
        doc = docx.Document(PDF_PATH)
        for para in doc.paragraphs:
            text += para.text + "\n"
    elif ext == "txt":
        with open(PDF_PATH, "r", encoding="utf-8") as f:
            text = f.read()
    elif ext in ["jpg", "jpeg", "png"]:
        text = pytesseract.image_to_string(PDF_PATH)
    else:
        text = "Unsupported file type."
    return text

def chunk_text(text, chunk_size=300, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def store_chunks(chunks, file_id):
    embeddings = model.encode(chunks)
    index.add(np.array(embeddings).astype("float32"))
    for i, chunk in enumerate(chunks):
        chunks_db.append({"file_id": file_id, "chunk": chunk})

def search_similar(query):
    q_embed = model.encode([query]).astype("float32")
    D, I = index.search(q_embed, k=3)
    results = [chunks_db[i]['chunk'] for i in I[0]]
    return "\n".join(results)

def ask_huggingface(context, question):
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    result = qa_pipeline(prompt, max_length=256, do_sample=False)
    return result[0]['generated_text']

In [None]:
text = extract_text_from_path(PDF_PATH)
chunks = chunk_text(text)
file_id = os.path.basename(PDF_PATH)
store_chunks(chunks, file_id)
print(f"Loaded and embedded: {file_id} with {len(chunks)} chunks")

  return forward_call(*args, **kwargs)


Loaded and embedded: THE_DECLARATION_OF_INDEPENDENCE_1776.pdf with 10 chunks


FastAPI App

In [84]:
!pip install fastapi pyngrok uvicorn python-multipart



In [85]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from fastapi.responses import Response

app = FastAPI()

@app.post("/upload")
async def upload_file(file: UploadFile = File("/content/drive/MyDrive/rag-api-local/THE_DECLARATION_OF_INDEPENDENCE_1776.pdf")):
    text = extract_text(file)
    chunks = chunk_text(text)
    file_id = file.filename
    store_chunks(chunks, file_id)
    return {"file_id": file_id, "chunks": len(chunks)}

@app.post("/query")
async def query(question: str = Form(...)):
    context = search_similar(question)
    answer = ask_huggingface(context, question)
    return JSONResponse(content={
        "question": question,
        "context": context,
        "answer": answer
    })

@app.get("/")
def root():
    return {"message": "RAG API running with HuggingFace LLM"}

@app.get("/favicon.ico")
async def favicon():
    return Response(content="", media_type="image/x-icon")

Run via ngrok

In [86]:
from google.colab import userdata

nest_asyncio.apply()

NGROK_AUTH_TOKEN = userdata.get('NGROK_AUTH_TOKEN')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

public_url = ngrok.connect(8000)
print(f"API available at: {public_url}")

uvicorn.run(app, port=8000)

API available at: NgrokTunnel: "https://f2ff320c0f65.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [1044]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     103.243.244.21:0 - "GET / HTTP/1.1" 200 OK
INFO:     103.243.244.21:0 - "GET /favicon.ico HTTP/1.1" 200 OK
INFO:     103.243.244.17:0 - "POST /upload HTTP/1.1" 422 Unprocessable Entity


  return forward_call(*args, **kwargs)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


INFO:     103.243.244.17:0 - "POST /query HTTP/1.1" 200 OK


Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


INFO:     103.243.244.17:0 - "POST /query HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1044]
