Install Libraries

In [5]:
!pip install gradio langchain langchain_groq langchain_community faiss-cpu Pillow pytesseract pdf2image PyPDF2 sentence_transformers groq tiktoken
!apt-get update
!apt-get install -y poppler-utils tesseract-ocr

Collecting langchain_groq
  Downloading langchain_groq-0.3.7-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting groq
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_c

Mount Google Drive

In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')

drive_base = "/content/drive/MyDrive/BioChatbot"
pdf_folder = os.path.join(drive_base, "pdfs")
img_folder = os.path.join(drive_base, "pdf_images")
os.makedirs(pdf_folder, exist_ok=True)
os.makedirs(img_folder, exist_ok=True)

print(f" Data will be saved in Google Drive under: {drive_base}")

Mounted at /content/drive
 Data will be saved in Google Drive under: /content/drive/MyDrive/BioChatbot


Upload PDFs to Drive

In [2]:
from google.colab import files

uploaded = files.upload()

for filename, content in uploaded.items():
    # Destination in Drive
    destination_path = os.path.join(pdf_folder, filename)
    with open(destination_path, "wb") as f:
        f.write(content)
    print(f" Saved to Drive: {destination_path}")

print(" All PDFs saved permanently in Google Drive.")

 All PDFs saved permanently in Google Drive.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


OCR + Save Images

In [6]:
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader
from multiprocessing import Pool, cpu_count

def process_page(args):
    pdf_path, filename, page_num = args
    try:
        # convert PDF page to image
        image = convert_from_path(pdf_path, dpi=150, first_page=page_num, last_page=page_num)[0]

        # save page image in drive
        image_path = os.path.join(img_folder, f"{filename}_page_{page_num}.png")
        image.save(image_path, "PNG")

        # OCR extract text
        text = pytesseract.image_to_string(image)
        return f"\n\n--- Page {page_num} of {filename} ---\n{text}"
    except Exception as e:
        return f"\n\n--- Page {page_num} of {filename} ---\nERROR: {e}"

def ocr_pdfs_in_drive(pdf_folder, output_file):
    tasks = []
    for filename in sorted(os.listdir(pdf_folder)):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            try:
                reader = PdfReader(pdf_path)
                total_pages = len(reader.pages)
                print(f" {filename} → {total_pages} pages")
                for page_num in range(1, total_pages + 1):
                    tasks.append((pdf_path, filename, page_num))
            except Exception as e:
                print(f" Could not read {filename}: {e}")

    with Pool(cpu_count()) as pool:
        for result in pool.imap_unordered(process_page, tasks):
            with open(output_file, "a", encoding="utf-8") as f:
                f.write(result)

    print(f" OCR completed. Output saved to {output_file}")

ocr_text_file = os.path.join(drive_base, "ocr_output.txt")
ocr_pdfs_in_drive(pdf_folder, ocr_text_file)

 Unit 01-Introduction to Biology-English.pdf → 5 pages
 Unit 02-Chemical and cellular basis of life-English.pdf → 66 pages
 Unit 03-Evolution and diversity of organisms-English.pdf → 35 pages
 Unit 04-Plant form and function-English.pdf → 49 pages
 Unit 05 Part 1-Animal form and function-English.pdf → 80 pages
 Unit 05 Part 2-Animal form and function-English.pdf → 88 pages
 Unit 06-Genetics-English.pdf → 43 pages
 Unit 07-Molecular Biology and Recombinant DNA Technology-English.pdf → 61 pages
 Unit 08-Environmental Biology-English.pdf → 43 pages
 Unit 09-Microbiology-English.pdf → 37 pages
 Unit 10-Applied Biology-English.pdf → 30 pages
 OCR completed. Output saved to /content/drive/MyDrive/BioChatbot/ocr_output.txt


Load OCR Text

In [7]:
with open(ocr_text_file, "r", encoding="utf-8") as f:
    ocr_text = f.read()

full_text_path = os.path.join(drive_base, "biology_full_content.txt")
with open(full_text_path, "w", encoding="utf-8") as f:
    f.write(ocr_text)

print(" Text saved in Google Drive.")

 Text saved in Google Drive.


Create FAISS Vector DB

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = splitter.create_documents([ocr_text])

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embedding_model)

# save FAISS index to drive
vectorstore.save_local(os.path.join(drive_base, "faiss_index"))

print(" FAISS index saved in Google Drive.")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 FAISS index saved in Google Drive.


Load Groq LLM

In [9]:
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

os.environ["GROQ_API_KEY"] = "API_Key"
llm = ChatGroq(model_name="llama3-70b-8192", temperature=0.2)

qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

print(" Biology Chatbot Ready!")


 Biology Chatbot Ready!


Gradio Chat Interface

In [11]:
import gradio as gr

def chat_with_bot(user_input, history):
    try:
        answer = qa.run(user_input)
        return answer
    except Exception as e:
        return f" Error: {str(e)}"

demo = gr.ChatInterface(
    fn=chat_with_bot,
    title="A/L Biology Chatbot",
    description="Ask any A/L Biology Questions.",
    theme="default",
)

demo.launch(share=True)

  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://33f9bbefa8d176a3bd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


