# Set up Git and download the latest code
Clone the repository, configure Git (user identity and fast-forward-only pulls), and check out the `am` branch.

In [None]:
# Remove code folder
# Step out to /content/ first, then delete any previous checkout at
# /content/pdf-qa-bot so the repository can be cloned afresh.
%cd /content/
!rm -rf /content/pdf-qa-bot

In [None]:
# Setup Repo
# Clone a fresh copy of the repository into /content/pdf-qa-bot.
%cd /content/
!git clone https://github.com/safqore/pdf-qa-bot.git

# Enter the checkout and configure Git: the user identity is set with
# --global, while the fast-forward-only pull policy has no --global flag and
# therefore applies to this repository only.
%cd /content/pdf-qa-bot
!git config --global user.email "abbad.minhas@gmail.com"
!git config --global user.name "Abbad Minhas"
!git config pull.ff only

# Switch to the `am` branch, show its state, then fetch and pull.
!git checkout am
!git status
!git fetch
!git pull

# Install Required Libraries
Use pip to install the necessary libraries: langchain_community, langchain_text_splitters, langchain_huggingface, qdrant_client, python-dotenv, and pypdf (required by PyPDFLoader).

In [None]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whatever `pip` the subshell finds.
# NOTE(review): versions are unpinned, so a re-run may pull different releases;
# pin them (pkg==x.y.z) once a known-good set has been confirmed.
%pip install langchain_community \
    langchain_text_splitters \
    langchain_huggingface \
    qdrant_client \
    python-dotenv \
    pypdf

# Import Required Libraries
Import the necessary libraries, including logging, os, List from typing, load_dotenv from dotenv, PyPDFLoader, RecursiveCharacterTextSplitter, HuggingFaceEmbeddings, QdrantClient, VectorParams, and Distance.

In [None]:
# Import Required Libraries
import logging
import os
from typing import List

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct

In [None]:
# Clear the existing environment variables
# pop() with a default removes QDRANT_HOST when it is set and is a no-op
# otherwise, so this cell can be re-run safely.
os.environ.pop("QDRANT_HOST", None)

In [None]:
# basicConfig() does nothing when the root logger already has handlers, which
# is common inside notebook kernels; force=True replaces them so the INFO
# messages emitted below are actually shown.
logging.basicConfig(level=logging.INFO, force=True)

# Load environment variables from .env file
# override=True lets the values in .env replace variables that are already set
# in the process, so a stale value from an earlier run cannot win.
# load_dotenv() returns False when it set nothing (e.g. the file is missing),
# which would otherwise go unnoticed until the database connection fails.
if not load_dotenv('/content/pdf-qa-bot/.env', override=True):
    logging.warning("No environment variables were loaded from /content/pdf-qa-bot/.env")

# Define Helper Functions
Define the helper functions load_pdf, chunk_pdf, setup_vectors_collection, and update_vectors_db.

In [None]:
# Define Helper Functions

def load_pdf(str_pdf_path: str) -> List:
    """Load a PDF file through ``PyPDFLoader``.

    Args:
        str_pdf_path: Path of the PDF file to read.

    Returns:
        The list produced by ``PyPDFLoader(...).load()``; the main cell reads
        ``page_content`` from each item.
    """
    obj_loader = PyPDFLoader(str_pdf_path)
    lst_loaded = obj_loader.load()
    return lst_loaded

def chunk_pdf(str_pdf_content: str, chunk_size: int = 384, chunk_overlap: int = 76) -> List[str]:
    """Split text into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        str_pdf_content: The full text to split.
        chunk_size: Maximum chunk size handed to the splitter (default 384,
            the value previously hard-coded here).
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter (default 76, the value previously hard-coded here).

    Returns:
        The list of text chunks; an empty list when there is no text.
    """
    # Nothing to split: skip building a splitter for empty/None input.
    if not str_pdf_content:
        return []

    obj_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return obj_text_splitter.split_text(str_pdf_content)

def setup_vectors_collection(db_client: QdrantClient, collection_name: str, dimension: int) -> None:
    """Create the collection on ``db_client`` unless it already exists.

    Args:
        db_client: Client used to query for and create the collection.
        collection_name: Name of the collection to ensure.
        dimension: Vector size the new collection is configured with.

    A newly created collection uses cosine distance. An existing collection
    is left untouched.
    """
    # Guard clause: an existing collection needs no further work.
    if db_client.collection_exists(collection_name):
        return

    obj_vector_params = VectorParams(size=dimension, distance=Distance.COSINE)
    db_client.create_collection(
        collection_name=collection_name,
        vectors_config=obj_vector_params,
    )

def update_vectors_db(
    lst_chunks: List[str],
    collection_name: str = "early_career_grants_programme",
    model_name: str = "sentence-transformers/all-mpnet-base-v2",
) -> bool:
    """Embed text chunks and upsert them into a Qdrant collection.

    The Qdrant connection is configured from the ``QDRANT_HOST`` and
    ``QDRANT_API_KEY`` environment variables. The collection is created on
    first use, sized to the length of the embedding vectors.

    Args:
        lst_chunks: Text chunks to embed and store. Each chunk is stored as
            the ``text`` payload of the point whose id is the chunk's
            position in this list.
        collection_name: Target collection (defaults to the name previously
            hard-coded here).
        model_name: Embedding model passed to ``HuggingFaceEmbeddings``
            (defaults to the model previously hard-coded here).

    Returns:
        True when the upsert call completed; False when ``lst_chunks`` is
        empty or any step raised (the error is logged with its traceback).
    """
    # Nothing to embed or store: report it instead of loading the model and
    # sending an empty upsert request.
    if not lst_chunks:
        logging.warning("No chunks supplied; nothing to upsert.")
        return False

    try:
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        # Embed all chunks in one batched call rather than one call per chunk.
        vectors = embeddings.embed_documents(lst_chunks)
        # Take the dimension from the vectors so it always matches the model.
        dimension = len(vectors[0])

        qdrant_client = QdrantClient(
            host=os.getenv("QDRANT_HOST"),
            api_key=os.getenv("QDRANT_API_KEY"),
        )
        setup_vectors_collection(qdrant_client, collection_name, dimension)

        # NOTE(review): ids are list positions, so points left over from an
        # earlier, longer run are presumably not removed — confirm.
        qdrant_client.upsert(
            collection_name=collection_name,
            points=[
                PointStruct(id=i, vector=vector, payload={"text": chunk})
                for i, (chunk, vector) in enumerate(zip(lst_chunks, vectors))
            ],
        )
        return True
    except Exception as e:
        # Best-effort by design: callers get False, the log keeps the traceback.
        logging.exception("An error occurred: %s", e)
        return False

# Main Execution
Run the pipeline end to end: load the PDF, chunk its content, and update the vectors database.

In [None]:
# Main Execution
# Pipeline: read the PDF pages, merge their text into one string, split that
# string into chunks, then embed and store the chunks.
logging.info("Loading RSTMH 2024 Early Career Grants Programme Terms and Conditions PDF ...")
lst_documents = load_pdf(
    "/content/pdf-qa-bot/data/RSTMH 2024 Early Career Grants Programme Terms and Conditions PDF.pdf"
)

# Text of every loaded page, in order, joined with single spaces.
lst_pdf_content = list(map(lambda obj_document: obj_document.page_content, lst_documents))
str_pdf_content = " ".join(lst_pdf_content)

logging.info("Chunking PDF content for vector update / creation ...")
lst_chunks = chunk_pdf(str_pdf_content)

logging.info("Updating vectors database ...")
status = update_vectors_db(lst_chunks)

# Report the outcome; update_vectors_db signals failure by returning False.
if not status:
    logging.error("Failed to update vectors database.")
else:
    logging.info("Vectors database updated successfully.")