In [1]:
from IPython.display import display, HTML
# Cosmetic only: inject a CSS rule so the notebook container spans the full browser width
display(HTML('<style>.container { width:100% !important; }</style>'))

#### IMPORT PACKAGES

In [2]:
# Shell out to nvidia-smi to record the GPU, driver and CUDA versions this run used
!nvidia-smi

Sun May 26 08:46:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:C1:00.0 Off |                  Off |
| 30%   35C    P8              15W / 450W |      1MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AwqConfig, AutoConfig

from huggingface_hub import login
from huggingface_hub import HfApi

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma

from langchain_community.llms import VLLM
from vllm import LLM
from langchain.chains import RetrievalQA

from dotenv import load_dotenv
import logging
import warnings
import torch
import os

# Silence every warning for the remainder of the session
warnings.filterwarnings(action='ignore')

# Root logger: INFO and above are written to 'logger.txt'; each record is
# "<timestamp> - <message>" with the timestamp rendered as e.g. 26-May-24 08:46:52
logging.basicConfig(
    level=logging.INFO,
    filename='logger.txt',
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s - %(message)s',
)

#### DEFINE CONSTANTS

In [4]:
class Constants:
    '''
    Central place for the settings used throughout the notebook.

    All attributes are class-level values read as ``Constants.<name>``; the class
    is never instantiated in this notebook.
    '''
    pdf_dir = 'pdfs/'  # Directory containing the PDF files to be processed
    chunk_size = 2500  # Size of text chunks (in characters) for splitting the document
    chunk_overlap = 100  # Number of characters to overlap between adjacent text chunks
    return_k = 3  # Number of top results to return from vector similarity searches
    embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'  # Name of the pre-trained sentence embedding model
    model_name = 'mistralai/Mistral-7B-Instruct-v0.2'  # Name of the base model for quantization
    model_save_path = './models'  # Cache directory for the downloaded base model
    quant_model_name = 'Mistral-7B-Instruct-v0.2-AWQ'  # Name of the AWQ quantized model
    quant_config = {'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}  # Configuration for AWQ quantization
    quant_save_path = f'./quantized/{quant_model_name}'  # Path to save the AWQ quantized model
    username = 'sharmapratik88'  # HuggingFace username (owner of the uploaded model repo)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Device to use for computations (GPU if available, else CPU)

    # SECURITY: never hardcode an access token in a notebook. The token is read from the
    # HF_TOKEN environment variable (the same variable the login/upload code reads) and is
    # None when that variable is not set at class-definition time.
    # Any token that was ever committed in this file must be treated as leaked and revoked.
    HF_TOKEN = os.environ.get('HF_TOKEN')

#### LOAD DATA FROM PDF

In [5]:
def load_pdf_data():
    '''
    Loads every PDF in ``Constants.pdf_dir`` and splits it into overlapping text chunks.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are loaded, and they
    are processed in sorted order so the resulting chunk order is reproducible. Progress is
    written to the log at each stage.

    Returns:
        list: The document chunks of all loaded PDFs, concatenated in file order.
              Empty if the directory contains no PDF files.

    Raises:
        FileNotFoundError: If ``Constants.pdf_dir`` does not exist (raised by ``os.listdir``).
    '''
    logging.info('---------------------------')
    logging.info('Data Ingestion')

    # os.listdir returns every entry (sub-directories, '.ipynb_checkpoints', '.DS_Store', ...)
    # in arbitrary order; keep only actual PDF files and sort them for a stable result.
    pdf_paths = sorted(
        path
        for path in (os.path.join(Constants.pdf_dir, fn) for fn in os.listdir(Constants.pdf_dir))
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    )

    if not pdf_paths:
        logging.warning(f'No PDF files found in {Constants.pdf_dir}')

    # The splitter settings do not depend on the file, so build it once outside the loop
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=Constants.chunk_size,
                                                   chunk_overlap=Constants.chunk_overlap)

    all_documents = []

    for pdf_path in pdf_paths:
        logging.info('Loading raw document: ' + pdf_path.replace('../', ''))

        # Load the raw documents from the PDF file
        raw_documents = PyPDFLoader(pdf_path).load()

        logging.info('Splitting text .....')

        # Split the raw documents into chunks
        documents = text_splitter.split_documents(raw_documents)

        logging.info(f'Length of the PDF after chunking: {len(documents)}')

        # Add this file's chunks to the combined list
        all_documents.extend(documents)

    logging.info(f'Length of all documents after chunking: {len(all_documents)}')

    return all_documents

In [6]:
# Load and chunk every PDF, then display the first chunk as a quick sanity check
documents = load_pdf_data()
documents[0]

Document(page_content='Extending Llama-3’s Context Ten-Fold Overnight\nPeitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2,\nQiwei Ye1, Zhicheng Dou2\n1Beijing Academy of Artificial Intelligence\n2Gaoling School of Artificial Intelligence, Renmin University of China\nnamespace.pt@gmail.com zhengliu1026@gmail.com\nAbstract\nWe extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA\nfine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one\n8xA800 (80G) GPU machine. The resulted model exhibits superior performances\nacross a broad range of evaluation tasks, such as NIHS, topic retrieval, and long-\ncontext language understanding; meanwhile, it also well preserves the original\ncapability over short contexts. The dramatic context extension is mainly attributed\nto merely 3.5K synthetic training samples generated by GPT-4 , which indicates\nthe LLMs’ inherent (yet largely underestimated) potential to extend its origin

#### VECTOR DB

In [7]:
def embed_vector(docs):
    '''
    Index ``docs`` in a Chroma vector store and return it wrapped as a retriever.

    The embeddings come from the SentenceTransformer model named in
    ``Constants.embedding_model``; the retriever is configured to return the
    ``Constants.return_k`` best matches per query.

    Args:
        docs (list): The documents to embed and index.

    Returns:
        retriever: A retriever backed by the newly created Chroma store.
    '''
    # Embedding model used to vectorize every document
    embedder = SentenceTransformerEmbeddings(model_name=Constants.embedding_model)

    logging.info('Initiating the Chroma DB vectorization step')

    # Build the store from the documents and expose it as a top-k retriever in one chain
    return Chroma.from_documents(
        documents=docs,
        embedding=embedder,
    ).as_retriever(search_kwargs={'k': Constants.return_k})

In [8]:
# Embed the chunked documents into Chroma and keep the resulting retriever for the QA chain
retriever = embed_vector(documents)

#### AWQ QUANTIZATION

In [9]:
def quantize(docs):
    """
    Quantizes the base model named in ``Constants.model_name`` with AWQ and saves it locally.

    This function only writes the quantized model and tokenizer to
    ``Constants.quant_save_path``; it does not upload anything to the HuggingFace Hub.

    Args:
        docs (list): Document objects whose ``page_content`` text is used as calibration
            data during quantization. Must not be empty.

    Steps:
        1. Validate the calibration data.
        2. Load environment variables from a .env file.
        3. Log in to HuggingFace using the HF_TOKEN environment variable.
        4. Create the output/cache directories if they don't exist.
        5. Load the model and tokenizer from HuggingFace.
        6. Quantize the model using the calibration data.
        7. Save the quantized model and tokenizer locally.

    Returns:
        None

    Raises:
        ValueError: If ``docs`` is empty.
    """
    # Fail fast: without calibration text the run would only break after the
    # (multi-GB) base model has already been downloaded and loaded.
    if not docs:
        raise ValueError('quantize() needs at least one document for AWQ calibration.')

    calib_texts = [page.page_content for page in docs]

    # Load environment variables from a .env file
    load_dotenv()

    # HuggingFace login; os.environ.get yields None when HF_TOKEN is unset,
    # in which case login() is called with token=None
    login(token=os.environ.get('HF_TOKEN'))

    # Create paths if they don't exist (exist_ok avoids the check-then-create race)
    os.makedirs(Constants.quant_save_path, exist_ok=True)
    os.makedirs(Constants.model_save_path, exist_ok=True)

    # Log the initialization of the specified model from HuggingFace
    logging.info(f'Loading {Constants.model_name} model from HuggingFace.')

    # Load the model from HuggingFace with specific configurations
    model = AutoAWQForCausalLM.from_pretrained(
        Constants.model_name,
        low_cpu_mem_usage=True,  # Use low CPU memory while loading
        use_cache=False,  # Disable the cache
        cache_dir=Constants.model_save_path  # Specify the cache directory
    )

    # Load the tokenizer from HuggingFace with specific configurations
    tokenizer = AutoTokenizer.from_pretrained(
        Constants.model_name,
        trust_remote_code=True,  # Trust remote code for loading the tokenizer
        use_fast=True,  # Use the fast tokenizer implementation
        cache_dir=Constants.model_save_path  # Specify the cache directory
    )

    # Quantize the model using the specified configuration and calibration data
    model.quantize(tokenizer, quant_config=Constants.quant_config,
                   calib_data=calib_texts)

    # Save the quantized model and tokenizer to the specified path
    model.save_quantized(Constants.quant_save_path)
    tokenizer.save_pretrained(Constants.quant_save_path)
    logging.info('Quantized model and tokenizers saved to the path specified.')

In [10]:
# Run the AWQ quantization. quantize() returns None; its effect is the quantized model
# and tokenizer written to Constants.quant_save_path
_ = quantize(documents)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

AWQ: 100%|██████████| 32/32 [09:43<00:00, 18.23s/it]


In [11]:
# Initialize the HuggingFace API client with the token from the HF_TOKEN environment variable
api = HfApi(token=os.environ.get('HF_TOKEN'))

# Build the target repository id once so create and upload cannot drift apart
quant_repo_id = f'{Constants.username}/{Constants.quant_model_name}'

# Create the model repository; exist_ok=True keeps this cell re-runnable once the
# repository already exists instead of failing on the second run
api.create_repo(repo_id=quant_repo_id, repo_type='model', exist_ok=True)

# Upload the quantized model folder to the HuggingFace repository
# (last expression of the cell, so the returned commit info is displayed)
api.upload_folder(repo_id=quant_repo_id,
                  folder_path=Constants.quant_save_path)

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/sharmapratik88/Mistral-7B-Instruct-v0.2-AWQ/commit/3b4e2ee94a959710b2502f104a71922038121e06', commit_message='Upload folder using huggingface_hub', commit_description='', oid='3b4e2ee94a959710b2502f104a71922038121e06', pr_url=None, pr_revision=None, pr_num=None)

#### LOAD THE QUANTIZED MODEL

In [12]:
# Ask PyTorch to release its cached, unused GPU memory before the vLLM engine is loaded below
torch.cuda.empty_cache()

In [13]:
# Set the environment variable 'CUDA_LAUNCH_BLOCKING' to '1' to help with debugging CUDA errors
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Construct the model ID from Constants so it always matches the repository the
# quantized model was uploaded to (same username and model name as the upload step)
model_id = f'{Constants.username}/{Constants.quant_model_name}'

# Initialize the vLLM model with specific parameters
llm_q = VLLM(
    model=model_id,  # The model ID to load from Hugging Face
    trust_remote_code=True,  # Allow execution of remote code for custom model loading
    max_new_tokens=4096,  # Maximum number of new tokens to generate
    temperature=0.9,  # Sampling temperature for text generation
    vllm_kwargs={
        'quantization': 'awq',  # Use AWQ quantization for the model
        'gpu_memory_utilization': 0.6  # Set GPU memory utilization to 60%
    },
)

INFO 05-26 08:58:38 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='sharmapratik88/Mistral-7B-Instruct-v0.2-AWQ', speculative_config=None, tokenizer='sharmapratik88/Mistral-7B-Instruct-v0.2-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=sharmapratik88/Mistral-7B-Instruct-v0.2-AWQ)
INFO 05-26 08:58:38 utils.py:660] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-26 08:58:39 selector.py:27] Using FlashAttention-2 backend.
INFO 05-26 08:58:39 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 05-26 0

#### RETRIEVE RESPONSE FROM THE PDF(s)

In [14]:
# Create a RetrievalQA chain using the vLLM model and the retriever.
# chain_type='stuff' places the retrieved chunks into a single prompt.
qa_with_sources = RetrievalQA.from_chain_type(llm=llm_q, chain_type='stuff', retriever=retriever)

# Define the query to be asked to the LLM
query = 'What is Hybrid Search?'  # The question to be answered by the LLM
logging.info(f'Question to the LLM: {query}')  # Log the question being asked

# Run the chain via .invoke (calling the chain object directly is deprecated in LangChain);
# RetrievalQA reads the question from the 'query' key and puts the answer under 'result'
response = qa_with_sources.invoke({'query': query})['result']

# Log the answer received from the LLM
logging.info(f'Answer from the LLM: {response}')

# Display the answer as the cell output
response

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


'\nHybrid Search is a search methodology used for Retrieval-as-a-Service (RAG) systems that combines multiple search techniques to improve overall accuracy. In this study, the authors explored keyword-based similarity search, dense vector-based, and semantic-based sparse encoder-based search, and integrated these techniques to formulate hybrid queries. The goal is to elevate search capabilities and capture nuanced relationships between terms, thereby providing a more authentic representation of user intent and document relevance. The authors used the Sparse Encoder Model-based index with sparse encoder query + match query and combinations of multi match queries for their experiments.'