In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AwqConfig, AutoConfig

from huggingface_hub import login
from huggingface_hub import HfApi

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma

from langchain_community.llms import VLLM
from vllm import LLM
from langchain.chains import RetrievalQA

from dotenv import load_dotenv
import logging
import warnings
import torch
import os

# Silence all Python warnings emitted from here on
warnings.filterwarnings('ignore')

# Route log records of level INFO and above to logger.txt; every record is
# written as "<day-month-year time> - <message>"
logging.basicConfig(
    level=logging.INFO,
    filename='logger.txt',
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s - %(message)s',
)

INFO 10-23 12:41:43 importing.py:10] Triton not installed; certain GPU-related functions will not be available.


In [2]:
class Constants:
    '''
    Class containing constants used throughout the project.

    All values are class attributes and are read directly as ``Constants.<name>``;
    the class is never instantiated in this notebook.

    Security note: the HuggingFace token is read from the ``HF_TOKEN`` environment
    variable instead of being hardcoded. Any token that was previously committed in
    this file must be treated as compromised and revoked on huggingface.co.
    '''
    pdf_dir = 'pdfs/'  # Path to the directory containing PDF files to be processed
    chunk_size = 2500  # Size of text chunks (in characters) for splitting the document
    chunk_overlap = 100  # Number of characters to overlap between adjacent text chunks
    return_k = 3  # Number of top results to return from vector similarity searches
    embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'  # Name of the pre-trained sentence embedding model
    model_name = 'mistralai/Mistral-7B-Instruct-v0.2'  # Name of the base model for quantization
    model_save_path = './models'  # Path to save the base model
    # Name of the quantized model.
    # NOTE(review): the suffix says "GGUF" but quant_config below is an AWQ configuration -- confirm the intended name.
    quant_model_name = 'Mistral-7B-Instruct-v0.2-GGUF'
    quant_config = {'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}  # Configuration for AWQ quantization
    quant_save_path = f'./quantized/{quant_model_name}'  # Path to save the quantized model
    username = 'sharmapratik88'  # HuggingFace username
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Device to use for computations (GPU if available, else CPU)
    # HuggingFace token, taken from the environment so that no secret lives in the notebook.
    # Evaluates to None when HF_TOKEN is not set at the time this cell runs.
    HF_TOKEN = os.environ.get('HF_TOKEN')

In [3]:
def load_pdf_data():
    '''
    Loads and processes PDF documents from ``Constants.pdf_dir``. Each PDF is split into smaller chunks
    using a text splitter. The function logs various stages of this process.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are loaded, so stray
    directory entries (hidden files, sub-directories, other formats) are skipped instead of
    being handed to the PDF loader. Files are processed in sorted path order so the order of
    the returned chunks is reproducible across runs.

    Returns:
        list: A list of document chunks obtained after splitting all the loaded PDF documents.
              Empty when the directory contains no PDF files.
    '''
    logging.info('---------------------------')
    logging.info('Data Ingestion')

    # Collect the PDF paths up front: os.listdir returns every entry in arbitrary order,
    # so filter to real .pdf files and sort for a deterministic result
    pdf_paths = sorted(
        path
        for path in (os.path.join(Constants.pdf_dir, fn) for fn in os.listdir(Constants.pdf_dir))
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    )

    if not pdf_paths:
        logging.warning(f'No PDF files found in {Constants.pdf_dir}')

    # The splitter does not depend on the file being processed, so build it once
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=Constants.chunk_size,
                                                   chunk_overlap=Constants.chunk_overlap)

    all_documents = []

    # Load and chunk each PDF in turn
    for pdf_path in pdf_paths:
        logging.info('Loading raw document: ' + pdf_path.replace('../', ''))

        # Load the raw documents from the PDF file
        raw_documents = PyPDFLoader(pdf_path).load()

        logging.info('Splitting text .....')

        # Split the raw documents into chunks
        documents = text_splitter.split_documents(raw_documents)

        logging.info(f'Length of the PDF after chunking: {len(documents)}')

        # Add the chunks to the list of all documents
        all_documents.extend(documents)

    logging.info(f'Length of all documents after chunking: {len(all_documents)}')

    return all_documents

In [4]:
# Load and chunk every PDF under Constants.pdf_dir, then display the first
# chunk as a quick sanity check of the ingestion step
documents = load_pdf_data()
documents[0]

Document(metadata={'source': 'pdfs/Llama3 paper.pdf', 'page': 0}, page_content='Extending Llama-3’s Context Ten-Fold Overnight\nPeitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2,\nQiwei Ye1, Zhicheng Dou2\n1Beijing Academy of Artificial Intelligence\n2Gaoling School of Artificial Intelligence, Renmin University of China\nnamespace.pt@gmail.com zhengliu1026@gmail.com\nAbstract\nWe extend the context length of Llama-3-8B-Instruct from 8K to 80K via QLoRA\nfine-tuning2. The entire training cycle is super efficient, which takes 8 hours on one\n8xA800 (80G) GPU machine. The resulted model exhibits superior performances\nacross a broad range of evaluation tasks, such as NIHS, topic retrieval, and long-\ncontext language understanding; meanwhile, it also well preserves the original\ncapability over short contexts. The dramatic context extension is mainly attributed\nto merely 3.5K synthetic training samples generated by GPT-4 , which indicates\nthe LLMs’ inherent (y

In [5]:
def embed_vector(docs):
    '''
    Index ``docs`` in a Chroma vector store and return a retriever over that store.

    The documents are embedded with the SentenceTransformer model named by
    ``Constants.embedding_model``; the returned retriever is configured to yield
    ``Constants.return_k`` results per similarity search.

    Args:
        docs (list): The documents to embed and index.

    Returns:
        retriever: A Chroma retriever object that searches the embedded documents.
    '''
    # Embedding function backed by the configured sentence-transformer model
    embedder = SentenceTransformerEmbeddings(model_name=Constants.embedding_model)

    logging.info('Initiating the Chroma DB vectorization step')

    # Embed the documents and load them into a new Chroma collection
    store = Chroma.from_documents(documents=docs, embedding=embedder)

    # Expose the collection as a retriever limited to the top-k matches
    search_options = {'k': Constants.return_k}
    return store.as_retriever(search_kwargs=search_options)

In [6]:
# Embed the chunked documents into Chroma and keep the resulting retriever
retriever = embed_vector(documents)

In [7]:
def quantize(docs):
    """
    Quantizes the HuggingFace model named by ``Constants.model_name`` and saves the
    quantized model and its tokenizer under ``Constants.quant_save_path``.

    Nothing is uploaded to the HuggingFace Hub by this function; it only writes locally.

    Args:
        docs (list): A list of document objects whose ``page_content`` text is used as
            calibration data during quantization.

    Steps:
        1. Load environment variables from a .env file.
        2. Log in to HuggingFace using the ``HF_TOKEN`` environment variable.
        3. Create necessary directories if they don't exist.
        4. Load the model and tokenizer from HuggingFace.
        5. Quantize the model using the provided calibration data.
        6. Save the quantized model and tokenizer locally.

    Returns:
        None
    """
    # Load environment variables from a .env file
    load_dotenv()

    # HuggingFace login using the token from environment variables. When the token is
    # missing, login() is still called with None exactly as before, but the situation is
    # recorded in the log so that a later "gated repo" / 401 failure is easy to diagnose.
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        logging.warning('HF_TOKEN is not set; HuggingFace login will fall back to its '
                        f'interactive flow and access to {Constants.model_name} may be denied.')
    login(token=hf_token)

    # Create the output and cache directories; exist_ok avoids the check-then-create race
    os.makedirs(Constants.quant_save_path, exist_ok=True)
    os.makedirs(Constants.model_save_path, exist_ok=True)

    # Log the initialization of the specified model from HuggingFace
    logging.info(f'Loading {Constants.model_name} model from HuggingFace.')

    # Load the model from HuggingFace with specific configurations
    model = AutoModelForCausalLM.from_pretrained(
        Constants.model_name,
        low_cpu_mem_usage=True,  # Use low CPU memory while loading
        use_cache=False,  # Disable cache
        cache_dir=Constants.model_save_path  # Specify the cache directory
    )

    # Load the tokenizer from HuggingFace with specific configurations
    tokenizer = AutoTokenizer.from_pretrained(
        Constants.model_name,
        trust_remote_code=True,  # Trust remote code for loading the tokenizer
        use_fast=True,  # Use the fast tokenizer implementation
        cache_dir=Constants.model_save_path  # Specify the cache directory
    )

    # Quantize the model using the specified configuration and calibration data.
    # NOTE(review): .quantize() / .save_quantized() look like AutoAWQ model methods (the
    # `awq` import at the top of the notebook is commented out) -- confirm that the class
    # bound to AutoModelForCausalLM when this runs actually provides them.
    calibration_texts = [page.page_content for page in docs]
    model.quantize(tokenizer, quant_config=Constants.quant_config,
                   calib_data=calibration_texts)

    # Save the quantized model and tokenizer to the specified path
    model.save_quantized(Constants.quant_save_path)
    tokenizer.save_pretrained(Constants.quant_save_path)
    logging.info('Quantized model and tokenizers saved to the path specified.')

In [8]:
# Run the quantization step, passing the loaded document chunks as calibration text.
# quantize() returns None; its effect is the files it saves under Constants.quant_save_path.
# NOTE(review): AutoModelForCausalLM is imported here because the top import cell does not
# bring it into scope; consider moving this import up with the other transformers imports.
from transformers import AutoModelForCausalLM, AutoTokenizer
_ = quantize(documents)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2.
401 Client Error. (Request ID: Root=1-6718a1d5-529312e35f24e5722a0fc44e;b247b507-c0bf-4d35-adec-9f97d4458ad6)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [32]:
# pip install ipywidgets