In [None]:
!pip install -qU bitsandbytes accelerate

In [None]:
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

In [None]:
# Model name (or path) handed to the tokenizer and model loaders in the cells below.
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
# Function for loading a 4-bit quantized model
def load_quantized_model(model_name:str):
  """
  Load a causal language model with 4-bit bitsandbytes quantization.

  model_name: Name or path of the model to be loaded.
  return: Loaded quantized model.
  """
  # The same dtype is used for the quantized compute path and for torch_dtype.
  compute_dtype = torch.bfloat16

  # 4-bit NF4 quantization with double quantization enabled.
  quant_settings = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=compute_dtype,
  )

  return AutoModelForCausalLM.from_pretrained(
      model_name,
      quantization_config=quant_settings,
      torch_dtype=compute_dtype,
  )

In [None]:
# Initializing tokenizer
def initialize_tokenizer(model_name:str, bos_token_id:"int | None"=1):
  """
  Load the tokenizer for a model and optionally override its BOS token id.

  model_name: Name or path of the model whose tokenizer is loaded.
  bos_token_id: Beginning-of-sentence token id to set on the tokenizer.
      Defaults to 1 (the previously hard-coded value). Pass None to keep
      whatever id the loaded tokenizer already has, e.g. for models whose
      BOS id is not 1.
  return: Tokenizer for the model.
  """
  # NOTE(review): return_token_type_ids is normally an encode-time argument;
  # confirm it has the intended effect when passed at load time.
  tokenizer = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
  if bos_token_id is not None:
    tokenizer.bos_token_id = bos_token_id # Set beginning of sentence token id
  return tokenizer

In [None]:
# Build the tokenizer for `model_name` with the initialize_tokenizer helper.
tokenizer = initialize_tokenizer(model_name)

In [None]:
# Load the model for `model_name` through load_quantized_model (4-bit BitsAndBytesConfig).
model = load_quantized_model(model_name)