<a href="https://colab.research.google.com/github/sngo/llms-practice/blob/main/HF_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook explores the lower-level API of Hugging Face Transformers: the tokenizer and model classes that wrap the PyTorch code implementing the transformers themselves.

In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate

In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

In [3]:
# Read the Hugging Face access token from the Colab secret named HF_TOKEN and
# authenticate with the Hub, also storing the token as a git credential.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [4]:
# Hugging Face Hub ids of the instruct/chat models used in this notebook

LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you: try this one yourself
DEEPSEEK = "deepseek-ai/deepseek-llm-7b-chat"
# TODO: add one more model (not Mistral - it is too big)

In [5]:
# Chat-format conversation: a system instruction followed by the user's request.
system_message = dict(role="system", content="You are a helpful assistant")
user_message = dict(role="user", content="Tell a light-hearted joke for a room of Data Scientists")
messages = [system_message, user_message]

In [6]:
# Quantization config - passed to from_pretrained so the model is loaded in 4-bit
# and needs less memory.

_bnb_4bit_options = {
    "load_in_4bit": True,                      # load the weights in 4-bit
    "bnb_4bit_quant_type": "nf4",              # 4-bit quantization type
    "bnb_4bit_use_double_quant": True,         # enable double quantization
    "bnb_4bit_compute_dtype": torch.bfloat16,  # dtype used for computation
}
quant_config = BitsAndBytesConfig(**_bnb_4bit_options)

In [None]:
# Tokenizer for LLAMA; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Render the conversation with the model's chat template as PyTorch tensors,
# then move the result to the GPU. (No generation prompt is added at this stage.)
chat_token_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
inputs = chat_token_ids.to("cuda")

In [None]:
# Inspect what the chat template produced: the tokenized prompt that was moved to the GPU
print(inputs)

In [None]:
# Load the LLAMA causal-LM with the 4-bit quantization config defined above;
# device_map="auto" leaves device placement to the library.

model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# Report the loaded model's memory footprint; dividing by 1e6 converts to the MB shown
# in the message (assumes get_memory_footprint() returns bytes - see the transformers docs).
memory = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory:,.1f} MB")

**Under The Hood**

In [None]:
# Bare expression: the notebook displays the model's repr so its layer structure can be read
model

In [None]:
# Feed the prompt tokens to the model and let it generate up to 80 new tokens
outputs = model.generate(inputs, max_new_tokens=80)
# Decode the first sequence of the output back to text
print(tokenizer.decode(outputs[0]))
print('--------')
# batch_decode is applied to the whole `outputs` batch instead of a single sequence
print(tokenizer.batch_decode(outputs))

In [35]:
#Clean up memory
def clean_up(model, inputs, outputs, tokenizer):
  """Drop this function's references to the given objects and reclaim memory.

  Args:
    model: model object the caller is finished with.
    inputs: input tensors the caller is finished with.
    outputs: generated output tensors the caller is finished with.
    tokenizer: tokenizer the caller is finished with.

  Returns:
    None.

  Note:
    The ``del`` below removes only the parameter bindings local to this call.
    Any name the caller still holds (for example a notebook-level ``model``)
    keeps its object alive, so the caller must also delete or rebind its own
    names for the memory to actually be released.
  """
  del model, inputs, outputs, tokenizer
  # Run the garbage collector first so objects that were only kept alive by
  # reference cycles are destroyed; only then can empty_cache() hand the memory
  # they occupied back from PyTorch's CUDA caching allocator.
  gc.collect()
  torch.cuda.empty_cache()

# clean_up() can only drop its own parameter bindings, so the notebook-level names
# are rebound to None first; otherwise they would keep the loaded model alive and
# nothing would be released. Rebinding (instead of `del`) keeps the names defined
# for later cells that pass them to clean_up again.
# NOTE(review): IPython's output cache (Out[...] / _) may still reference `model`
# from the cell that displayed it - confirm that GPU memory actually drops.
model = inputs = outputs = tokenizer = None
clean_up(model, inputs, outputs, tokenizer)


In [36]:
# Wrapping everything in a function - and adding Streaming and generation prompts

def generate(model, messages):
  """Load a chat model in 4-bit, stream its reply to ``messages``, then free it.

  Args:
    model: Hugging Face Hub model id (e.g. LLAMA or PHI3).
    messages: chat-format list of {"role": ..., "content": ...} dicts.

  Returns:
    None. The generated text is printed as it is produced by the TextStreamer.
  """
  # Pre-bind every local so the finally block can release them even when
  # loading or generation fails part-way through.
  tokenizer = streamer = llm = encoded = outputs = None
  try:
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    # return_dict=True yields both input_ids and attention_mask, so the mask can
    # be passed to generate() explicitly instead of being guessed.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")
    streamer = TextStreamer(tokenizer)
    #when gpu is not enough, use this: , llm_int8_enable_fp32_cpu_offload=True to offload to cpu
    # The loaded model gets its own name; `model` stays the Hub id string.
    # NOTE(review): use_cache=False is kept from the original; it may slow
    # generation - confirm whether it is still required.
    llm = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config, trust_remote_code=True, use_cache=False)
    #temperature: control creativity, top_p:Nucleus sampling for diversity, , temperature=0.7, top_p=0.9,do_sample=True
    outputs = llm.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=80,
        streamer=streamer,
    )
  finally:
    # Release this frame's references BEFORE collecting: while they are still
    # bound, neither the garbage collector nor empty_cache() can free the model.
    tokenizer = streamer = llm = encoded = outputs = None
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Run the shared prompt through LLAMA using the generate() helper
generate(LLAMA, messages)

In [None]:
# Run the same prompt through PHI3
generate(PHI3, messages)

In [None]:
# Gemma from Google
# Gemma does not take a "system" role, so only the user message is sent
# You need to accept the terms before use: https://huggingface.co/google/gemma-2-2b-it
gemma_messages = [
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
  ]
generate(GEMMA2, gemma_messages)

In [38]:
# Manual clean-up pass over the notebook-level names (model, inputs, outputs, tokenizer);
# generate() manages its own local objects.
clean_up(model, inputs, outputs, tokenizer)

In [None]:
# Run the shared prompt through DEEPSEEK
generate(DEEPSEEK, messages)