# NLP CA4: `Hands-On`

In [None]:
# %%capture
!pip install -q accelerate peft bitsandbytes transformers trl sentencepiece triton

## Setting up

Create a Hugging Face access token at:
https://huggingface.co/settings/tokens

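You need to request access to the models used in this notebook:
- ```google/gemma-2-2b-it```
- ```google/gemma-2-2b``` (the base model, needed for the base vs. instruction-tuned comparison)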

In [None]:
# Placeholder only: do not paste a real token into a notebook that will be shared.
# HF_TOKEN = 'XXX'
# Shell escape that runs the Hugging Face CLI login command.
!hf auth login

In [None]:
import torch

# Probe accelerators in priority order (Apple MPS first, then CUDA) and
# fall back to the CPU when neither backend reports itself as available.
DEVICE = next(
    (
        name
        for name, is_available in (
            ("mps", torch.backends.mps.is_available),
            ("cuda", torch.cuda.is_available),
        )
        if is_available()
    ),
    "cpu",
)

print(f"Using device: {DEVICE}")

## Loading the LLM

In [None]:
# Hub ids of the two checkpoints used in this notebook; the instruction-tuned
# id is the base id with an "-it" suffix.
BASE_MODEL = "google/gemma-2-2b"
INSTRUCT_MODEL = f"{BASE_MODEL}-it"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the instruction-tuned checkpoint together with its tokenizer.
model_id = INSTRUCT_MODEL

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=DEVICE,  # place the weights on the device selected earlier
    dtype="float16",  # load the weights in half precision
)

# Reuse EOS as the padding token only when the tokenizer does not define one.
# Overwriting an existing pad token makes pad == eos, and `generate` then
# cannot infer an attention mask for calls that pass `input_ids` alone.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

Tokenization inspection:

In [None]:
text = "Hello world! من یک دانشجو هستم."

# Encode once and keep the single sequence's ID tensor for all three reports.
encoded = tokenizer(text, return_tensors="pt")
token_ids = encoded["input_ids"][0]

print("Token IDs:", token_ids)
print("Tokens:", tokenizer.convert_ids_to_tokens(token_ids))
print("Number of tokens:", len(token_ids))

See the structure of the LLM:

In [None]:
# Print the textual representation of the loaded model to inspect its module structure.
print(model)

## Inference model

The raw output of `model.generate` is a tensor of token IDs, which is not human-readable:

In [None]:
prompt = "Write a sentence with five words."

# Tokenize into PyTorch tensors and move them to the selected device.
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

# Generate with the default generation settings, passing the IDs and the
# attention mask explicitly.
generation_output = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
)

# Bare last expression: the notebook displays the raw generation result.
generation_output

In [None]:
# Turn the first generated sequence of token IDs back into text and show it.
decoded_text = tokenizer.decode(generation_output[0])
print(decoded_text)

### Base Model vs. Instruction-tuned Model

Compare the Base and Instruct models using the prompt ```What is 2+2?```. Keep in mind that when temperature != 0, you will get a different answer on each run.

In [None]:
# WRITE YOUR CODE HERE

### Speeding up generation by caching keys and values

In [None]:
prompt = "Write a very long email apologizing to Sarah for the tragic gardening mishap. Explain how it happened."

# Tokenize the prompt and place its input IDs on the selected device in one step.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

**Compare runtime:**

In [None]:
%%timeit -n 1
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=100,
  use_cache=True
)

In [None]:
%%timeit -n 1
generation_output = model.generate(
  input_ids=input_ids,
  max_new_tokens=100,
  use_cache=False
)

### Sampling

In [None]:
# Show the chat template stored on the tokenizer.
print(tokenizer.chat_template)

In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# Only the newly generated text is returned, capped at 500 new tokens.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
)

# Chat-style prompt consisting of a single user turn.
# NOTE(review): a system turn such as
#   {"role": "system", "content": "You are a helpful assistant."}
# was left disabled here; confirm the chat template accepts a system role
# before enabling it.
messages = [
    dict(role="user", content="Create a funny joke about chickens."),
]

In [None]:
# Greedy output generation
output = pipe(messages, do_sample=False)
print(output[0]["generated_text"])

**Compare different temperatures:**

In [None]:
# Sampling with temperature set to 1; change the value to compare outputs.
output = pipe(messages, do_sample=True, temperature=1)
generated_text = output[0]["generated_text"]
print(generated_text)

**Compare different `top_p` values:**

In [None]:
# Sampling with top_p set to 1; change the value to compare outputs.
output = pipe(messages, do_sample=True, top_p=1)
generated_text = output[0]["generated_text"]
print(generated_text)