In [None]:
# Set CUDA_VISIBLE_DEVICES for this kernel so that CUDA libraries only see device index 3.
# NOTE(review): presumably this has to run before torch initialises CUDA — confirm.
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_VISIBLE_DEVICES=3


In [1]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import re

import torch
import transformers

In [3]:
# !ls /assets/models/

In [4]:
# Path of the checkpoint that is passed to `from_pretrained` for both the tokenizer
# and the model. Change this to the model you need to use; uncomment the
# `!ls /assets/models/` cell above to see the list of available models.
model_name_or_path = "/assets/models/meta-llama-3.2-instruct-3b"

In [5]:
# Load the tokenizer that belongs to the checkpoint in `model_name_or_path`.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
print("tokenizer loaded")

# LLaMa's tokenizer may not define a PAD token. Reuse EOS as PAD when none is set so
# that padded (batched) inputs work; a PAD token that already exists is left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# For decoder-only models, pad on the left so that the real prompt tokens stay
# adjacent to the tokens that get generated after them.
tokenizer.padding_side = "left"

tokenizer loaded


In [6]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # Map the different parts of the model onto the available GPU(s) automatically.
    device_map="auto",
    # Loading the model in full precision can use a lot of memory, so the weights
    # are loaded in bfloat16 instead. This is reduced precision, not quantization.
    # A `torch.dtype` object is passed because not every transformers release
    # accepts the dtype as a plain string.
    torch_dtype=torch.bfloat16,
)

# The notebook only runs inference, so put the model in evaluation mode.
# As the last expression of the cell this also displays the module tree.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm

### Generation

More information on using the transformers library and its components can be found here: https://huggingface.co/docs/transformers/llm_tutorial

Specifically, for text generation, the following can be useful:
- https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
- https://huggingface.co/blog/how-to-generate

In [13]:
TASK_PROMPT = '''Generate 10 English sentences each for the relations: 
grandmother, grandfather, uncle, aunt, brother-in-law, sister-in-law, cousin, nephew, niece. 
For each relation, generate 10 sentences using the following topics: games, deep talks, questions, exclamations, and other forms of speeches. 
Ensure the sentences are varied topics and include different forms of possessive pronouns (e.g., my, their, his, her). 
Only provide the response as a Python list of strings. 
Sample output: 
["My grandmother and I play chess together.", "Their grandmother and I have deep talks.", ... ]'''

# The prompt is tokenized as raw text (no chat template is applied here) and the
# resulting tensors are moved to the device the model lives on.
inputs = tokenizer(TASK_PROMPT, return_tensors="pt").to(model.device)

# No seed is fixed on purpose: the following cell appends to its output file, so
# re-running this cell is expected to yield different samples.
with torch.inference_mode():
    generated_ids = model.generate(
        **inputs,
        # Sampling combined with beam search; the two best sequences are returned.
        do_sample=True,
        temperature=0.7,
        num_beams=5,
        num_return_sequences=2,
        max_new_tokens=1000,
        # Name the padding token explicitly instead of relying on generate()'s
        # implicit fallback to EOS (which also logs a warning on every call).
        pad_token_id=tokenizer.eos_token_id,
    )

# Decoding needs no inference context. Each decoded string contains the echoed
# prompt followed by the newly generated text. Token ids and decoded strings get
# separate names; `outputs` is the list of strings that later cells consume.
outputs = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [14]:
# Keep a running log of every generation: each decoded output is appended to
# generated_outputs.txt (followed by a newline) and echoed below the cell.
with open('generated_outputs.txt', 'a') as log_file:
    for generated_text in outputs:
        log_file.write(generated_text + '\n')
        print(generated_text)






Generate 10 English sentences each for the relations: 
grandmother, grandfather, uncle, aunt, brother-in-law, sister-in-law, cousin, nephew, niece. 
For each relation, generate 10 sentences using the following topics: games, deep talks, questions, exclamations, and other forms of speeches. 
Ensure the sentences are varied topics and include different forms of possessive pronouns (e.g., my, their, his, her). 
Only provide the response as a Python list of strings. 
Sample output: 
["My grandmother and I play chess together.", "Their grandmother and I have deep talks.",... ] 

```python
def generate_sentences():
    relations = ["grandmother", "grandfather", "uncle", "aunt", "brother-in-law", "sister-in-law", "cousin", "nephew", "niece"]
    topics = ["games", "deep talks", "questions", "exclamations", "other forms of speeches"]
    sentences = []

    for relation in relations:
        for topic in topics:
            sentences.append(f"{relation} and I {topic}.")
            sentences.a

In [None]:
# Matches one double- or single-quoted string literal (backslash escapes allowed).
_QUOTED_ITEM = re.compile(r'"((?:[^"\\]|\\.)*)"|\'((?:[^\'\\]|\\.)*)\'')


def _strip_echoed_prompt(response, prompt):
    """Return the part of `response` that follows the echoed `prompt`.

    Whitespace is ignored while matching, because decoding with
    `clean_up_tokenization_spaces=True` can drop spaces from the echoed prompt
    (`", ... ]` comes back as `",... ]`), which makes an exact `str.find` miss.
    If `response` does not start with `prompt`, it is returned unchanged, i.e.
    the whole text is treated as the completion.
    """
    pos = 0
    for ch in prompt:
        if ch.isspace():
            continue
        while pos < len(response) and response[pos].isspace():
            pos += 1
        if pos == len(response) or response[pos] != ch:
            return response
        pos += 1
    return response[pos:]


def _extract_quoted_items(text):
    """Return the quoted strings inside the first `[...]` list found in `text`.

    Items are taken from quote to quote rather than split on commas, so a
    sentence may itself contain commas. A list without a closing bracket
    (truncated generation) is read up to the end of the text; text without any
    opening bracket yields an empty list. Escape sequences are kept as written.
    """
    start = text.find('[')
    if start == -1:
        return []
    end = text.find(']', start + 1)
    list_body = text[start + 1:] if end == -1 else text[start + 1:end]
    return [double or single for double, single in _QUOTED_ITEM.findall(list_body)]


# Only the first returned sequence is parsed.
response = outputs[0]
completion = _strip_echoed_prompt(response, TASK_PROMPT)
sentences = _extract_quoted_items(completion)
print(sentences)

['grandmother', 'grandfather', 'uncle', 'aunt', 'brother-in-law', 'sister-in-law', 'cousin', 'nephew', 'niece']


In [None]:
# Write the parsed sentences to generated_sentences.txt, one per line.
# The file is opened in write mode, so content from an earlier run is replaced.
with open('generated_sentences.txt', 'w') as sentence_file:
    sentence_file.writelines(sentence + '\n' for sentence in sentences)

For instruction-tuned (RLHF'd) models such as LLaMa-3.1 Instruct, an additional prompt-formatting step is needed so that the model generates the desired output. The template is applied with the `tokenizer.apply_chat_template` function, which adds the model's formatting tokens to your prompt. Use it only with instruction-fine-tuned models.

In [None]:
# `apply_chat_template` requires jinja2>=3.1.0. If the installed version is older,
# run the line below once (a kernel restart may be needed afterwards). The quotes
# matter: unquoted, the shell treats `>=3.1.0` as an output redirection.
# %pip install "jinja2>=3.1.0"

TASK_PROMPT = "Please answer my question. What is the capital of India?"
TASK_CONVERSATION = [
    # System Prompt: This is optional, and not all models support this.
    # But use it if you need explicit instructions to be followed.
    dict(role='system', content='You are a helpful assistant.'),
    # Your message (as if on the web interface) goes here.
    # Past history can be added to this conversation too.
    dict(role='user', content=TASK_PROMPT)
]

# Format the conversation to a text prompt, using apply chat template.
conversation_prompt = tokenizer.apply_chat_template(
    TASK_CONVERSATION,
    tokenize=False,
    # Needed to allow the model to start its reply instead of completing yours.
    add_generation_prompt=True
)
# The template already inserted the special tokens, so the tokenizer must not add
# them a second time. This is easy to overlook, so be careful.
inputs = tokenizer(
    conversation_prompt, return_tensors="pt", add_special_tokens=False
).to(model.device)

# Generation process is the same as before.
with torch.inference_mode():
    generated_ids = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.2,
        num_beams=2,
        num_return_sequences=2,
        max_new_tokens=10,
        # Name the padding token explicitly instead of relying on generate()'s
        # implicit fallback to EOS (which also logs a warning on every call).
        pad_token_id=tokenizer.eos_token_id,
    )

# Decoding and printing need no inference context. Token ids and decoded strings
# get separate names; `outputs` ends up as the list of decoded strings.
outputs = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

for output in outputs:
    print(output)
    print('-' * 50)
    print()

ImportError: apply_chat_template requires jinja2>=3.1.0 to be installed. Your version is 2.11.3.