In [None]:
!pip install datasets
!pip install ptflops
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.profiler import profile, record_function, ProfilerActivity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import time

In [29]:
model_name = "EleutherAI/pythia-160m"
model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings((len(tokenizer) + 7) // 8 * 8)  # Ridimensiona a un multiplo di 8

dataset = load_dataset("ag_news", split="test[:1%]")
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 76
})

In [None]:
def preprocess(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(preprocess, batched=True)

input_ids_list = [torch.tensor(item['input_ids']) for item in dataset]
input_ids = torch.stack(input_ids_list).cuda()
attention_masks = torch.stack([torch.tensor(item['attention_mask']) for item in dataset]).cuda()

if we want to generate one output at a time

In [6]:
def generate_autoregressively(model, tokenizer, input_ids, attention_mask, max_new_tokens=50):
    model.eval()
    with torch.no_grad():
        start_time = time.time()
        generated_sequence = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
        generated_text = tokenizer.decode(generated_sequence[0], skip_special_tokens=True)
        end_time = time.time()
        execution_time = end_time - start_time
    return generated_text, execution_time

single_input_ids = input_ids[2].unsqueeze(0)  
single_attention_mask = attention_masks[2].unsqueeze(0)  
generated_text, single_execution_time = generate_autoregressively(model, tokenizer, single_input_ids, single_attention_mask, max_new_tokens=50)

print(f"Generated Text: {generated_text}")
print(f"Execution Time: {single_execution_time} seconds")

Generated Text: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins., The company is now working on a new method of producing peptides that is more efficient than the current method of producing peptides.

The company is also working on a new method of producing peptides that is more efficient than the current method of producing peptides
Execution Time: 0.6479079723358154 seconds


if we want to generate all the input at once, faster way of doing it

In [38]:
def generate_autoregressively(model, tokenizer, input_ids, attention_mask, max_new_tokens=50):
    model.eval()
    with torch.no_grad():
        start_time = time.time()
        generated_sequence = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
        end_time = time.time()
        execution_time = end_time - start_time
    return generated_sequence, execution_time

# Genera il testo e misura il tempo di esecuzione
generated_sequence, total_execution_time = generate_autoregressively(model, tokenizer, input_ids, attention_masks, max_new_tokens=50)

# Calcola il tempo medio di esecuzione per input
avg_execution_time = total_execution_time / input_ids.size(0)
print(f"Average Execution Time per Input: {avg_execution_time} seconds")


Average Execution Time per Input: 0.03136988690024928 seconds


code with also the flops, it seem to work correctly

In [None]:
!pip install deepspeed
import deepspeed

def get_flops(model, input_ids, attention_mask, max_new_tokens=50):
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
        with record_function("model_inference"):
            model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)

    flops = sum(evt.cpu_time_total for evt in prof.key_averages() if evt.key == "model_inference")
    return flops

In [11]:
def generate_and_profile(model, tokenizer, input_ids, attention_mask, max_new_tokens=50):
    model.eval()
    generated_sequences = []
    total_times = []
    flops_list = []
    with torch.no_grad():
        for i in range(input_ids.size(0)):
            start_time = time.time()
            input_sequence = input_ids[i:i+1]
            mask = attention_mask[i:i+1]
            generated_sequence = model.generate(
                input_sequence,
                attention_mask=mask,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id
            )
            generated_text = tokenizer.decode(generated_sequence[0], skip_special_tokens=True)
            end_time = time.time()
            generated_sequences.append(generated_text)
            execution_time = end_time - start_time
            total_times.append(execution_time)

            # Calcola i FLOPs per il singolo input
            flops = get_flops(model, input_sequence, mask, max_new_tokens)
            flops_list.append(flops)

    return generated_sequences, total_times, flops_list

# Genera i testi e calcola i tempi di esecuzione e i FLOPs per l'intero dataset
generated_texts, total_times, flops_list = generate_and_profile(model, tokenizer, input_ids, attention_masks, max_new_tokens=20)

In [23]:
print(np.mean(total_times))
print(np.mean(flops_list))

0.35168942338541936
545270.052631579


fastest implementation to iterate over the examples and retrieve text and time

In [44]:

def generate_autoregressively(model, tokenizer, input_ids, attention_mask, max_new_tokens=20):
    model.eval()
    generated_sequences = []
    total_times = []
    with torch.no_grad():
        for i in range(input_ids.size(0)):
            start_time = time.time()
            input_sequence = input_ids[i:i+1]
            mask = attention_mask[i:i+1]
            generated_sequence = model.generate(
                input_sequence,
                attention_mask=mask,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id
            )
            generated_text = tokenizer.decode(generated_sequence[0], skip_special_tokens=True)
            end_time = time.time()
            generated_sequences.append(generated_text)
            execution_time = end_time - start_time
            total_times.append(execution_time)
    return generated_sequences, total_times

# Genera i testi per l'intero dataset
generated_texts, times = generate_autoregressively(model, tokenizer, input_ids, attention_masks, max_new_tokens=20)

In [45]:
np.mean(times)

0.3361828327178955

In [9]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

One text

In [50]:
example = "Hello, I am Angelo, I am from Italy and"
input = tokenizer(example, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

input_ids = input['input_ids'].cuda()
attention_masks = input['attention_mask'].cuda()
generate_autoregressively(model, tokenizer, input_ids, attention_masks, max_new_tokens=20)

(['Hello, I am Angelo, I am from Italy and, I am from the United States. I am a student of the University of California, Berkeley.'],
 [0.5370926856994629])

Top-K