In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
from collections import defaultdict

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# dtype and device placement are delegated to the library via the "auto" settings
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Ensure pad_token is set, but keep the tokenizer's own pad token when it has one:
# overwriting it with EOS makes padding indistinguishable from a real end-of-turn.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Show which special tokens are in play (useful when checking EOS/pad handling)
print("Special tokens map:", tokenizer.special_tokens_map)
print("All special tokens:", tokenizer.all_special_tokens)
print("All special token IDs:", tokenizer.all_special_ids)

# Get the model's default dtype (dtype of the first parameter tensor)
default_dtype = next(model.parameters()).dtype

# All message threads: every conversation pairs the same system prompt with
# one user question. Each thread gets its own freshly built message dicts.
_SYSTEM_PROMPT = "You are a helpful assistant."
_USER_QUESTIONS = [
    "Tell me about the history of artificial intelligence.",
    "What is the capital city of France?",
    "Explain the theory of relativity.",
    "How does the process of photosynthesis work?",
    "What is the tallest mountain in the world?",
    "Who wrote 'To Kill a Mockingbird'?",
    "What is the speed of light?",
    "Describe the process of evolution.",
    "What is quantum computing?",
    "Who was Albert Einstein?",
]

pending_threads = [
    {
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": question},
        ]
    }
    for question in _USER_QUESTIONS
]

# Generation limits
max_new_tokens = 50  # cap on tokens generated per sequence
max_batch_size = 4   # cap on sequences decoded concurrently
eos_token_id = tokenizer.eos_token_id

# Scheduler state
active_sequences = []    # sequences currently being decoded
finished_sequences = []  # sequences that hit EOS or their length cap
sequence_id = 0          # next unique sequence identifier to hand out

# Model geometry
num_layers = model.config.num_hidden_layers

# Number of key/value heads (differs from the attention head count under
# Grouped Query Attention). The first attribute present on the config wins;
# the plain attention head count is the fallback when neither GQA field exists.
_model_config = model.config
num_kv_heads = (
    _model_config.num_key_value_heads
    if hasattr(_model_config, "num_key_value_heads")
    else _model_config.num_key_value_groups
    if hasattr(_model_config, "num_key_value_groups")
    else _model_config.num_attention_heads
)

# Main loop: continuous batching with one KV cache per sequence.
#
# Invariants maintained for every active sequence:
#   * generated_ids and position_ids always have the same length (one position per token);
#   * past_key_values (once set) covers every token except the most recent one,
#     which is the token fed to the model on the next decode step.
#
# NOTE(review): the cache is handled in the per-layer (key, value) tuple layout with
# the sequence axis at dim 2, as the original code did. Newer transformers releases
# may require a Cache object instead - confirm against the installed version.


def _start_sequence(messages, seq_id):
    """Tokenize one chat thread and build its decoding state.

    Args:
        messages: list of {"role": ..., "content": ...} dicts for one conversation.
        seq_id: unique integer stored under "id".

    Returns:
        dict with the prompt tensors (batch dimension of 1), the running
        ``generated_ids`` / ``position_ids``, ``past_key_values`` set to None
        (meaning "prompt not yet run"), and the bookkeeping fields ``finished``,
        ``max_length`` and ``prompt_length``.
    """
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_input = tokenizer(text, return_tensors="pt")
    input_ids = model_input["input_ids"].to(model.device)  # Shape: [1, seq_len]
    attention_mask = model_input["attention_mask"].to(model.device)
    # Positions count attended tokens from 0
    position_ids = (attention_mask.cumsum(dim=1) - 1).clamp(min=0)
    prompt_length = input_ids.shape[1]
    return {
        "id": seq_id,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
        "generated_ids": input_ids.clone(),
        "past_key_values": None,  # filled in by the prefill step
        "finished": False,
        "max_length": prompt_length + max_new_tokens,
        "prompt_length": prompt_length,
    }


def _row_cache(layer_caches, row, length=None):
    """Return batch row ``row`` of a per-layer (key, value) cache.

    The sequence axis (dim 2) is cut to ``length`` entries when given, so cache
    slots produced by right-padding are never stored for a sequence.
    """
    return [
        (
            layer_caches[layer_idx][0][row:row + 1, :, :length],
            layer_caches[layer_idx][1][row:row + 1, :, :length],
        )
        for layer_idx in range(num_layers)
    ]


def _accept_token(seq, next_token_logits):
    """Greedily pick the next token from ``next_token_logits`` and update ``seq`` in place."""
    next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # Shape: [1]
    next_token = next_token.to(seq["generated_ids"].device)
    token_id = next_token.item()

    seq["generated_ids"] = torch.cat([seq["generated_ids"], next_token.unsqueeze(0)], dim=1)
    # The new token sits one position after the previous last token
    seq["position_ids"] = torch.cat([seq["position_ids"], seq["position_ids"][:, -1:] + 1], dim=1)

    # Check for EOS token or max length
    if token_id == eos_token_id or seq["generated_ids"].shape[1] >= seq["max_length"]:
        seq["finished"] = True
        seq["past_key_values"] = None  # release the cache; it is never read again
        print(f"Sequence {seq['id']} finished generating.")
    else:
        print(f"Sequence {seq['id']} generated token id {token_id}")


while pending_threads or active_sequences:
    # Fill free batch slots with new message threads. This runs at the top of every
    # iteration, so no separate refill is needed after finished sequences are retired.
    while len(active_sequences) < max_batch_size and pending_threads:
        thread = pending_threads.pop(0)
        messages = thread["messages"]
        print("\n\n---processing messages: ", messages)
        sequence = _start_sequence(messages, sequence_id)
        active_sequences.append(sequence)
        sequence_id += 1
        print(f"Added sequence {sequence['id']} to active sequences")

    if not active_sequences:
        break  # No active sequences left to process (e.g. max_batch_size == 0)

    # Sequences without a cache still need their prompt run (prefill);
    # the rest advance by exactly one token (decode).
    initial_sequences = [seq for seq in active_sequences if seq["past_key_values"] is None]
    subsequent_sequences = [seq for seq in active_sequences if seq["past_key_values"] is not None]

    # Prefill: run whole prompts, right-padded to a common length
    if initial_sequences:
        batch_input_ids = torch.nn.utils.rnn.pad_sequence(
            [seq["input_ids"].squeeze(0) for seq in initial_sequences],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        )
        batch_attention_mask = torch.nn.utils.rnn.pad_sequence(
            [seq["attention_mask"].squeeze(0) for seq in initial_sequences],
            batch_first=True,
            padding_value=0,
        )
        batch_position_ids = torch.nn.utils.rnn.pad_sequence(
            [seq["position_ids"].squeeze(0) for seq in initial_sequences],
            batch_first=True,
            padding_value=0,
        )

        model_inputs = {
            "input_ids": batch_input_ids,
            "attention_mask": batch_attention_mask,
            "position_ids": batch_position_ids,
            "use_cache": True,
        }

        print(f"Calling model for initial sequences...")
        # Inference only: without no_grad the cached K/V tensors would keep the
        # autograd graph of every forward pass alive.
        with torch.no_grad():
            outputs = model(**model_inputs)
        print("Model call completed.")

        for idx, seq in enumerate(initial_sequences):
            prompt_length = seq["prompt_length"]
            # Padding is on the right, so the real tokens occupy [0, prompt_length):
            # keep only those cache slots and read the logits at the last real token,
            # not at the last (possibly pad) column of the batch.
            seq["past_key_values"] = _row_cache(outputs.past_key_values, idx, prompt_length)
            _accept_token(seq, outputs.logits[idx, prompt_length - 1, :])

    # Decode: feed each sequence's most recent token together with its cache
    if subsequent_sequences:
        # Group sequences by cache length so caches can be concatenated without padding
        seq_len_to_sequences = defaultdict(list)
        for seq in subsequent_sequences:
            seq_len = seq["past_key_values"][0][0].shape[2]  # seq_len dimension
            seq_len_to_sequences[seq_len].append(seq)

        for seq_len, sequences in seq_len_to_sequences.items():
            # Stack the per-sequence caches along the batch dimension, layer by layer
            batch_past_key_values = [
                (
                    torch.cat([seq["past_key_values"][layer_idx][0] for seq in sequences], dim=0),
                    torch.cat([seq["past_key_values"][layer_idx][1] for seq in sequences], dim=0),
                )
                for layer_idx in range(num_layers)
            ]

            # The last generated token and the position already recorded for it
            batch_input_ids = torch.cat([seq["generated_ids"][:, -1:] for seq in sequences], dim=0)
            batch_position_ids = torch.cat([seq["position_ids"][:, -1:] for seq in sequences], dim=0)
            # The mask must span the cached tokens plus the new one; nothing in a
            # group is padded, so it is all ones.
            batch_attention_mask = torch.ones(
                (len(sequences), seq_len + 1),
                dtype=sequences[0]["attention_mask"].dtype,
                device=batch_input_ids.device,
            )

            model_inputs = {
                "input_ids": batch_input_ids,
                "attention_mask": batch_attention_mask,
                "position_ids": batch_position_ids,
                "past_key_values": batch_past_key_values,
                "use_cache": True,
            }

            print(f"Processing sequences with past_key_values seq_len = {seq_len}")
            with torch.no_grad():
                outputs = model(**model_inputs)
            print("Model call completed.")

            for idx, seq in enumerate(sequences):
                seq["past_key_values"] = _row_cache(outputs.past_key_values, idx)
                _accept_token(seq, outputs.logits[idx, -1, :])

    # Retire finished sequences; the freed slots are refilled at the top of the loop
    finished_sequences.extend(seq for seq in active_sequences if seq["finished"])
    active_sequences = [seq for seq in active_sequences if not seq["finished"]]

# Report results: for each finished sequence print the full decoded text
# (prompt + completion) and then the completion alone.
for seq in finished_sequences:
    generated_ids = seq["generated_ids"]  # batch dimension of 1, prompt included
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"\nFull Generated Text {seq['id']}:\n{generated_text}\n")

    # Everything after the first prompt_length tokens was produced by the model
    prompt_token_count = seq["prompt_length"]
    response_ids = generated_ids[:, prompt_token_count:]
    assistant_response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
    print(f"Assistant's Response {seq['id']}:\n{assistant_response}\n")

  from .autonotebook import tqdm as notebook_tqdm



Using device: cuda
GPU: NVIDIA L40S


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.06it/s]



Model Configuration:

Config attributes:
vocab_size: 152064
embedding_size: 152064
max_position_embeddings: 4096
hidden_size: 3584
intermediate_size: 37888
num_hidden_layers: 28
num_attention_heads: 28
layer_norm_eps: 1e-06
weight_tying: False
use_position_ids: True
attention_layer_norm: False
num_key_value_heads: 4
initializer_range: 0.02
use_cache: True
rope_theta: 1000000.0
clip_qkv: None
qkv_bias: True
norm_after: False
tie_word_embeddings: False
layer_norm_type: rms
return_dict: True
output_hidden_states: False
output_attentions: False
torchscript: False
torch_dtype: torch.float16
use_bfloat16: False
tf_legacy_loss: False
pruned_heads: {}
chunk_size_feed_forward: 0
is_encoder_decoder: False
is_decoder: False
cross_attention_hidden_size: None
add_cross_attention: False
tie_encoder_decoder: False
max_length: 20
min_length: 0
do_sample: False
early_stopping: False
num_beams: 1
num_beam_groups: 1
diversity_penalty: 0.0
temperature: 1.0
top_k: 50
top_p: 1.0
typical_p: 1.0
repetition_p

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 44.31 GiB of which 41.00 MiB is free. Process 11287 has 24.34 GiB memory in use. Including non-PyTorch memory, this process has 19.91 GiB memory in use. Of the allocated memory 19.15 GiB is allocated by PyTorch, and 267.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)