In [1]:
import os
import sys
import torch
from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

model_name = 'HuggingFaceH4/zephyr-7b-alpha'
tokenizer = AutoTokenizer.from_pretrained(model_name)

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    #quantization_config=bnb_config, # Закомментировать в случае использования небольших моделей, где не нужна квантизация
    device_map="auto", # В случае нескольких GPU нужно указать конкретную, cuda:0, например
    attn_implementation="sdpa", #attn_implementation='flash_attention_2' для GPU, которые поддерживают
)
model.eval()

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), e

In [2]:
from transformers import GenerationConfig

generation_config = GenerationConfig.from_pretrained(model_name)
generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

In [3]:
generation_config.temperature = 0.3
generation_config.max_new_tokens = 512
generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_new_tokens": 512,
  "temperature": 0.3
}

In [4]:
print(tokenizer.apply_chat_template([{'role': 'user', 'content': 'Что такое ИИ?'}], tokenize=False, add_generation_prompt=True))

<|user|>
Что такое ИИ?</s>
<|assistant|>



In [5]:
print(tokenizer.chat_template)

{% for message in messages %}
{% if message['role'] == 'user' %}
{{ '<|user|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'system' %}
{{ '<|system|>
' + message['content'] + eos_token }}
{% elif message['role'] == 'assistant' %}
{{ '<|assistant|>
'  + message['content'] + eos_token }}
{% endif %}
{% if loop.last and add_generation_prompt %}
{{ '<|assistant|>' }}
{% endif %}
{% endfor %}


In [6]:
prompts = [
    {'role': 'user', 'content': 'Объясни квантовую механику простыми словами'},
    {'role': 'user', 'content': 'Напиши короткий рассказ о космосе'},
    {'role': 'user', 'content': 'Составь рецепт борща'},
    {'role': 'user', 'content': 'Объясни как работает блокчейн'},
    {'role': 'user', 'content': 'Сделай краткое резюме романа "Преступление и наказание"'},
    {'role': 'user', 'content': 'Придумай сборку на акса в доте 2'},
    {'role': 'user', 'content': 'Напиши поздравление с днем рождения для Артура'},
    {'role': 'user', 'content': 'Объясни как работает градиентный спуск'},
    {'role': 'user', 'content': 'Сделай список идей для стартапа в сфере образования'},
    {'role': 'user', 'content': 'Напиши шутку про программистов'}
]

In [7]:
import torch

input_ids = tokenizer.apply_chat_template(
    [[prompt] for prompt in prompts],
    add_generation_prompt=True,
    tokenize=True,
    padding=True,
    return_tensors='pt'
)

input_ids = input_ids.to(model.device)
attention_mask = (input_ids != tokenizer.pad_token_id).int()

outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    generation_config=generation_config
)

generated_responses = []
for i, output in enumerate(outputs):
    original_length = attention_mask[i].sum().item()
    generated_tokens = output[original_length:].detach().cpu()
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    generated_responses.append(response)

for i, (prompt, response) in enumerate(zip(prompts, generated_responses)):
    print(f"Промпт {i+1}: {prompt['content']}")
    print(f"Ответ {i+1}: {response}")
    print("-" * 50)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Промпт 1: Объясни квантовую механику простыми словами
Ответ 1: assistant|>
Quantum mechanics is a way of understanding the behavior of tiny particles, like atoms and subatomic particles, that is very different from the way we understand the behavior of everyday objects. In classical physics, we believe that particles have definite positions and velocities at all times, but in quantum mechanics, particles can exist in multiple places at once, and their behavior is probabilistic.

In quantum mechanics, particles are described by something called a wave function, which is a mathematical equation that tells us the probability of finding a particle in a particular location or with a particular property. The wave function can be thought of as a "cloud" of probabilities that surrounds the particle.

When we measure a particle, the wave function "collapses" to a single value, and we observe the particle in a particular state. But before the measurement, the particle can exist in multiple state