## 1. Downloading the pre-trained model

In [None]:
from huggingface_hub import login
import os 

token = os.getenv("HUGGINGFACE_TOKEN")

login(token=token)

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "utter-project/EuroLLM-1.7B-Instruct",
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained("utter-project/EuroLLM-1.7B-Instruct")
tokenizer.pad_token = tokenizer.eos_token

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


## 2. Downloading and formating the dataset

In [17]:
from datasets import load_dataset

dataset = load_dataset("ai-forever/POLLUX", split="test[:20000]")

def format(example):
  text = f"<s>[INST] {example['instruction']}[/INST] {example['answer']}</s>"

  tokens = tokenizer(
    text,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt"
  )

  labels = tokens['input_ids'].clone()

  inst_token_id = tokenizer.convert_tokens_to_ids("[/INST]")
  if inst_token_id in tokens["input_ids"][0]:
    end_idx = (tokens["input_ids"][0] == inst_token_id).nonzero(as_tuple=True)[0][-1].item()
    labels[0, :end_idx+1] = -100

  tokens['labels'] = labels

  return {k: v[0] for k, v in tokens.items()}

dataset = dataset.map(format, remove_columns=dataset.column_names)

Map: 100%|██████████| 20000/20000 [00:23<00:00, 846.91 examples/s]


## 3. Preparating for the finetuning

In [18]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 11,010,048 || all params: 1,667,860,480 || trainable%: 0.6601


## 4. Train the model

In [None]:
from transformers import TrainingArguments
import torch

torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)

training_args = TrainingArguments(
    output_dir="./EURO_LLM_LORA",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    logging_steps=10,
    save_steps=100,
    num_train_epochs=3,
    fp16=True,
    save_total_limit=2,
    report_to="none",
)

from transformers import Trainer, DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()

___

## 5. Test the model and print metrics

###### Uploading model weights

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from safetensors.torch import load_file
from peft import get_peft_model, LoraConfig

lora_checkpoint = r"EURO_LLM_LORA\checkpoint-3750\model.safetensors"
base_model = "utter-project/EuroLLM-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type="CAUSAL_LM"
)

model = AutoModelForCausalLM.from_pretrained(base_model)
model = get_peft_model(model, lora_config)
state_dict = load_file(lora_checkpoint)
model.load_state_dict(state_dict)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

###### Uploading dataset for test and function for generate answer by model

In [9]:
from datasets import load_dataset

dataset = load_dataset("ai-forever/POLLUX", split="test[:200]")

def generate_responses(batch, max_new_tokens=64):
    text = [f"<s>[INST] {example}[/INST]</s>" for example in batch['instruction']]

    tokens = tokenizer(
      text,
      truncation=True,
      padding=True,
      return_tensors="pt"
    ).to(model.device)
    
    with torch.no_grad():
      outputs = model.generate(**tokens, max_new_tokens=max_new_tokens, do_sample=False)

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return {"predicted": decoded}

###### Get rouge, bleu scores and predictions

In [10]:
preds = dataset.map(generate_responses, batched=True, batch_size=8)

references = [a for a in preds['answer']]
predictions = [a for a in preds['predicted']]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:4 for open-end generation.
Map:   4%|▍         | 8/200 [00:04<01:49,  1.76 examples/s]Setting `pad_token_id` to `eos_token_id`:4 for open-end generation.
Map:   8%|▊         | 16/200 [00:11<02:19,  1.32 examples/s]Setting `pad_token_id` to `eos_token_id`:4 for open-end generation.
Map:  12%|█▏        | 24/200 [00:14<01:36,  1.83 examples/s]Setting `pad_token_id` to `eos_token_id`:4 for open-end generation.
Map:  16%|█▌        | 32/200 [00:20<01:49,  1.53 examples/s]Setting `pad_token_id` to `eos_token_id`:4 for open-end generation.
Map:  20%|██        | 40/200 [00:22<01:23,  1.91 examples/s]Setting `pad_token_id` to `eos_token_id`:4 for open-end generation.
Map:  24%|██▍       | 48/200 [00:35<02:11,  1.15 examples/s]Setting `pad_token_id` to `eos_token_id`:

###### Get some references and generated answers

In [11]:
import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

rouge_score = rouge.compute(predictions=predictions, references=references)
bleu_score = bleu.compute(predictions=predictions, references=references)

print("Rouge Score:", rouge_score)
print("Bleu Score:", bleu_score)

Rouge Score: {'rouge1': np.float64(0.12417919367405877), 'rouge2': np.float64(0.056647612097115566), 'rougeL': np.float64(0.11901776602841369), 'rougeLsum': np.float64(0.12329286965614666)}
Bleu Score: {'bleu': 0.0531382455687766, 'precisions': [0.20163888348360035, 0.07326659573082017, 0.04462279993893263, 0.032135112045738316], 'brevity_penalty': 0.7832546435533129, 'length_ratio': 0.8036663770634231, 'translation_length': 46251, 'reference_length': 57550}


In [7]:
import random
for i in random.sample(range(len(predictions)), 5):
    print(f"Instruction: {dataset[i]['instruction']}")
    print(f"True answer: {dataset[i]['answer']}")
    print(f"Generated answer: {predictions[i]}")
    print("_" * 50)

Instruction: Составь мне план научного доклада об измерении содержания метана в испарениях над морем Лаптевых.
True answer: Ниже приведён примерный план научного доклада на тему «Измерение содержания метана в испарениях над морем Лаптевых». Каждый раздел может быть дополнен более детальной информацией или статистическими данными в зависимости от объёма работы и целей исследования.

1. Введение  
   1.1. Актуальность исследования  
       – Краткое описание глобальных климатических изменений и роли метана как парникового газа.  
       – Обоснование важности изучения выбросов метана в арктических регионах.  
   1.2. Цели и задачи  
       – Общая цель: получить данные о концентрациях метана в испарениях над морем Лаптевых и выявить источники выбросов.  
       – Конкретные задачи (например, усовершенствование методологии измерения, определение влияющих факторов и т.п.).  

2. Обзор литературы и теоретические предпосылки  
   2.1. Метан как парниковый газ  
       – Основные характеристи

___