# Llama evaluation

In [1]:
def count_parameters(model):
    """Print the total number of parameters of ``model``, in millions.

    Args:
        model: any object whose ``parameters()`` yields items exposing
            ``numel()`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The count is only printed, formatted with three decimals.
    """
    element_count = 0
    for param in model.parameters():
        element_count += param.numel()
    total_params = element_count / 10**6
    print(f'total_params: {total_params:.3f}M')

In [2]:
# Fetch the lm-evaluation-harness sources into the current working directory.
!git clone https://github.com/EleutherAI/lm-evaluation-harness

Cloning into 'lm-evaluation-harness'...
remote: Enumerating objects: 34143, done.[K
remote: Counting objects: 100% (252/252), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 34143 (delta 150), reused 189 (delta 109), pack-reused 33891[K
Receiving objects: 100% (34143/34143), 23.17 MiB | 21.97 MiB/s, done.
Resolving deltas: 100% (23884/23884), done.


In [3]:
# Enter the cloned repository. The explicit %cd magic is used instead of a
# bare `cd` so the cell does not depend on IPython's automagic setting.
%cd lm-evaluation-harness

/kaggle/working/lm-evaluation-harness


In [4]:
# Pin the harness to a fixed commit so every run evaluates with the same code.
# The cells below import `lm_eval.base.BaseLM` from this checkout.
!git checkout 115206dc89dad67b8beaa90051fb52db77f0a529

Note: switching to '115206dc89dad67b8beaa90051fb52db77f0a529'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 115206dc Merge pull request #1005 from Andrei-Aksionov/fix_indent_lm_eval_tasks_bigbench


In [5]:
# Editable install of the checked-out harness. %pip (rather than !pip) installs
# into the environment of the running kernel, which is where `lm_eval` is imported.
%pip install -e .

Obtaining file:///kaggle/working/lm-evaluation-harness
  Preparing metadata (setup.py) ... [?25ldone
Collecting einops (from lm_eval==0.3.0)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting jsonlines (from lm_eval==0.3.0)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting openai>=0.6.4 (from lm_eval==0.3.0)
  Downloading openai-1.30.4-py3-none-any.whl.metadata (21 kB)
Collecting omegaconf>=2.2 (from lm_eval==0.3.0)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting peft>=0.2.0 (from lm_eval==0.3.0)
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting pycountry (from lm_eval==0.3.0)
  Downloading pycountry-23.12.11-py3-none-any.whl.metadata (12 kB)
Collecting pytablewriter (from lm_eval==0.3.0)
  Downloading pytablewriter-1.2.0-py3-none-any.whl.metadata (37 kB)
Collecting rouge-score>=0.0.4 (from lm_eval==0.3.0)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [

In [None]:
# Debug aid: show the current working directory.
pwd

In [None]:
# !pip install lm_eval

In [None]:
# !pip install evaluate

In [6]:
import json
import sys
from pathlib import Path
from typing import Dict, List, Literal, Optional

from lm_eval import evaluator, tasks

import numpy as np
import torch
from torch.nn import Linear, CrossEntropyLoss
from transformers import BertTokenizer, BertModel, default_data_collator, pipeline, AutoModelForSequenceClassification
from transformers import LlamaForCausalLM, LlamaTokenizer
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from lm_eval.base import BaseLM

2024-05-29 18:46:32.658211: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 18:46:32.658312: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 18:46:32.788170: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# Load the pretrained checkpoint from the Hugging Face Hub and report its size.
checkpoint_name = "openlm-research/open_llama_3b_v2"
llama = LlamaForCausalLM.from_pretrained(checkpoint_name)

count_parameters(llama)

config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.85G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

total_params: 3426.474M


In [9]:
class ModelWrapper(BaseLM):
    """Adapter that exposes a causal language model through lm-eval's ``BaseLM`` API.

    The wrapper stores the model and tokenizer, moves the model to the target
    device and implements the hooks ``BaseLM`` requires (token encoding and
    decoding, a forward call returning logits, generation, and a few
    properties describing limits).

    Args:
        model: the model to evaluate; called as ``model(inps)`` and expected to
            provide ``generate`` and ``config.max_position_embeddings``.
        batch_size: configured per-GPU batch size (see ``batch_size``).
        tokenizer: tokenizer providing ``encode``, ``decode`` and an
            end-of-sequence id (``eos_token_id`` or, as a fallback, ``eos_id``).
        device: device the model is moved to and on which inputs are expected.
    """

    def __init__(
        self,
        model,
        batch_size,
        tokenizer,
        device
    ):
        super().__init__()
        self.model = model
        self.model.to(device)
        # Make sure dropout and similar layers are disabled during evaluation.
        self.model.eval()
        self.tokenizer = tokenizer
        self.batch_size_per_gpu = batch_size
        self.device_ = device

    @torch.inference_mode()
    def _model_call(self, inps):
        """Run a forward pass and return the model's per-token outputs.

        Args:
            inps: tensor of token ids passed straight to the model
                (presumably shaped (batch, sequence) -- confirm against BaseLM).

        Returns:
            ``outputs.logits`` when present, otherwise ``outputs.last_hidden_state``.

        Raises:
            ValueError: if the model output has neither attribute.
        """
        outputs = self.model(inps)
        if hasattr(outputs, 'logits'):
            return outputs.logits
        elif hasattr(outputs, 'last_hidden_state'):
            return outputs.last_hidden_state
        else:
            raise ValueError("Model output does not contain 'logits' or 'last_hidden_state'")

    @torch.inference_mode()
    def _model_generate(self, context, max_length, eos_token_id) -> torch.Tensor:
        """Greedily continue ``context`` until ``max_length`` or ``eos_token_id``.

        Generation is delegated to the model's own ``generate`` method, so no
        external helper or model-specific KV-cache handling is needed.

        Args:
            context: tensor of prompt token ids with a leading batch dimension of 1.
            max_length: maximum total length (prompt plus generated tokens).
            eos_token_id: token id that stops generation, or ``None``.

        Returns:
            Tensor with a leading batch dimension of 1 holding the prompt
            followed by the generated tokens.
        """
        # this only supports batch size 1
        assert context.shape[0] == 1
        generation_kwargs = {"do_sample": False, "max_length": max_length}
        if eos_token_id is not None:
            generation_kwargs["eos_token_id"] = eos_token_id
            # Reuse EOS as padding so generate() does not warn about a missing pad token.
            generation_kwargs["pad_token_id"] = eos_token_id
        return self.model.generate(context, **generation_kwargs)

    @property
    def batch_size(self):
        """Batch size reported to the harness: twice the configured per-GPU value.

        NOTE(review): the factor of 2 is kept from the original code; confirm it
        is intentional.
        """
        return self.batch_size_per_gpu*2

    @property
    def device(self):
        """Device given at construction time."""
        return self.device_

    @property
    def eot_token_id(self):
        """End-of-text token id taken from the tokenizer.

        Hugging Face tokenizers expose it as ``eos_token_id``; tokenizers that
        only provide ``eos_id`` are still supported as a fallback.
        """
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        eos = getattr(self.tokenizer, "eos_token_id", None)
        if eos is None:
            eos = self.tokenizer.eos_id
        return eos

    @property
    def max_gen_toks(self):
        """Maximum number of tokens to generate per request (fixed at 256)."""
        return 256

    @property
    def max_length(self):
        """Maximum sequence length, read from ``model.config.max_position_embeddings``."""
        return self.model.config.max_position_embeddings

    def tok_encode(self, string: str) -> List[int]:
        """Encode ``string`` into token ids using the wrapped tokenizer."""
        return self.tokenizer.encode(string)

    def tok_decode(self, tokens: List[int]) -> str:
        """Decode a list of token ids back into text."""
        t = torch.tensor(tokens)
        return self.tokenizer.decode(t)

In [10]:
# Evaluation settings; num_fewshot and limit are forwarded to evaluator.evaluate.
eval_tasks: List[str] = [
    'openbookqa',
    'winogrande',
    'hellaswag',
    'boolq',
    'arc_easy',
    'piqa',
]
BATCH_SIZE = 2
num_fewshot = 0
limit = 256

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Tokenizer that matches the evaluated checkpoint.
llama_tokenizer = LlamaTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")

wrapped_model = ModelWrapper(
    model=llama,
    batch_size=BATCH_SIZE,
    tokenizer=llama_tokenizer,
    device=device,
)

tokenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/512k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
# Resolve the task names into task objects, then run the harness on the wrapped model.
# NOTE(review): the saved output of this cell reports only `piqa` and 2048
# requests, which does not match the eval_tasks / limit values defined above --
# re-run the notebook from a fresh kernel to refresh the stored results.
task_dict = tasks.get_task_dict(eval_tasks)
results = evaluator.evaluate(
    lm=wrapped_model,
    task_dict=task_dict,
    num_fewshot=num_fewshot,
    limit=limit,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/815k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1838 [00:00<?, ? examples/s]

Task: piqa; number of docs: 1838
Task: piqa; document 0; context prompt (starting on next line):
Question: Remove seeds from  strawberries
Answer:
(end of prompt on previous line)
Requests: [Req_loglikelihood('Question: Remove seeds from  strawberries\nAnswer:', ' Blend the strawberries, pour the mixture through a fine-mesh strainer with a bowl underneath to catch the pulps and strain out the seeds')[0]
, Req_loglikelihood('Question: Remove seeds from  strawberries\nAnswer:', ' Chop up the strawberries, pour the mixture through a fine-mesh strainer with a bowl underneath to catch the pulps and strain out the seeds')[0]
]
Running loglikelihood requests


100%|██████████| 2048/2048 [03:51<00:00,  8.83it/s]


In [12]:
# Show the raw result dictionary returned by evaluator.evaluate
# (per-task metrics under 'results', task versions under 'versions').
print(results)

{'results': {'piqa': {'acc': 0.75390625, 'acc_stderr': 0.013467008897744992, 'acc_norm': 0.783203125, 'acc_norm_stderr': 0.012883263749490607}}, 'versions': {'piqa': 0}}


Выводы

Метод Inheritune:

Преимущества: Быстрое обучение, меньше данных для обучения.

Недостатки: ограниченная гибкость архитектуры, зависимость от качества и объёма данных.

Метод дистилляции знаний:

Преимущества: Высокая точность, особенно при наличии большого объема данных.

Недостатки: Высокие вычислительные затраты, сложность настройки.

Когда использовать:

- Inheritune: подходит для случаев с ограниченными вычислительными ресурсами и данными. Идеально для быстрого прототипирования и ситуаций, когда необходима производительность на устройствах с ограниченными ресурсами.
- Дистилляция знаний: подходит для случаев, когда требуется высокая точность и есть доступ к большому объёму данных и вычислительных ресурсов. Идеально для конечных моделей в производственных условиях.