In [1]:
import os; os.chdir("../")
import sys; sys.path.append("scripts")

In [2]:
import re
from utils import _extract_answer

In [28]:
def evaluate(log_file_path):
    # 로그 데이터를 저장할 리스트
    parsed_data = []

    # 정규식을 사용하여 Question, Prediction, Answer를 파싱
    pattern_iteration_start = re.compile(r'Starting Evaluate script')
    pattern_question = re.compile(r'Question:\s*\[(.+?)\](?=\s*\[|$)', re.DOTALL)
    pattern_prediction = re.compile(r'Prediction:\s*\[(.+?)\](?=\s*\[|$)', re.DOTALL)
    pattern_answer = re.compile(r'Answer:\s*\[(.+?)\](?=\s*\[|$)', re.DOTALL)

    # 로그 파일 읽기 (예: 'log.txt'로 저장된 경우)
    with open(log_file_path, "r") as log_file:
        current_entry = {"question": None, "prediction": None, "answer": None}
        # 최근 iteration 데이터를 저장할 변수
        current_iteration_data = []
        for line in log_file:
            # Iteration 시작 감지
            if pattern_iteration_start.search(line):
                # 기존 iteration 데이터를 가장 최근 데이터로 갱신
                current_iteration_data = []
            current_iteration_data.append(line)

        for line in current_iteration_data:
            question_match = pattern_question.search(line)
            if question_match:
                current_entry["question"] = question_match.group(1).strip("[]").strip()
                continue

            # Prediction 파싱
            prediction_match = pattern_prediction.search(line)
            if prediction_match:
                current_entry["prediction"] = prediction_match.group(1).strip("[]").strip()
                continue

            # Answer 파싱
            answer_match = pattern_answer.search(line)
            if answer_match:
                current_entry["answer"] = answer_match.group(1).strip("[]").strip()
                # 완전한 데이터가 준비되면 저장
                if current_entry["question"] and current_entry["prediction"] and current_entry["answer"]:
                    parsed_data.append(current_entry)
                    current_entry = {"question": None, "prediction": None, "answer": None}

    # 결과 출력
    res = []
    for i, entry in enumerate(parsed_data):
        try:
            pn, gn = _extract_answer(entry["prediction"], entry["answer"])
            res.append({
                "question": entry["question"],
                "prediction": pn,
                "answer": gn,
                "correct": pn == gn
            })
        except Exception as e:
            print(f"Error in {i}: {e}")
            print(entry['answer'])
            print()


        # if i > 10: break
    return res


for model in ["1b", "8b"]:
    res = evaluate(f"logs/eval-{model}.log")
    score = sum([1 for r in res if r["correct"]]) / len(res)
    print("Length of res:", len(res))
    print(f"Score of {model}: {score*100:.2f}%")

Length of res: 302
Score of 1b: 12.91%
Length of res: 100
Score of 8b: 47.00%


In [None]:
STOP

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_from_disk
import evaluate
from torch.utils.data import DataLoader

In [None]:
tokenized_dataset = load_from_disk("gsm8k_tokenized")

In [None]:
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
config = BitsAndBytesConfig(
    load_in_8bit=True,  # Load model weights in 8-bit format
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map="auto",  # Automatically maps layers across GPUs/CPUs
    torch_dtype=torch.float16,  # Half precision for faster computations
    quantization_config=config

)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
model.eval()
metric = evaluate.load("exact_match")  # For example, this could also be "bleu" or custom metric


def evaluate_model(batch):
    # Move the input_ids and attention_mask to the correct device (e.g., GPU if available)
    input_ids = torch.tensor(batch['input_ids']).to(model.device)
    attention_mask = torch.tensor(batch['attention_mask']).to(model.device)

    # Shift the inputs for the causal LM objective (predict next token)
    labels = input_ids.clone()
    
    # Pass the input_ids through the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        loss = outputs.loss.item()
    
    # Compute predictions: get the most likely token (argmax over logits)
    predictions = torch.argmax(logits, dim=-1)
    
    # Detokenize the predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Return loss and other metrics (e.g., Exact Match)
    exact_match = metric.compute(predictions=decoded_preds, references=decoded_labels)
    
    return {
        "loss": loss,
        "exact_match": exact_match["exact_match"],
    }

In [None]:
data_loader = DataLoader(tokenized_dataset['test'], batch_size=8, shuffle=True)

In [None]:
next(iter(data_loader))

{'question': ["Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
  'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?',
  'Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?',
  'James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?',
  "Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She gives the chickens their feed in three separate meals. In the morning, she gives her flock of chickens 15 cups of feed.  

In [None]:
# Get the DataLoader
data_loader = DataLoader(tokenized_dataset['test'], batch_size=8)

# Evaluate on a batch-by-batch basis
results = []
for batch in data_loader:
    results.append(evaluate_model(batch))

# Compute the average loss and exact match
average_loss = torch.tensor([x["loss"] for x in results]).mean().item()
average_exact_match = torch.tensor([x["exact_match"] for x in results]).mean().item()

print(f"Average loss: {average_loss}")
print(f"Average Exact Match: {average_exact_match}")


131072

In [None]:
# Evaluate on a batch-by-batch basis
results = []
for batch in tokenized_dataset['test']:  # Or use 'train' for the training set
    result = evaluate_model(batch)
    results.append(result)

# Summarize final results
avg_loss = sum([r["loss"] for r in results]) / len(results)
avg_exact_match = sum([r["exact_match"] for r in results]) / len(results)

print(f"Average Loss: {avg_loss}")
print(f"Average Exact Match: {avg_exact_match}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 131072.00 GiB. GPU 0 has a total capacity of 23.87 GiB of which 12.83 GiB is free. Including non-PyTorch memory, this process has 11.04 GiB memory in use. Of the allocated memory 10.47 GiB is allocated by PyTorch, and 41.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)