In [19]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [20]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/deep learning project/latest"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/deep learning project/latest


In [21]:
#!pip install -q transformers peft datasets torch bitsandbytes accelerate
!pip install -r requirements.txt
!pip install --upgrade transformers



In [22]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [23]:
!pip install -q huggingface_hub
from huggingface_hub import login
login(token=hf_token)  # Will prompt for your Hugging Face token

In [24]:
import os
import torch
import yaml
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import wandb
from tqdm import tqdm

In [25]:
def load_config(config_path="config.yaml"):
    """Load configuration from YAML file"""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config

In [26]:
def prepare_tokenizer(config):
    """Prepare the tokenizer"""

    tokenizer = AutoTokenizer.from_pretrained(config['model']['name'])
    #tokenizer = AutoTokenizer.from_pretrained("./model")
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

In [27]:
def format_truthfulqa(example):
    question = example['question']

    # Use the first "correct" answer from the mc1_targets (label == 1)
    correct_choices = [
        choice for choice, label in zip(example["mc1_targets"]["choices"], example["mc1_targets"]["labels"]) if label == 1
    ]
    correct_answer = correct_choices[0] if correct_choices else ""

    return {"text": f"Question: {question}\nAnswer: {correct_answer}"}

Loading TruthfulQA dataset

In [28]:
def prepare_dataset(tokenizer, config):
    """Prepare and tokenize the dataset with train/validation split"""
    # Load TruthfulQA dataset
    dataset = load_dataset("truthful_qa", "multiple_choice")

    # Format the dataset
    formatted_dataset = dataset['validation'].map(format_truthfulqa)



    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=config['training']['max_length'],
            padding="max_length",
        )

    tokenized = formatted_dataset.map(tokenize_function, remove_columns=formatted_dataset.column_names, batched=True)
    return tokenized, dataset['validation']  # second one is raw for extracting choices

Load model

In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

In [30]:
def load_model(config):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        config['model']['name'],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    model = PeftModel.from_pretrained(base_model, "mistral-7b-triviaqa-lora_config3_v")

    #model.eval()
    return model


Evaluation with an example

In [31]:
def generate_answer(prompt, model, tokenizer, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)




Evaluation of the model using BLEU and ROUGE metrics

In [32]:
!pip install nltk rouge_score



In [33]:
def extract_answer_letter(text):
    """
    Extract the answer letter (A, B, C, etc.) from the model's output.

    Args:
        text (str): Text containing the model's generated answer

    Returns:
        str: The extracted answer letter or empty string if not found
    """
    # Look for "Answer:" in the text
    answer_start = text.find("Answer:")
    if answer_start == -1:
        return ""

    # Get the text after "Answer:"
    answer_text = text[answer_start + len("Answer:"):].strip()

    # Try to extract the letter at the beginning of the answer
    letter_pattern = r"([A-Z])\."
    match = re.search(letter_pattern, answer_text)
    if match:
        return match.group(1)

    # If no letter with period found, try just the first letter
    if answer_text and answer_text[0].isalpha():
        return answer_text[0].upper()

    return ""



In [34]:
import evaluate
import re

In [35]:
def evaluate_model(model, tokenizer, validation_data_raw):
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    generated = []
    references = []
    correct_mc = 0
    total_mc = 0

    for example in tqdm(validation_data_raw, desc="Evaluating"):
        #print(f"example: {example}")
        question = example["question"]
        #print(f"question is {question}")
        correct_index = example["mc1_targets"]["labels"].index(1)
        #print(f"correct_index is {correct_index}")
        choices = example["mc1_targets"]["choices"]
        #print(f"choices: {choices}")
        choice_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]  # Adjust if more options possible

        # Format prompt
        prompt = f"Question: {question}\nChoices:\n"
        for i, choice in enumerate(choices):
            prompt += f"{choice_letters[i]}. {choice}\n"
        prompt += "Answer:"

        # Generate
        output = generate_answer(prompt, model, tokenizer)
        #print(f"output: {output}")

        pred_letter = extract_answer_letter(output)
        #print(f"pred_letter: {pred_letter}")


        # Check if first letter of model's answer matches the correct label

        correct_letter = choice_letters[correct_index]
        #print(f"correct_letter: {correct_letter}")
        if pred_letter == correct_letter:
            correct_mc += 1
        total_mc += 1
        try:
            pred_index = choice_letters.index(pred_letter) if pred_letter in choice_letters else -1
            generated.append(choices[pred_index] if pred_index >= 0 and pred_index < len(choices) else "")
        except ValueError:
            generated.append("")

        references.append(choices[correct_index])
    # Metrics
    rouge_score = rouge.compute(predictions=generated, references=references)
    bleu_score = bleu.compute(predictions=generated, references=[[ref] for ref in references])
    mc_accuracy = correct_mc / total_mc

    print(f"\n--- Evaluation Results ---")
    print(f"MC Accuracy: {mc_accuracy:.4f}")
    print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")
    print(f"BLEU: {bleu_score['bleu']:.4f}")


In [None]:
config = load_config()
tokenizer = prepare_tokenizer(config)
model = load_model(config)
tokenized_validation, raw_validation = prepare_dataset(tokenizer, config)
evaluate_model(model, tokenizer, raw_validation)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/817 [00:00<?, ? examples/s]

Evaluating:   0%|          | 0/817 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   0%|          | 1/817 [00:06<1:30:29,  6.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   0%|          | 2/817 [00:13<1:30:47,  6.68s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   0%|          | 3/817 [00:19<1:30:22,  6.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   0%|          | 4/817 [00:26<1:30:19,  6.67s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   1%|          | 5/817 [00:33<1:30:20,  6.68s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   1%|          | 6/817 [00:40<1:30:04,  6.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   1%|          | 7/817 [00:46<1:29:35,  6.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for