In [1]:
!pip install -q -U bitsandbytes
!!pip install -q -U accelerate
!pip install peft
!pip install datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from datetime import datetime
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset
import time
import psutil

# Define model and device
model_path = "microsoft/phi-2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512, padding_side="left", add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

# Load SNLI dataset with specific sampling for NLI task
snli_dataset = load_dataset('snli')
train_dataset = snli_dataset['train'].select([i for i in range(0, 550000, 550)][:1000])
val_dataset = snli_dataset['validation'].select([i for i in range(0, 10000, 100)][:100])
test_dataset = snli_dataset['test'].select([i for i in range(0, 10000, 100)][:100])

def tokenize(batch):
    return tokenizer(batch['premise'], batch['hypothesis'], truncation=True, max_length=512, padding="max_length")

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Fine-tune using QLoRA
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# Track trainable parameters
def print_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params} || Total parameters: {total_params} || Percent trainable: {100 * trainable_params / total_params:.2f}%")

print_trainable_parameters(model)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Shift the labels for the causal language modeling task
        labels = inputs.get("input_ids").clone()
        inputs["labels"] = labels
        inputs.pop("label", None)
        return super().compute_loss(model, inputs, return_outputs)

output_dir = "./snli_finetune_phi2"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    learning_rate=2.5e-5,
    logging_steps=50,
    save_steps=len(train_dataset) // 2,
    eval_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    optim="paged_adamw_8bit",
    report_to='none'
)

# Mapping for SNLI label IDs to text labels
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

# Define custom accuracy calculation for causal language model outputs
def compute_accuracy(model, dataset, tokenizer, device):
    model.eval()
    correct = 0
    total = len(dataset)

    for i, sample in enumerate(dataset):
        input_text = sample['premise'] + ' ' + sample['hypothesis']
        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
        target_label = label_map[sample['label']]  # Convert label ID to string label

        with torch.no_grad():
            output_ids = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=50,
                pad_token_id=tokenizer.eos_token_id
            )
            output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

        # Compare generated text with the target label text
        if output_text == target_label:
            correct += 1

    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Record initial accuracy and resources
start_time = time.time()
cpu_memory = psutil.virtual_memory().used / (1024 ** 3)
gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3) if torch.cuda.is_available() else None

# Initial evaluation
initial_accuracy = compute_accuracy(model, test_dataset, tokenizer, device)

# Fine-tune and evaluate
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()
end_time = time.time()

# Final evaluation after fine-tuning
fine_tuned_accuracy = compute_accuracy(model, test_dataset, tokenizer, device)
training_duration = end_time - start_time

# Save the final model
model.save_pretrained(f"{output_dir}/final_model")

# Print resources and final results
print(f"Training completed in {training_duration/60:.2f} minutes")
print(f"CPU Memory Used: {cpu_memory:.2f} GB")
if gpu_memory:
    print(f"GPU Memory Used: {gpu_memory:.2f} GB")
print(f"Initial Model Accuracy: {initial_accuracy * 100:.2f}%")
print(f"Fine-tuned Model Accuracy: {fine_tuned_accuracy * 100:.2f}%")

# Example failure cases
print("\nExample failure cases before fine-tuning:")
for i in range(3):
    sample = test_dataset[i]
    input_text = sample['premise'] + ' ' + sample['hypothesis']
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id
        )
    print(f"Sample {i+1}:")
    print(f"Premise: {sample['premise']}")
    print(f"Hypothesis: {sample['hypothesis']}")
    print(f"Generated Label (Pre-trained): {tokenizer.decode(output[0], skip_special_tokens=True)}")

print("\nExample failure cases corrected by fine-tuned model:")
for i in range(3):
    sample = test_dataset[i]
    input_text = sample['premise'] + ' ' + sample['hypothesis']
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        output = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id
        )
    print(f"Sample {i+1}:")
    print(f"Premise: {sample['premise']}")
    print(f"Hypothesis: {sample['hypothesis']}")
    print(f"Generated Label (Fine-tuned): {tokenizer.decode(output[0], skip_special_tokens=True)}")


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Trainable parameters: 4362240 || Total parameters: 1525754880 || Percent trainable: 0.29%




Accuracy: 0.00%


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.7018,0.209395
2,0.1815,0.173438
3,0.1603,0.157342
4,0.1494,0.153035
5,0.1487,0.152479


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Accuracy: 0.00%




Training completed in 89.99 minutes
CPU Memory Used: 2.74 GB
GPU Memory Used: 4.65 GB
Initial Model Accuracy: 0.00%
Fine-tuned Model Accuracy: 0.00%

Example failure cases before fine-tuning:
Sample 1:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church has cracks in the ceiling.
Generated Label (Pre-trained): This church choir sings to the masses as they sing joyous songs from the book at a church. The church has cracks in the ceiling.A man is singing.A man is singing.A man is singing.A man is singing.A man is singing.A man is singing.A man is singing.A man is singing.A man is singing.A man is singing.
Sample 2:
Premise: A woman within an orchestra is playing a violin.
Hypothesis: A woman is playing the violin.
Generated Label (Pre-trained): A woman within an orchestra is playing a violin. A woman is playing the violin.A woman is playing the violin.A woman is playing the violin.A woman is playing the violin.A woman

In [5]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [8]:
# Clear any unnecessary variables and clear cache
del base_model_path  # Remove base model if already loaded in another context
torch.cuda.empty_cache()  # Clear CUDA memory cache
