In [2]:
!pip install -q bitsandbytes evaluate transformers peft accelerate datasets scipy einops evaluate trl

In [3]:
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from functools import partial
import time

model_id = "microsoft/phi-2"
huggingface_dataset_name = "snli"
data = load_dataset(huggingface_dataset_name)

train_samples = Dataset.from_dict(data['train'].select(range(0, 550152, 550))[:1000])
test_samples = Dataset.from_dict(data['test'].select(range(0, 10000, 100))[:100])
validation_samples = Dataset.from_dict(data['validation'].select(range(0, 10000, 100))[:100])
train_samples = train_samples.filter(lambda x: x['label'] != -1)
test_samples = test_samples.filter(lambda x: x['label'] != -1)
validation_samples = validation_samples.filter(lambda x: x['label'] != -1)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    num_labels=3,
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left", use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [4]:
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from peft import LoraConfig, get_peft_model
peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none"
)

model = get_peft_model(model, peft_config)

def tokenize_samples(data):
    tokenized_inputs = tokenizer(
        data['premise'],
        data['hypothesis'],
        truncation=True,
        padding='max_length',
        max_length=128
    )
    
    tokenized_inputs['labels'] = data['label']
    return tokenized_inputs

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

tokenized_train_samples = tokenize_samples(train_samples)
tokenized_val_samples = tokenize_samples(validation_samples)
tokenized_test_samples = tokenize_samples(test_samples)

In [5]:
tokenized_train_samples['labels'] = torch.tensor(tokenized_train_samples['labels'], dtype=torch.long).to(device)
tokenized_val_samples['labels'] = torch.tensor(tokenized_val_samples['labels'], dtype=torch.long).to(device)
tokenized_test_samples['labels'] = torch.tensor(tokenized_test_samples['labels'], dtype=torch.long).to(device)
tokenized_train_samples['labels'] = tokenized_train_samples['labels'].to(torch.long)
tokenized_val_samples['labels'] = tokenized_val_samples['labels'].to(torch.long)
tokenized_test_samples['labels'] = tokenized_test_samples['labels'].to(torch.long)


train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(tokenized_train_samples['input_ids'], dtype=torch.long).to(device),
    torch.tensor(tokenized_train_samples['attention_mask'], dtype=torch.long).to(device),
    torch.tensor(tokenized_train_samples['labels'], dtype=torch.long).to(device)
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(tokenized_val_samples['input_ids'], dtype=torch.long).to(device),
    torch.tensor(tokenized_val_samples['attention_mask'], dtype=torch.long).to(device),
    torch.tensor(tokenized_val_samples['labels'], dtype=torch.long).to(device)
)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./backup_results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=0.0001,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    remove_unused_columns=False
)

from datasets import Dataset

train_dataset = Dataset.from_dict(tokenized_train_samples)
val_dataset = Dataset.from_dict(tokenized_val_samples)
print(len(train_dataset))
print(len(val_dataset))

print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token ID: {tokenizer.pad_token_id}")
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)
model.config.num_labels = 3
print(f"Model num_labels: {model.config.num_labels}")

1000
99
Pad token: <|endoftext|>
Pad token ID: 50256
Model num_labels: 3


  torch.tensor(tokenized_train_samples['labels'], dtype=torch.long).to(device)
  torch.tensor(tokenized_val_samples['labels'], dtype=torch.long).to(device)


In [6]:
import time

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

start_time = time.time()

trainer.train()

end_time = time.time()
fine_tuning_time = end_time - start_time
print(f"Time taken to fine-tune the model: {fine_tuning_time / 60:.2f} minutes")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112920100000185, max=1.0…

Epoch,Training Loss,Validation Loss
1,1.0729,No log
2,0.6584,No log
3,0.3856,No log
4,0.2628,No log
5,0.1856,No log


Time taken to fine-tune the model: 24.09 minutes


In [8]:
!zip -r model.zip "/kaggle/working/backup_results"

  pid, fd = os.forkpty()


  adding: kaggle/working/backup_results/ (stored 0%)
  adding: kaggle/working/backup_results/checkpoint-63/ (stored 0%)
  adding: kaggle/working/backup_results/checkpoint-63/optimizer.pt (deflated 9%)
  adding: kaggle/working/backup_results/checkpoint-63/scheduler.pt (deflated 56%)
  adding: kaggle/working/backup_results/checkpoint-63/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/backup_results/checkpoint-63/adapter_config.json (deflated 52%)
  adding: kaggle/working/backup_results/checkpoint-63/README.md (deflated 66%)
  adding: kaggle/working/backup_results/checkpoint-63/trainer_state.json (deflated 57%)
  adding: kaggle/working/backup_results/checkpoint-63/rng_state.pth (deflated 25%)
  adding: kaggle/working/backup_results/checkpoint-63/training_args.bin (deflated 52%)
  adding: kaggle/working/backup_results/checkpoint-126/ (stored 0%)
  adding: kaggle/working/backup_results/checkpoint-126/optimizer.pt (deflated 9%)
  adding: kaggle/working/backup_results/checkpo