In [16]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install evaluate scikit-learn
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
from peft import PeftModel, PeftConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_name='microsoft/phi-2'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
    num_labels=3,
    low_cpu_mem_usage=True
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

ft_model = PeftModel.from_pretrained(model, "/kaggle/input/phi2-inference/kaggle/working/peft-snli/final-checkpoint/checkpoint-315",torch_dtype=torch.float16,is_trainable=False)
ft_model.config.pad_token_id = tokenizer.pad_token_id

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
data = load_dataset("stanfordnlp/snli")
train_samples = Dataset.from_dict(data['train'].select(range(0, 550152, 550))[:1000])
test_samples = Dataset.from_dict(data['test'].select(range(0, 10000, 100))[:100])
validation_samples = Dataset.from_dict(data['validation'].select(range(0, 10000, 100))[:100] )

In [18]:
!pip install -q evaluate
import evaluate
import pandas as pd
from tqdm import tqdm
metric = evaluate.load("accuracy")
def infer_tuned(sample):
	premise, hypothesis, label = sample['premise'], sample['hypothesis'], sample['label']
	inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(model.device)
	with torch.no_grad():
		outputs = ft_model(**inputs)
		logits = outputs.logits
		predictions = torch.argmax(logits, dim=1).item()
                
	label_dict = {0: 'entailment', 1: 'neutral', 2: 'contradiction', -1: 'unknown'}
	return predictions, label

def compute_accuracy_tuned(dataset):
    predictions = []
    labels, results = [], []
    for sample in tqdm(dataset):
        prediction, label = infer_tuned(sample)
        predictions.append(prediction)
        labels.append(label)
        results.append({"sample": sample, "prediction": prediction, "label": label})
    df = pd.DataFrame(results)
    df.to_csv("finetuned_predictions.csv", index=False)
    return metric.compute(predictions=predictions, references=labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
    num_labels=3,
    low_cpu_mem_usage=True
)
tokenizer_pre = AutoTokenizer.from_pretrained(model_name)
tokenizer_pre.pad_token = tokenizer_pre.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def infer_pretrained(sample):
	premise, hypothesis, label = sample['premise'], sample['hypothesis'], sample['label']
	inputs = tokenizer_pre(premise, hypothesis, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(model.device)
	with torch.no_grad():
		outputs = pretrained_model(**inputs)
		logits = outputs.logits
		predictions = torch.argmax(logits, dim=1).item()
                
	label_dict = {0: 'entailment', 1: 'neutral', 2: 'contradiction', -1: 'unknown'}
	return predictions, label

def compute_accuracy_pretrained(dataset):
    predictions = []
    labels = []
    results = []
    for sample in tqdm(dataset):
        prediction, label = infer_pretrained(sample)
        predictions.append(prediction)
        labels.append(label)
        results.append({"sample": sample, "prediction": prediction, "label": label})
    df = pd.DataFrame(results)
    df.to_csv("pretrained_predictions.csv", index=False)
    return metric.compute(predictions=predictions, references=labels)

In [21]:
compute_accuracy_pretrained(test_samples)

100%|██████████| 100/100 [00:28<00:00,  3.51it/s]


{'accuracy': 0.36}

In [22]:
compute_accuracy_tuned(test_samples)

100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


{'accuracy': 0.85}

In [23]:
compute_accuracy_pretrained(validation_samples)

100%|██████████| 100/100 [00:28<00:00,  3.52it/s]


{'accuracy': 0.33}

In [24]:
compute_accuracy_tuned(validation_samples)

100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


{'accuracy': 0.82}

In [25]:
compute_accuracy_pretrained(train_samples)

100%|██████████| 1000/1000 [04:44<00:00,  3.51it/s]


{'accuracy': 0.32}

In [26]:
compute_accuracy_tuned(train_samples)

100%|██████████| 1000/1000 [05:05<00:00,  3.27it/s]


{'accuracy': 0.943}

In [27]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 18350080 || all params: 1408634880 || trainable%: 1.3026853346127565
