In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
!pip install evaluate scikit-learn
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
from peft import PeftModel, PeftConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_name='microsoft/phi-2'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
    num_labels=3,
    low_cpu_mem_usage=True
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

ft_model = PeftModel.from_pretrained(model, "/kaggle/input/phi2-inference/kaggle/working/peft-snli/final-checkpoint/checkpoint-2500",torch_dtype=torch.float16,is_trainable=False)
ft_model.config.pad_token_id = tokenizer.pad_token_id

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
data = load_dataset("stanfordnlp/snli")
train_samples = Dataset.from_dict(data['train'].select(range(0, 550152, 550))[:1000])
test_samples = Dataset.from_dict(data['test'].select(range(0, 10000, 100))[:100])
validation_samples = Dataset.from_dict(data['validation'].select(range(0, 10000, 100))[:100])


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [32]:
!pip install -q evaluate
import evaluate
import pandas as pd
from tqdm import tqdm
metric = evaluate.load("accuracy")
def infer_tuned(sample):
	premise, hypothesis, label = sample['premise'], sample['hypothesis'], sample['label']
	inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(model.device)
	with torch.no_grad():
		outputs = ft_model(**inputs)
		logits = outputs.logits
		predictions = torch.argmax(logits, dim=1).item()
                
	label_dict = {0: 'entailment', 1: 'neutral', 2: 'contradiction', -1: 'unknown'}
	return predictions, label

def compute_accuracy_tuned(dataset):
    predictions = []
    labels, results = [], []
    for sample in tqdm(dataset):
        prediction, label = infer_tuned(sample)
        predictions.append(prediction)
        labels.append(label)
        results.append({"sample": sample, "prediction": prediction, "label": label})
    df = pd.DataFrame(results)
    df.to_csv("finetuned_predictions.csv", index=False)
    return metric.compute(predictions=predictions, references=labels)

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
    num_labels=3,
    low_cpu_mem_usage=True
)
tokenizer_pre = AutoTokenizer.from_pretrained(model_name)
tokenizer_pre.pad_token = tokenizer_pre.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
def infer_pretrained(sample):
	premise, hypothesis, label = sample['premise'], sample['hypothesis'], sample['label']
	inputs = tokenizer_pre(premise, hypothesis, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(model.device)
	with torch.no_grad():
		outputs = pretrained_model(**inputs)
		logits = outputs.logits
		predictions = torch.argmax(logits, dim=1).item()
                
	label_dict = {0: 'entailment', 1: 'neutral', 2: 'contradiction', -1: 'unknown'}
	return predictions, label

def compute_accuracy_pretrained(dataset):
    predictions = []
    labels = []
    results = []
    for sample in tqdm(dataset):
        prediction, label = infer_pretrained(sample)
        predictions.append(prediction)
        labels.append(label)
        results.append({"sample": sample, "prediction": prediction, "label": label})
    df = pd.DataFrame(results)
    df.to_csv("pretrained_predictions.csv", index=False)
    return metric.compute(predictions=predictions, references=labels)

In [49]:
compute_accuracy_pretrained(test_samples)

100%|██████████| 100/100 [00:28<00:00,  3.51it/s]


{'accuracy': 0.33}

In [50]:
compute_accuracy_tuned(test_samples)

100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


{'accuracy': 0.85}