In [1]:
!pip install -q -U bitsandbytes
!!pip install -q -U accelerate
!pip install peft
!pip install datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from datetime import datetime
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset
import time
import psutil
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import re

# Checkpoint to evaluate and the device later cells refer to
model_path = "microsoft/phi-2"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 4-bit NF4 quantization with bfloat16 compute and no double quantization
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# The quantized model is used exactly as loaded (no `.to(device)` call)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_eos_token=True,
    padding_side="left",
    model_max_length=512,
)
# Reuse the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token

# SNLI subsets picked at a fixed stride: every 550th training row (1000 rows)
# and every 100th validation/test row (100 rows each)
snli_dataset = load_dataset('snli')
train_dataset = snli_dataset['train'].select(list(range(0, 550000, 550))[:1000])
val_dataset = snli_dataset['validation'].select(list(range(0, 10000, 100))[:100])
test_dataset = snli_dataset['test'].select(list(range(0, 10000, 100))[:100])

# Define tokenize functions - returning dict to avoid list concatenation issue
def tokenize(batch):
    """Encode a batch of premise/hypothesis pairs with the notebook's tokenizer.

    Args:
        batch: mapping with 'premise' and 'hypothesis' entries, passed to the
            tokenizer as the first and second text inputs.

    Returns:
        The tokenizer's output for the pairs, truncated and padded to a
        length of 512.
    """
    premises = batch['premise']
    hypotheses = batch['hypothesis']
    return tokenizer(
        premises,
        hypotheses,
        padding="max_length",
        max_length=512,
        truncation=True,
    )

# Apply the batched tokenizer to each split, in train / validation / test order
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Improved evaluation function using regex and accuracy_score
def evaluate_model(model, tokenizer, dataset, max_length=70):
    """Prompt the model for an NLI label on every example and score the answers.

    Args:
        model: causal language model; `eval()`, `generate()` and `device` are used.
        tokenizer: tokenizer for `model`. Side effect: its `pad_token_id` is
            set to its `eos_token_id`.
        dataset: indexable dataset whose rows provide 'premise', 'hypothesis'
            and an integer 'label' (0, 1, 2, or -1 for "no gold label").
        max_length: unused; kept so existing callers keep working.

    Returns:
        A tuple `(accuracy, predictions)`. `predictions` holds one label string
        per evaluated example in dataset order. Rows labelled -1 are skipped,
        so `predictions` can be shorter than `dataset` and its positions do not
        necessarily match dataset indices.
    """
    print(f"Dataset length: {len(dataset)}")
    model.eval()
    tokenizer.pad_token_id = tokenizer.eos_token_id
    predictions = []
    true_labels = []
    label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
    # Matched against the generated continuation only, so the label must be the
    # first thing the model writes after the prompt's trailing "Answer:".
    answer_pattern = re.compile(r"\s*(entailment|neutral|contradiction)", re.IGNORECASE)

    for i in tqdm(range(len(dataset)), position=0, leave=True, desc="Predicting on test Samples"):
        # Fetch a single row; indexing a column first (dataset['premise'][i])
        # rebuilds the whole column on every iteration.
        example = dataset[i]
        premise = example['premise']
        hypothesis = example['hypothesis']
        label = example['label']

        # Skip ambiguous label (-1) in SNLI dataset
        if label == -1:
            print(f"Skipped example {i} (ambiguous label)")
            continue
        true_labels.append(label_map[label])

        # Concatenate premise and hypothesis with a specific prompt
        input_text = (
            f"Premise: {premise}\n"
            f"Hypothesis: {hypothesis}\n"
            f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
        )

        # Tokenize input text
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)

        # Generate prediction; passing pad_token_id explicitly avoids the
        # per-call "Setting `pad_token_id` to `eos_token_id`" message.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=4,  # Strict limit on max_new_tokens to get concise answers
                pad_token_id=tokenizer.pad_token_id,
            )

        # Decode only the newly generated tokens so text inside the prompt
        # (premise/hypothesis) can never be mistaken for the model's answer.
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        match = answer_pattern.match(generated_text)
        if match:
            prediction = match.group(1).lower()
        else:
            prediction = "neutral"  # Default if no match is found
        predictions.append(prediction)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, predictions

# Fall back to the EOS id when the tokenizer has no padding id configured
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Baseline: score the model on the test subset before any fine-tuning
print("Evaluating pre-trained model on the test set...")
pretrained_accuracy, predictions = evaluate_model(model, tokenizer, test_dataset)

# Report the accuracy followed by up to the first five predicted labels
print(f"\nPre-trained Model Accuracy: {pretrained_accuracy * 100:.2f}%")
for i in range(min(5, len(predictions))):
    prediction = predictions[i]
    print(f"Example {i + 1}: {prediction}")


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Evaluating pre-trained model on the test set...
Dataset length: 100


Predicting on test Samples:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Predicting on test Samples:   1%|          | 1/100 [00:00<00:58,  1.71it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Predicting on test Samples:   2%|▏         | 2/100 [00:01<00:50,  1.94it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Predicting on test Samples:   3%|▎         | 3/100 [00:01<00:48,  2.01it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Predicting on test Samples:   4%|▍         | 4/100 [00:02<00:49,  1.94it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Predicting on test Samples:   5%|▌         | 5/100 [00:02<00:50,  1.90it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Predicting on test Samples:   6%|▌         | 6/100 [00:03<00:49,  1.89it/s]Setting `pad_token_id` to `eos_token_id`:50256 for 


Pre-trained Model Accuracy: 50.00%
Example 1: contradiction
Example 2: entailment
Example 3: entailment
Example 4: contradiction
Example 5: entailment





In [4]:
# Display enhanced examples with premise, hypothesis, true label, and model prediction

# Defined in this cell because evaluate_model above keeps its label map local
# to the function, so the name is not otherwise available at this point of a
# top-to-bottom run.
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

print(f"\nPre-trained Model Accuracy: {pretrained_accuracy * 100:.2f}%")
print("\nDetailed Example Outputs:")

# evaluate_model leaves out rows labelled -1, so predictions correspond to the
# labelled rows only; pair them up explicitly instead of assuming that
# predictions[i] belongs to test_dataset[i].
labelled_indices = [idx for idx, label in enumerate(test_dataset['label']) if label != -1]

# Displaying up to 10 examples for more detailed inspection (fewer if fewer exist)
for i, (row_index, predicted_label) in enumerate(list(zip(labelled_indices, predictions))[:10]):
    example = test_dataset[row_index]
    premise = example['premise']
    hypothesis = example['hypothesis']
    true_label = label_map[example['label']]

    print(f"\nExample {i + 1}:")
    print(f"Premise: {premise}")
    print(f"Hypothesis: {hypothesis}")
    print(f"True Label: {true_label}")
    print(f"Model Prediction: {predicted_label}")



Pre-trained Model Accuracy: 50.00%

Detailed Example Outputs:

Example 1:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church has cracks in the ceiling.
True Label: neutral
Model Prediction: contradiction

Example 2:
Premise: A woman within an orchestra is playing a violin.
Hypothesis: A woman is playing the violin.
True Label: entailment
Model Prediction: entailment

Example 3:
Premise: Two men climbing on a wooden scaffold.
Hypothesis: Two sad men climbing on a wooden scaffold.
True Label: neutral
Model Prediction: entailment

Example 4:
Premise: A man in a black shirt, in a commercial kitchen, holding up meat he took out of a bag.
Hypothesis: A man in a black shirt, in a commercial kitchen, holding up the old meat he took out of a bag.
True Label: neutral
Model Prediction: contradiction

Example 5:
Premise: a woman in a black shirt looking at a bicycle.
Hypothesis: A woman dressed in black shops for a bicycle.
T

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch
import re

# Define paths and device
base_model_path = "microsoft/phi-2"  # Path to the base model
adapter_model_path = "./snli_finetune_phi2/final_model"  # Path to the fine-tuned LoRA adapter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up quantization configuration to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model with quantization and tokenizer.
# No `.to(device)` here: moving a bitsandbytes-quantized model with `.to()` is
# rejected by some transformers/bitsandbytes versions, and the earlier
# pre-trained evaluation cell already loads the same quantized model without it.
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(base_model_path, model_max_length=512, padding_side="left", add_eos_token=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the LoRA adapter model on top of the base model (again without `.to()`,
# since the wrapped base model is quantized)
model = PeftModel.from_pretrained(base_model, adapter_model_path)

# Define label map and evaluation function
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

# Improved evaluation function using regex and accuracy_score
def evaluate_model(model, tokenizer, dataset, max_length=70):
    """Prompt the model for an NLI label on every example and score the answers.

    Label ids are translated with the module-level `label_map`.

    Args:
        model: causal language model; `eval()`, `generate()` and `device` are used.
        tokenizer: tokenizer for `model`. Side effect: its `pad_token_id` is
            set to its `eos_token_id`.
        dataset: indexable dataset whose rows provide 'premise', 'hypothesis'
            and an integer 'label' (a key of `label_map`, or -1 for "no gold label").
        max_length: unused; kept so existing callers keep working.

    Returns:
        A tuple `(accuracy, predictions)`. `predictions` holds one label string
        per evaluated example in dataset order. Rows labelled -1 are skipped,
        so `predictions` can be shorter than `dataset` and its positions do not
        necessarily match dataset indices.
    """
    print(f"Dataset length: {len(dataset)}")
    model.eval()
    tokenizer.pad_token_id = tokenizer.eos_token_id
    predictions = []
    true_labels = []
    # Matched against the generated continuation only, so the label must be the
    # first thing the model writes after the prompt's trailing "Answer:".
    answer_pattern = re.compile(r"\s*(entailment|neutral|contradiction)", re.IGNORECASE)

    for i in tqdm(range(len(dataset)), position=0, leave=True, desc="Predicting on test Samples"):
        # Fetch a single row; indexing a column first (dataset['premise'][i])
        # rebuilds the whole column on every iteration.
        example = dataset[i]
        premise = example['premise']
        hypothesis = example['hypothesis']
        label = example['label']

        # Skip ambiguous label (-1) in SNLI dataset
        if label == -1:
            print(f"Skipped example {i} (ambiguous label)")
            continue
        true_labels.append(label_map[label])

        # Concatenate premise and hypothesis with a specific prompt
        input_text = (
            f"Premise: {premise}\n"
            f"Hypothesis: {hypothesis}\n"
            f"Answer with one of the following: entailment, neutral, contradiction.\nAnswer:"
        )

        # Tokenize input text
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)

        # Generate prediction; passing pad_token_id explicitly avoids the
        # per-call "Setting `pad_token_id` to `eos_token_id`" message.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=4,  # Strict limit on max_new_tokens to get concise answers
                pad_token_id=tokenizer.pad_token_id,
            )

        # Decode only the newly generated tokens so text inside the prompt
        # (premise/hypothesis) can never be mistaken for the model's answer.
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        match = answer_pattern.match(generated_text)
        if match:
            prediction = match.group(1).lower()
        else:
            prediction = "neutral"  # Default if no match is found
        predictions.append(prediction)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, predictions

# Ensure pad_token_id is set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the SNLI test dataset
from datasets import load_dataset
snli_dataset = load_dataset('snli')
test_dataset = snli_dataset['test'].select([i for i in range(0, 10000, 100)][:100])

# Evaluate the fine-tuned model on the test set
fine_tuned_accuracy, predictions = evaluate_model(model, tokenizer, test_dataset)

# Print the accuracy in the specified format
print(f"\nFine-tuned Model Accuracy: {fine_tuned_accuracy * 100:.2f}%")
print("\nDetailed Example Outputs after fine-tuning:")

# evaluate_model leaves out rows labelled -1, so predictions correspond to the
# labelled rows only; pair them up explicitly instead of assuming that
# predictions[i] belongs to test_dataset[i].
labelled_indices = [idx for idx, label in enumerate(test_dataset['label']) if label != -1]

# Display up to 10 examples with Premise, Hypothesis, True Label, and Model Prediction
for i, (row_index, predicted_label) in enumerate(list(zip(labelled_indices, predictions))[:10]):
    example = test_dataset[row_index]
    premise = example['premise']
    hypothesis = example['hypothesis']
    true_label = label_map[example['label']]

    print(f"\nExample {i + 1}:")
    print(f"Premise: {premise}")
    print(f"Hypothesis: {hypothesis}")
    print(f"True Label: {true_label}")
    print(f"Model Prediction: {predicted_label}")


Fine-tuned Model Accuracy: 72.0%

Detailed Example Outputs after fine-tuning:

Example 1:
Premise: This church choir sings to the masses as they sing joyous songs from the book at a church.
Hypothesis: The church has cracks in the ceiling.
True Label: neutral
Model Prediction: neutral

Example 2:
Premise: A woman within an orchestra is playing a violin.
Hypothesis: A woman is playing the violin.
True Label: entailment
Model Prediction: entailment

Example 3:
Premise: Two men climbing on a wooden scaffold.
Hypothesis: Two sad men climbing on a wooden scaffold.
True Label: neutral
Model Prediction: contradiction

Example 4:
Premise: A man in a black shirt, in a commercial kitchen, holding up meat he took out of a bag.
Hypothesis: A man in a black shirt, in a commercial kitchen, holding up the old meat he took out of a bag.
True Label: neutral
Model Prediction: neutral

Example 5:
Premise: A woman in a black shirt looking at a bicycle.
Hypothesis: A woman dressed in black shops for a bicy