In [10]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [11]:
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import pyro
import pyro.distributions as dist
from tqdm import tqdm

# Load the JSON file
file_path = "/home/pranav24/cs-546-project/SSR/Latest_Weights/QA_Weights/task024_cosmosqa_answer_generation.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Extract input-output pairs from JSON
instances = data["Instances"][2500:5000]
inputs = [instance["input"] for instance in instances]
outputs = [instance["output"][0] for instance in instances]

# Split the data into train and test sets
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    inputs, outputs, test_size=0.2, random_state=42
)

# Convert data to Hugging Face Dataset format
train_ds = Dataset.from_dict({"input": train_inputs, "output": train_outputs})
test_ds = Dataset.from_dict({"input": test_inputs, "output": test_outputs})

# Tokenizer setup
base_model_path = "meta-llama/Meta-Llama-3-8B"  # Replace with actual model path
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Check if tokenizer has a padding token, if not, set the eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    labels = tokenizer(
        examples["output"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize datasets
tokenized_train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])
tokenized_test_ds = test_ds.map(tokenize_function, batched=True, remove_columns=["input", "output"])

# Convert datasets to PyTorch format
tokenized_train_ds.set_format("torch")
tokenized_test_ds.set_format("torch")

# Create DataLoaders
batch_size = 16  # Adjust as needed
train_loader = DataLoader(tokenized_train_ds, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(tokenized_test_ds, batch_size=batch_size)

# Define the model and load weights
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
fine_tuned_weights_path = "/home/pranav24/cs-546-project/finetuned-weights-LoRA-EVCL-Final-Task2_EVCL_best"

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, fine_tuned_weights_path)
pyro.get_param_store().load('pyro_param_store_task2_vcl_best.pt')

# Ensure compatibility with the unchanged part of the code
DEVICE = model.device

# Generate predictions
predictions = []
references = []
sampled_weights_log = []  # Store sampled weights

print("Generating predictions:")

for i in tqdm(range(0, len(test_inputs), batch_size)):  # Loop in batches
    batch_inputs = test_inputs[i:i + batch_size]
    batch_references = test_outputs[i:i + batch_size]

    # Tokenize the inputs in a batch
    inputs_tokenized = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            # Apply Pyro parameters to LoRA layers
            for name, module in model.named_modules():
                if hasattr(module, "lora_A"):
                    for key in module.lora_A:
                        loc = pyro.param(f"{name}.lora_A.{key}_loc")
                        scale = pyro.param(f"{name}.lora_A.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_A.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_A[key].weight.data.copy_(sampled_weight)

                if hasattr(module, "lora_B"):
                    for key in module.lora_B:
                        loc = pyro.param(f"{name}.lora_B.{key}_loc")
                        scale = pyro.param(f"{name}.lora_B.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_B.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_B[key].weight.data.copy_(sampled_weight)

            # Generate predictions using the tokenized inputs
            generated_ids = model.generate(
                input_ids=inputs_tokenized["input_ids"],
                attention_mask=inputs_tokenized["attention_mask"],
                max_length=1000,  # Adjust as needed
                num_return_sequences=1,
                do_sample=True  # Optional: Sampling for diverse generations
            )

        # Decode generated IDs
        batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        predictions.extend(batch_predictions)
        references.extend(batch_references)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating predictions:


  0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with torch.cuda.amp.autocast():
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|▎         | 1/32 [02:47<1:26:41, 167.80s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|▋         | 2/32 [05:59<1:30:54, 181.83s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_sid

In [12]:
from evaluate import load

# Load the ROUGE metric
rouge = load("rouge")

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references)

# Display the results
print("\nROUGE Scores:")
print(results)

ModuleNotFoundError: No module named 'evaluate'