In [None]:
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import pyro
import pyro.distributions as dist
from tqdm import tqdm
import os

# Load the JSON file
os.chdir('/home/pranav24/cs-546-project')
file_path = "/home/pranav24/cs-546-project/SSR/Latest_Weights/QA_Weights/task024_cosmosqa_answer_generation.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Extract input-output pairs from JSON
instances = data["Instances"][4500:5000]
test_inputs = [instance["input"] for instance in instances]
test_outputs = [instance["output"][0] for instance in instances]

# print(test_inputs[499])
# print()
# print(test_outputs[499])



# Tokenizer setup
base_model_path = "meta-llama/Meta-Llama-3-8B"  # Replace with actual model path
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Check if tokenizer has a padding token, if not, set the eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    labels = tokenizer(
        examples["output"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs


# Create DataLoaders
batch_size = 16  # Adjust as needed


# Define the model and load weights
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
fine_tuned_weights_path = "/home/pranav24/cs-546-project/finetuned-weights-LoRA-EVCL-Correct-Task1_VCL_best"

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, fine_tuned_weights_path)
pyro.get_param_store().load('pyro_param_store_task1_evcl_best.pt')

# Ensure compatibility with the unchanged part of the code
DEVICE = model.device

# Generate predictions
predictions = []
references = []
sampled_weights_log = []  # Store sampled weights

output_file_path = "/home/pranav24/cs-546-project/predictions_EVCL_Task1_Best_QA.json"

if os.path.exists(output_file_path):
    with open(output_file_path, "r") as f:
        saved_data = json.load(f)
else:
    saved_data = {"predictions": [], "references": []}

print("Generating predictions incrementally:")

print("Generating predictions:")

for i in tqdm(range(0, len(test_inputs), batch_size)):  # Loop in batches
    print(f'At sample {i}')
    batch_inputs = test_inputs[i:i + batch_size]
    batch_references = test_outputs[i:i + batch_size]

    # Tokenize the inputs in a batch
    inputs_tokenized = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            # Apply Pyro parameters to LoRA layers
            for name, module in model.named_modules():
                if hasattr(module, "lora_A"):
                    for key in module.lora_A:
                        loc = pyro.param(f"{name}.lora_A.{key}_loc")
                        scale = pyro.param(f"{name}.lora_A.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_A.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_A[key].weight.data.copy_(sampled_weight)

                if hasattr(module, "lora_B"):
                    for key in module.lora_B:
                        loc = pyro.param(f"{name}.lora_B.{key}_loc")
                        scale = pyro.param(f"{name}.lora_B.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_B.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_B[key].weight.data.copy_(sampled_weight)

            # Generate predictions using the tokenized inputs
            generated_ids = model.generate(
                input_ids=inputs_tokenized["input_ids"],
                attention_mask=inputs_tokenized["attention_mask"],
                max_new_tokens=512,  
                num_return_sequences=1,
                do_sample=True  
            )

        # Decode generated IDs
        batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        predictions.extend(batch_predictions)
        references.extend(batch_references)

        saved_data["predictions"].extend(batch_predictions)
        saved_data["references"].extend(batch_references)

        # Save incrementally to JSON
        with open(output_file_path, "w") as json_file:
            json.dump(saved_data, json_file, indent=4)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating predictions incrementally:
Generating predictions:


  0%|          | 0/32 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


At sample 0


  with torch.cuda.amp.autocast():
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  3%|▎         | 1/32 [11:28<5:55:41, 688.42s/it]

At sample 16


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|▋         | 2/32 [22:31<5:36:53, 673.80s/it]

At sample 32


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  9%|▉         | 3/32 [33:46<5:25:54, 674.30s/it]

At sample 48


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
!pip install evaluate

In [None]:
import evaluate
rouge = evaluate.load("rouge")
results = rouge.compute(predictions=predictions, references=references)

# Display the results
print("\nROUGE Scores:")
print(results)

In [None]:
import json

# Create a dictionary to store the data
data = {
    "batch_predictions": batch_predictions,
    "predictions": predictions,
    "references": references
}

# Save to a JSON file
with open("predictions_EVCL_Task1_Best.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

print("Data saved to predictions.json")



In [2]:
# test_inputs=["Context: Then I drove all around to the used DVD stores to get season 2. ( which I found for 20 bucks! ) I have been watching a disk a night, and not going to bed until 3 am because I knew that the new season was starting soon and I didn't want to be behind. I just finished season 2 and was on the edge of my seat the whole last episode. Yesterday I taped what was on tv. Question: What does the narrator think about the price of the DVD set purchased?"]
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
import pyro
import pyro.distributions as dist
from tqdm import tqdm

test_inputs=["Context: I just told them that the most important part is that you are responsible about it . That we were always taught to respect it and when we were old enough we were taught the proper way to use it . They did n't have much to say about that . Question: What kind of object might require us to respect it and learn the proper way to use it ?"]
batch_size=1
final_answer=[]

base_model_path = "meta-llama/Meta-Llama-3-8B"

fine_tuned_weights_path="/home/pranav24/cs-546-project/finetuned-weights-LoRA-EVCL-Correct-Task1_VCL_best"
pyro.get_param_store().load('pyro_param_store_task1_evcl_best.pt')

tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Check if tokenizer has a padding token, if not, set the eos_token as padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    labels = tokenizer(
        examples["output"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs
# pyro.clear_param_store()

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, fine_tuned_weights_path)
# pyro.get_param_store().load('pyro_param_store_task1_vcl_best.pt')
# pyro.get_param_store().load('pyro_param_store_task1_vcl_best.pt')

# Ensure compatibility with the unchanged part of the code
DEVICE = model.device

sampled_weights_log=[]
for i in tqdm(range(0, len(test_inputs), batch_size)):  # Loop in batches
    batch_inputs = test_inputs[i:i + batch_size]
    # batch_references = test_outputs[i:i + batch_size]

    # Tokenize the inputs in a batch
    inputs_tokenized = tokenizer(batch_inputs, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        with torch.cuda.amp.autocast():
            # Apply Pyro parameters to LoRA layers
            for name, module in model.named_modules():
                if hasattr(module, "lora_A"):
                    for key in module.lora_A:
                        loc = pyro.param(f"{name}.lora_A.{key}_loc")
                        scale = pyro.param(f"{name}.lora_A.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_A.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_A[key].weight.data.copy_(sampled_weight)

                if hasattr(module, "lora_B"):
                    for key in module.lora_B:
                        loc = pyro.param(f"{name}.lora_B.{key}_loc")
                        scale = pyro.param(f"{name}.lora_B.{key}_scale")
                        sampled_weight = pyro.sample(
                            f"{name}.lora_B.{key}",
                            dist.Normal(loc, scale).to_event(loc.dim())
                        )
                        sampled_weights_log.append(
                            (name, key, sampled_weight.clone().cpu().numpy())
                        )
                        module.lora_B[key].weight.data.copy_(sampled_weight)

            # Generate predictions using the tokenized inputs
            print(sampled_weights_log)
            generated_ids = model.generate(
                input_ids=inputs_tokenized["input_ids"],
                attention_mask=inputs_tokenized["attention_mask"],
                max_length=512,  # Adjust as needed
                num_return_sequences=1,
                do_sample=True  # Optional: Sampling for diverse generations
            )

        # Decode generated IDs
        final_batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        final_answer.extend(final_batch_predictions)
        # predictions.extend(batch_predictions)
        # references.extend(batch_references)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with torch.cuda.amp.autocast():
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[('base_model.model.model.layers.0.self_attn.q_proj', 'default', array([[ 0.03311223, -0.00169148, -0.00958684, ..., -0.01860498,
        -0.02253262, -0.00979476],
       [ 0.01243316, -0.01514597,  0.00481014, ..., -0.01094201,
         0.00284378, -0.01428204],
       [ 0.00187765,  0.03313154,  0.00218942, ..., -0.00531076,
         0.00221143, -0.00853566],
       ...,
       [ 0.00187588,  0.01154761, -0.02193665, ...,  0.00869411,
         0.01603812, -0.01638471],
       [-0.00731055, -0.02146861, -0.0086183 , ...,  0.01371687,
         0.00220051, -0.02109133],
       [-0.00458093, -0.01793976, -0.00837809, ..., -0.01039833,
        -0.00947752, -0.00122473]], dtype=float32)), ('base_model.model.model.layers.0.self_attn.q_proj', 'default', array([[ 0.00525719, -0.00711148, -0.01912612, ..., -0.01110874,
        -0.00175554, -0.0036114 ],
       [-0.01283539, -0.01090047,  0.00405082, ..., -0.00630049,
         0.02637058, -0.00824784],
       [ 0.00883863, -0.00513361, -0.0184

100%|██████████| 1/1 [07:00<00:00, 420.15s/it]


In [3]:
final_answer

["Context: I just told them that the most important part is that you are responsible about it. That we were always taught to respect it and when we were old enough we were taught the proper way to use it. They didn't have much to say about that. Question: What kind of object might require us to respect it and learn the proper way to use it? \nQuestion: What might be the most important part of it?We are responsible for it.. \nFact: We were taught the proper way to use it when we were old enough.I just told them that the most important part is that you are responsible about it. That we were always taught to respect it and when we were old enough we were taught the proper way to use it. They didn't have much to say about that.Question: What kind of object might require us to respect it and learn the proper way to use it? A car. Question: What might be the most important part of it?We are responsible for it. Fact: We were taught the proper way to use it when we were old enough.I just told 

In [23]:
file_path = "/home/pranav24/cs-546-project/SSR/Latest_Weights/QA_Weights/task024_cosmosqa_answer_generation.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Extract input-output pairs from JSON
instances = data["Instances"][2500:5000]
inputs = [instance["input"] for instance in instances]
outputs = [instance["output"][0] for instance in instances]

# Split the data into train and test sets
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    inputs, outputs, test_size=0.2, random_state=42
)

In [25]:
print(test_inputs[0])

Context: When Derek joined her , the door man unhooked the rope to let them through , still watching Lucy with undisguised lust as they walked in . A look of utter confusion shown on Derek 's features as they were granted entrance into the club . He draped an arm over Lucy 's shoulders and they made their way through the crowds of people . " So how 'd you do that ?. 
Question: What may be the reason for Derek 's confusion ?
