In [None]:
# Load the fine-tuned model from the local output directory.
from transformers import AutoModelForCausalLM

output_dir = "/content/output_dir"

# `bnb_config` is not defined in this cell; it is expected to already exist
# in the kernel (presumably a quantization config from an earlier cell).
# `local_files_only=True` keeps the load restricted to files found on disk.
final_model = AutoModelForCausalLM.from_pretrained(
    output_dir,
    trust_remote_code=True,
    quantization_config=bnb_config,
    local_files_only=True,
)

In [1]:
#Evaluation using Loss function
import torch
import pandas as pd
from tqdm import tqdm

# Define the inference function
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of ``text`` and return it as token ids.

    Args:
        text: Prompt string fed to the model.
        model: Causal language model exposing ``generate``, ``device`` and
            ``config`` (``generate`` is expected to return prompt + new tokens).
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of *newly generated* tokens.

    Returns:
        A 1-D tensor holding the token ids of the generated answer. The answer
        text is re-encoded with ``tokenizer.encode`` so that it is tokenized
        the same way as a reference answer encoded by the caller.
    """
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    ).to(model.device)

    # Ensure that pad_token_id is set for open-end generation
    model.config.pad_token_id = model.config.eos_token_id

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        # A single, unpadded prompt: every position is a real token, so the
        # mask is all ones (the token ids themselves are NOT a valid mask).
        attention_mask=input_ids.new_ones(input_ids.shape),
        # max_new_tokens bounds only the generated part; max_length would also
        # count the prompt and leave no room once the prompt is long.
        max_new_tokens=max_output_tokens,
        pad_token_id=model.config.eos_token_id,
    )

    # Strip the prompt by token count rather than by len(text) characters:
    # the decoded prompt need not match `text` (truncation, normalisation).
    prompt_length = input_ids.shape[1]
    answer_ids = generated_tokens_with_prompt[0, prompt_length:]
    generated_text_answer = tokenizer.decode(answer_ids, skip_special_tokens=True)

    # squeeze(0) removes only the batch axis, so a one-token answer stays 1-D.
    generated_tokens_answer = tokenizer.encode(
        generated_text_answer, return_tensors="pt"
    ).squeeze(0)

    return generated_tokens_answer

# Define a function to calculate the loss between predicted and target outputs
def calculate_loss(predicted_tokens, target_tokens, ignore_index=None):
    """Return a scalar loss comparing a prediction with its target token ids.

    Cross-entropy needs per-token *logits*; it cannot be evaluated on two
    sequences of token ids. This function therefore supports both inputs:

    * ``predicted_tokens`` is a floating-point tensor of logits shaped
      ``(..., vocab_size)`` whose leading dimensions flatten to the target
      length: standard cross-entropy against ``target_tokens``.
    * ``predicted_tokens`` is a sequence of token ids: token-level 0-1 loss,
      i.e. the fraction of positions where prediction and target differ. The
      shorter sequence is padded with ``ignore_index`` so a length mismatch is
      counted as errors; positions padded in both sequences are not counted.

    Args:
        predicted_tokens: Logits or predicted token ids (tensor or sequence).
        target_tokens: Reference token ids (tensor or sequence).
        ignore_index: Token id to ignore. When ``None``, the ``pad_token_id``
            of a module-level ``tokenizer`` is used if one exists, otherwise
            ``-100`` (the PyTorch default).

    Returns:
        The loss as a Python float.

    Raises:
        ValueError: If logits are given but their shape does not line up with
            the target sequence.
    """
    if ignore_index is None:
        # Fall back to the notebook-level tokenizer when there is one.
        pad_id = getattr(globals().get("tokenizer"), "pad_token_id", None)
        ignore_index = -100 if pad_id is None else pad_id

    predicted = torch.as_tensor(predicted_tokens).detach().cpu()
    target = torch.as_tensor(target_tokens).detach().cpu().long().reshape(-1)

    if predicted.is_floating_point() and predicted.numel() > 0:
        if predicted.dim() < 2:
            raise ValueError(
                "logits must have shape (..., vocab_size); got a tensor with "
                f"{predicted.dim()} dimension(s)"
            )
        logits = predicted.reshape(-1, predicted.shape[-1])
        if logits.shape[0] != target.shape[0]:
            raise ValueError(
                f"logits cover {logits.shape[0]} positions but the target has "
                f"{target.shape[0]} tokens"
            )
        loss = torch.nn.functional.cross_entropy(
            logits, target, ignore_index=ignore_index
        )
        return loss.item()  # Return the loss value as a scalar

    # Token-id path: reshape(-1) also turns a 0-d tensor into a 1-D one.
    predicted = predicted.long().reshape(-1)
    length = max(predicted.numel(), target.numel())

    padded_prediction = torch.full((length,), ignore_index, dtype=torch.long)
    padded_prediction[:predicted.numel()] = predicted
    padded_target = torch.full((length,), ignore_index, dtype=torch.long)
    padded_target[:target.numel()] = target

    counted = (padded_prediction != ignore_index) | (padded_target != ignore_index)
    total = int(counted.sum())
    if total == 0:
        return 0.0

    mismatched = int(((padded_prediction != padded_target) & counted).sum())
    return mismatched / total


from itertools import islice

evaluation_dataset = split_dataset["test"]

# Define an empty list to store loss values
losses = []
failed_samples = 0
num_samples_to_process = 10

# Evaluate the loss for the first `num_samples_to_process` records.
# islice walks the dataset record by record. Slicing (`dataset[:n]`) is avoided
# because, if this is a Hugging Face `datasets.Dataset`, a slice is a dict of
# columns and iterating it would yield column names instead of records.
for i, item in tqdm(
    enumerate(islice(evaluation_dataset, num_samples_to_process)),
    total=num_samples_to_process,
):
    print(f"{i} Evaluating: {item}")
    question = item['instruction']
    answer = item['output']

    try:
        predicted_tokens = inference(question, final_model, tokenizer)
        target_tokens = tokenizer.encode(answer, return_tensors="pt").squeeze()
        loss_value = calculate_loss(predicted_tokens, target_tokens)
    except Exception as exc:
        # Keep going (best effort), but report the failure instead of hiding
        # it; a bare `except:` would also swallow KeyboardInterrupt.
        failed_samples += 1
        print(f"Skipping sample {i}: {type(exc).__name__}: {exc}")
        continue

    losses.append(loss_value)

# Calculate the average loss across all predictions
average_loss = sum(losses) / len(losses) if losses else 0
print(f"Average Loss: {average_loss}")
if failed_samples:
    # Without this, an average of 0 caused by every sample failing would look
    # like a perfect score.
    print(f"Samples skipped because of errors: {failed_samples}")


In [None]:
#Evaluation by Comparison
import pandas as pd
from tqdm import tqdm

# Define a function to check exact match between answers
def is_exact_match(a, b):
    """Return True when ``a`` and ``b`` are equal after trimming whitespace.

    Leading and trailing whitespace is removed from both strings; the
    comparison itself is case-sensitive.
    """
    left, right = (value.strip() for value in (a, b))
    return left == right

# Define a function for inference
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of ``text`` and return it as a string.

    Note: this notebook defines another ``inference`` in an earlier cell that
    returns token ids; running this cell replaces it with this text-returning
    version.

    Args:
        text: Prompt string fed to the model.
        model: Causal language model exposing ``generate`` and ``device``
            (``generate`` is expected to return prompt + new tokens).
        tokenizer: Tokenizer exposing ``encode``, ``decode``, ``eos_token`` and
            ``eos_token_id``. Its ``pad_token`` is set to the EOS token.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of *newly generated* tokens.

    Returns:
        The decoded text of the newly generated tokens only (prompt excluded).
    """
    tokenizer.pad_token = tokenizer.eos_token
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    ).to(model.device)

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        # Single unpadded prompt, so every position is attended to.
        attention_mask=input_ids.new_ones(input_ids.shape),
        # max_new_tokens bounds only the generated part; max_length would also
        # count the prompt and leave no room once the prompt is long.
        max_new_tokens=max_output_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt by token count rather than by len(text) characters:
    # the decoded prompt need not match `text` (truncation, normalisation).
    prompt_length = input_ids.shape[1]
    answer_ids = generated_tokens_with_prompt[0, prompt_length:]
    generated_text_answer = tokenizer.decode(answer_ids, skip_special_tokens=True)

    return generated_text_answer

# Load the evaluation dataset

evaluation_dataset = split_dataset["test"]

# Collect one exact-match flag and one [prediction, target] pair per sample
metrics = {'exact_matches': []}
predictions = []
failed_samples = 0

# Wrapping the dataset itself (not enumerate) lets tqdm pick up its length
# and show a real progress bar.
for i, item in enumerate(tqdm(evaluation_dataset)):
    question = item['instruction']
    answer = item['output']

    try:
        predicted_answer = inference(question, final_model, tokenizer)  # Uses the fine-tuned model loaded as final_model
    except Exception as exc:
        # Keep going (best effort), but report the failure instead of hiding
        # it; a bare `except:` would also swallow KeyboardInterrupt.
        failed_samples += 1
        print(f"Skipping sample {i}: {type(exc).__name__}: {exc}")
        continue

    predictions.append([predicted_answer, answer])
    exact_match = is_exact_match(predicted_answer, answer)
    metrics['exact_matches'].append(exact_match)

print('Number of exact matches: ', sum(metrics['exact_matches']))
if failed_samples:
    # Skipped samples are absent from both the match count and the table.
    print(f"Samples skipped because of errors: {failed_samples}")
df = pd.DataFrame(predictions, columns=["predicted_output", "target_output"])
print(df)
