In [None]:
import torch
import torch_xla.core.xla_model as xm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the tokenizer and the causal-LM weights, then move the model to the XLA device.
model_name = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(xm.xla_device())
model

## Prompt Generation

In [None]:
# Tokenize a short free-form prompt and place the token ids on the XLA device.
prompt = "The future of AI is"
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(xm.xla_device())
input_ids

In [None]:
# Greedy decoding: repeatedly feed the whole sequence back into the model and
# append the highest-scoring next token, for at most `num_steps` tokens.
num_steps = 10
generated_ids = input_ids
# May be None for tokenizers that define no end-of-sequence token.
eos_token_id = tokenizer.eos_token_id

# Inference only, so gradient tracking is disabled once for the whole loop.
with torch.no_grad():
    for step in range(num_steps):
        outputs = model(generated_ids)
        logits = outputs.logits

        # Scores for the position following the last token of the sequence.
        next_token_logits = logits[:, -1, :]
        next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)

        generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

        # Stop once every sequence in the batch has emitted end-of-sequence;
        # anything generated after that point is not part of the answer.
        if eos_token_id is not None and bool((next_token_id == eos_token_id).all()):
            break

generated_ids

In [None]:
# Remove size-1 dimensions before decoding; special tokens are left out of the text.
token_sequence = generated_ids.squeeze()
generated_text = tokenizer.decode(token_sequence, skip_special_tokens=True)
print("Generated text:\n", generated_text)

## Multiple-Choice Prompt

In [None]:
# Multiple-choice prompt; it ends right after "Answer:" so the next predicted
# token is the model's answer.
prompt = """
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

Choices:
A. 22.0
B. 64.0
C. 18.0
D. 12.0
Answer:"""

# Encode the prompt as PyTorch tensors and move the token ids to the XLA device
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(xm.xla_device())
input_ids

In [None]:
# Run the model once over the prompt without tracking gradients
with torch.no_grad():
    outputs = model(input_ids)
logits = outputs.logits

# Keep only the scores at the last position, i.e. for the token after the prompt
next_token_logits = logits[:, -1]

In [None]:
# Answer letters the prediction is restricted to (A, B, C, D)
valid_choices = ["A", "B", "C", "D"]

# Only the first token id of each encoded letter is kept.
# NOTE(review): the prompt ends with "Answer:", so the model may favour a
# leading-space variant of each letter; confirm that the ids of the bare
# letters are the intended targets for this tokenizer.
valid_token_ids = []
for choice in valid_choices:
    choice_token_ids = tokenizer.encode(choice, add_special_tokens=False)
    valid_token_ids.append(choice_token_ids[0])
valid_token_ids

In [None]:
# Start from a tensor of -inf and copy over only the scores of the allowed
# answer tokens, so argmax can never select a token outside that set.
allowed_logits = next_token_logits[:, valid_token_ids]
mask = torch.full_like(next_token_logits, float("-inf"))
mask[:, valid_token_ids] = allowed_logits
next_token_id = mask.argmax(dim=-1, keepdim=True)
next_token_id

In [None]:
# Turn the selected token id back into its text form
choice_token = next_token_id.squeeze()
predicted_choice = tokenizer.decode(choice_token, skip_special_tokens=True)
print("Predicted Answer:", predicted_choice)

## Running Inference

In [None]:
import os
from pathlib import Path

# Switch to the project root: the nearest directory, starting from the current
# one, that contains the `src` package imported in the following cell.
# A bare `os.chdir("../")` climbs one more level every time it runs, so
# re-running this cell (or running the notebook top to bottom) would leave the
# project. This lookup is idempotent. If no such directory is found, fall back
# to the original one-level hop.
_cwd = Path.cwd()
_project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    Path("../"),
)
os.chdir(_project_root)
os.getcwd()

In [None]:
from src.data_loader import GSM_MC_PromptBuilder
from src.models import MultipleChoiceLLM
from src.config import ConfigurationManager
from tqdm import tqdm
import pandas as pd

In [None]:
# Read the dataset and model sections from the YAML configuration.
config_file_path = "config.yaml"
config = ConfigurationManager(config_file_path=config_file_path)
dataset_config = config.get_dataset_configuration()
model_config = config.get_model_configuration()

# Keyword options for the prompt builder, all taken from the dataset configuration.
builder_options = {
    "data_files": dataset_config.data_files,
    "split": dataset_config.split,
    "max_samples": dataset_config.max_samples,
}
prompt_builder = GSM_MC_PromptBuilder(dataset_config.dataset_name, **builder_options)

In [None]:
# Instantiate MultipleChoiceLLM from the values in the model configuration.
# This rebinds `model`, which earlier cells used for the raw causal LM.
model_name, allowed_choices = model_config.model_name, model_config.allowed_choices
model = MultipleChoiceLLM(
    model_name=model_name,
    allowed_choices=allowed_choices,
)

In [None]:
# Build the prompts together with their metadata.
# Note: this rebinds `outputs`, which earlier cells used for the raw model forward pass.
outputs = prompt_builder.generate_prompts_and_metadata()

In [None]:
# Predict an answer for every sample and collect one flat record per sample.
results = []
for sample in outputs:
    prompt = sample["prompt"]
    prediction = model.predict(prompt)

    record = {
        "sample_id": sample["sample_id"],
        "question": sample["question"],
    }
    # A choice letter missing from the sample is stored as an empty string.
    for letter in ("A", "B", "C", "D"):
        record[f"choice_{letter}"] = sample["choices"].get(letter, "")
    record["prompt"] = sample["prompt"]
    record["answer"] = sample["answer"]
    record["prediction"] = prediction

    results.append(record)

In [None]:
# Show the collected per-sample records as a table.
pd.DataFrame(results)

In [None]:
from src.inference import ModelInferencePipeline
from src.config import ConfigurationManager
from src.common import create_directory

In [None]:
# Create the inference pipeline; no arguments are passed to its constructor.
pipeline = ModelInferencePipeline()

In [None]:
# Run the pipeline and keep whatever it returns as `df`
# (presumably a DataFrame, judging by the name; confirm in src.inference).
df = pipeline.run_inference()

## DataLoader Implementation (Multi-process)

In [None]:
import os
from datetime import datetime
from pathlib import Path

# Switch to the project root without relying on one machine's home directory:
# use the nearest directory, starting from the current one, that contains the
# `src` package imported in the following cells. The original absolute path is
# kept only as a fallback for when no such directory is found.
_cwd = Path.cwd()
_project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    Path("/home/sermengi/llm-bias-fairness-eval"),
)
os.chdir(_project_root)
os.getcwd()

In [None]:
import torch
from torch.utils.data import DataLoader

In [None]:
from src.data_loader import GSM_MC_PromptBuilder
from src.config import ConfigurationManager

In [None]:
# Load "config.yaml" (relative to the current working directory) and read its
# dataset section.
config_manager = ConfigurationManager("config.yaml")
dataset_config = config_manager.get_dataset_configuration()

In [None]:
# Build the prompt dataset from the dataset configuration.
dataset_options = {
    "data_files": dataset_config.data_files,
    "split": dataset_config.split,
    "max_samples": dataset_config.max_samples,
}
gsm_dataset = GSM_MC_PromptBuilder(dataset_config.dataset_name, **dataset_options)

In [None]:
# Unshuffled loading in batches of two, with no worker subprocesses.
loader_options = dict(batch_size=2, shuffle=False, num_workers=0)
dataloader = DataLoader(gsm_dataset, **loader_options)

In [None]:
# Preview the first three batches and the first item of each.
for i, batch in enumerate(dataloader):
    if i >= 3:
        break
    print(f"\n--- Batch {i+1} ---")
    print(f"  Sample IDs: {batch['sample_id']}")

    # Nothing more to show for a batch without prompts.
    if not len(batch['prompt']) > 0:
        continue

    print("\n  First item in batch:")
    print(f"    Sample ID: {batch['sample_id'][0]}")
    # Print first 100 chars
    print(f"    Question: {batch['question'][0][:100]}...")
    # Example of accessing choices
    print(f"    Choices: {batch['choices']['A'][0]}, {batch['choices']['B'][0]}, ...")
    # Print first 150 chars
    print(f"    Prompt: {batch['prompt'][0][:150]}...")
    print(f"    Answer: {batch['answer'][0]}")

In [None]:
# Summarize the batch preview using the DataLoader's own length rather than the
# loop variable `i`: `i` is undefined when the loader yields no batches, and it
# is below 3 only when the loader was exhausted (i.e. when no batches remain).
max_preview_batches = 3  # matches the `i >= 3` cut-off of the preview loop
num_batches = len(dataloader)

if len(gsm_dataset) > 0 and num_batches == 0:
    # The dataset is non-empty, yet the loader produces nothing.
    print(f"\nWarning: Dataset has {len(gsm_dataset)} samples, but DataLoader yielded 0 batches.")
    print("This might happen if batch_size > number of samples and drop_last=True, or other DataLoader issues.")
elif num_batches > max_preview_batches:
    # The preview stopped early, so there really are batches left unprinted.
    print(f"\nNote: Printed {max_preview_batches} batches. There might be more batches available in the DataLoader.")

## Model Implementation (Multi-process)

In [None]:
import os
from pathlib import Path

# Switch to the project root: the nearest directory, starting from the current
# one, that contains the `src` package imported in the following cell.
# A bare `os.chdir("../")` climbs one more level every time it runs, so
# re-running this cell (or running the notebook top to bottom) would leave the
# project. This lookup is idempotent. If no such directory is found, fall back
# to the original one-level hop.
_cwd = Path.cwd()
_project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    Path("../"),
)
os.chdir(_project_root)
os.getcwd()

In [None]:
from src.models import MultipleChoiceLLM

In [None]:
model_name = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"
# NOTE(review): no allowed choices are passed here; confirm how
# MultipleChoiceLLM treats an empty list.
allowed_choices = []
padding_side = "left"

llm_options = {
    "model_name": model_name,
    "allowed_choices": allowed_choices,
    "tokenizer_padding_side": padding_side,
}
llm = MultipleChoiceLLM(**llm_options)
print(f"MultipleChoiceLLM initialized. Using device: {llm.device}")

In [None]:
# Three multiple-choice prompts, written one prompt line per source line.
prompt1_text = (
    "Question: What is the capital of France?\n"
    "Choices:\n"
    "A. London\n"
    "B. Berlin\n"
    "C. Paris\n"
    "D. Madrid\n"
    "Answer: "
)
prompt2_text = (
    "Question: Which planet is known as the Red Planet?\n"
    "Choices:\n"
    "A. Earth\n"
    "B. Mars\n"
    "C. Jupiter\n"
    "D. Venus\n"
    "Answer: "
)
# A slightly different style
prompt3_text = (
    "Solve for x: 2x + 3 = 7\n"
    "Options:\n"
    "A. 1\n"
    "B. 2\n"
    "C. 3\n"
    "D. 4\n"
    "The correct option is: "
)
prompts_batch = [prompt1_text, prompt2_text, prompt3_text]

In [None]:
# Call predict with the list of three prompts.
batch_predictions = llm.predict(prompts_batch)

In [None]:
# Display what predict returned for the prompt list.
batch_predictions

In [None]:
# Call predict with a single prompt string instead of a list of prompts.
single_prompt = prompt1_text
single_prediction = llm.predict(single_prompt)

In [None]:
# Display what predict returned for the single prompt.
single_prediction

## Refactoring Context Generation Code 

In [None]:
import os
from pathlib import Path

# Switch to the project root: the nearest directory, starting from the current
# one, that contains the `src` package imported in the following cell.
# A bare `os.chdir("../")` climbs one more level every time it runs, so
# re-running this cell (or running the notebook top to bottom) would leave the
# project. This lookup is idempotent. If no such directory is found, fall back
# to the original one-level hop.
_cwd = Path.cwd()
_project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    Path("../"),
)
os.chdir(_project_root)
os.getcwd()

In [None]:
from src.config import ConfigurationManager
from src.context_generator import ContextGenerator

In [None]:
# Create a configuration manager from the main config file and the context
# templates file; both paths are relative to the current working directory.
config_manager = ConfigurationManager(
    config_file_path="config.yaml",
    context_config_file_path="configs/context_templates.yaml"
)

In [None]:
# Read the contexts configuration from the configuration manager.
context_config = config_manager.get_contexts_configuration()

In [None]:
# Create the context generator from the contexts configuration.
context_generator = ContextGenerator(context_config)

In [None]:
# Generate the contexts; any return value is shown as this cell's output.
context_generator.generate_contexts()

In [None]:
# Save the generated contexts (the destination is decided inside ContextGenerator).
context_generator.save_generated_contexts()

## Evaluating Results

In [None]:
import os
from pathlib import Path

# Switch to the project root: the nearest directory, starting from the current
# one, that contains the `src` package imported in the following cell.
# A bare `os.chdir("../")` climbs one more level every time it runs, so
# re-running this cell (or running the notebook top to bottom) would leave the
# project. This lookup is idempotent. If no such directory is found, fall back
# to the original one-level hop.
_cwd = Path.cwd()
_project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    Path("../"),
)
os.chdir(_project_root)
os.getcwd()

In [None]:
from src.config import ConfigurationManager
from src.evaluation import ModelEvaluator
import pandas as pd
import mlflow

In [None]:
# Create a configuration manager for the evaluation section.
# NOTE(review): earlier sections load "config.yaml", while this one loads
# "configs/config.yaml" — confirm which location is current.
config_manager = ConfigurationManager(
    config_file_path="configs/config.yaml",
    context_config_file_path="configs/context_templates.yaml",
)

In [None]:
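# NOTE: commented-out draft of ModelEvaluator, kept for reference only; the class used below is the one imported from src.evaluation.
# class ModelEvaluator: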
#     def __init__(self, config):
#         self.config = config
#         self.artifacts_root = self.config.artifacts_root
#         self.prediction_file_path = self.config.prediction_file_path
#         self.mlflow_run_id = self.config.mlflow_run_id
#         self.predictions_df = None

#         if not self.mlflow_run_id and not self.prediction_file_path:
#             print("Either 'mlflow_run_id' or 'prediction_file_path' must be provided in the config.")
        
#     def _get_predictions_from_mlflow(self):
#         if not self.mlflow_run_id:
#             print("MLflow Run ID not provided. Skipping MLflow artifact fetching.")
#             return None

#         try:
#             print(f"Attempting to fetch predictions from MLflow Run ID: {self.mlflow_run_id}")
#             artifact_uri = f"runs:/{self.mlflow_run_id}/{self.prediction_file_path}"
#             local_artifact_path = mlflow.artifacts.download_artifacts(
#                 artifact_uri=artifact_uri,
#             )
#             print(f"Loading predictions from MLflow artifact path: {local_artifact_path}")
#             df = pd.read_csv(local_artifact_path).sort_values(by="prompt_id").set_index("prompt_id")
#             return df
#         except Exception as e:
#             print(f"Could not fetch predictions from MLflow: {e}")
#             return None
    
#     def _get_predictions_from_local(self):
#         if not self.prediction_file_path:
#             print("Local predictions folder path not provided. Skipping local file loading.")
#             return None
        
#         if os.path.exists(self.prediction_file_path):
#             print(f"Loading predictions from local path: {self.prediction_file_path}")
#             try:
#                 df = pd.read_csv(self.prediction_file_path)
#                 return df
#             except Exception as e:
#                 print(f"Error loading local predictions file '{self.prediction_file_path}': {e}")
#                 return None
#         else:
#             print(f"Local predictions file/folder not found at: {self.prediction_file_path}")
#             return None

#     def load_predictions(self):
#         if self.mlflow_run_id:
#             self.predictions_df = self._get_predictions_from_mlflow()
        
#         if self.predictions_df is None:
#             self.predictions_df = self._get_predictions_from_local()
        
#         if self.predictions_df is None:
#             raise RuntimeError("Failed to load predictions from both MLflow and local paths.")

#         required_columns = ["context_category", "context_identity", "answer", "prediction"]
#         if not all(col in self.predictions_df.columns for col in required_columns):
#             raise ValueError(f"Missing one or more required columns in predictions file. "
#                              f"Expected: {required_columns}, Found: {self.predictions_df.columns.tolist()}")

#         return self.predictions_df

In [None]:
# NOTE(review): `config` is not assigned in this section (only `config_manager`
# is); its last assignment is the ConfigurationManager created in the inference
# section. Confirm that ModelEvaluator should receive that object rather than
# an evaluation-specific configuration.
model_eval = ModelEvaluator(config)

In [None]:
# Load the predictions; any return value is shown as this cell's output.
model_eval.load_predictions()

In [None]:
# NOTE(review): `df` was last assigned from pipeline.run_inference() in the
# inference section, not from `model_eval` — confirm this is the frame meant
# to be displayed here.
df