In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv #Load HuggingFace Token
from peft import PeftModel

from utils.data_loader import get_data
from utils.models import get_model, get_tokenizer, reward_model
from utils.preference_generation import determine_preference
from utils.reward_scoring import generate_output, get_rewards
load_dotenv()

In [None]:

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
batch_size = 100      # how many base_data entries each loop iteration slices off
for_sft = False       # not read in this section; presumably an SFT-mode switch — TODO confirm
for_dpo = True        # not read in this section; presumably a DPO-mode switch — TODO confirm
prompt_length = 20    # forwarded unchanged to generate_output
max_length = 196      # forwarded unchanged to generate_output

# ---------------------------------------------------------------------------
# Model and tokenizer
# ---------------------------------------------------------------------------
model_name = 'google/gemma-3-270m'

# Load the base checkpoint, move it to the GPU, then wrap it with the PEFT
# weights stored under models/sft/best_model.
base_model = PeftModel.from_pretrained(
    get_model(model_name).to('cuda'),
    'models/sft/best_model',
)
tok = get_tokenizer(model_name)

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# NOTE(review): the arguments look like (split, start, end) — confirm against get_data.
base_data = get_data('train', 0, 3000)

In [None]:
# Build a preference dataset: for every batch of base_data, generate two
# completions with the same model, score both with get_rewards, then label
# each pair with determine_preference and save the result as CSV.
first_responses = []
second_responses = []
first_response_scores = []
second_response_scores = []

# Walk base_data in consecutive slices of batch_size; the final slice may be shorter.
for i in range(0, len(base_data), batch_size):
    batch = base_data[i : i + batch_size]

    # Progress marker: start index plus the first 20 elements of the batch's first entry.
    print(f"Processing batch starting at index {i}: {batch[0][:20]}")

    # Both calls receive identical arguments.
    # NOTE(review): the two responses only differ if generate_output samples
    # stochastically — confirm, otherwise every pair is a tie.
    first_response = generate_output(base_model, tok, batch, prompt_length, max_length)
    second_response = generate_output(base_model, tok, batch, prompt_length, max_length)
    first_responses.extend(first_response)
    second_responses.extend(second_response)

    # Score each set of responses and accumulate alongside the texts.
    first_response_scores.extend(get_rewards(reward_model, first_response))
    second_response_scores.extend(get_rewards(reward_model, second_response))

# One row per prompt: both responses and their scores side by side.
# pandas raises ValueError here if the four lists ended up with different lengths.
df = pd.DataFrame({
    'first_response': first_responses,
    'second_response': second_responses,
    'first_response_score': first_response_scores,
    'second_response_score': second_response_scores
})

# determine_preference is called once per row (axis=1) and fills the 'pref' column.
df['pref'] = df.apply(determine_preference, axis=1)

# Persist before displaying. to_csv raises OSError when the target directory
# is missing, which would otherwise throw away the whole generation run.
output_path = Path('data/dpo_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Show a small preview only; the full frame is in the CSV.
display(df.head())