In [3]:
from utils.models import get_tokenizer, CustomModel
from utils.evaluation import evaluate_reward_model
from utils.data_preprocessing import CustomDataset, transform_df_for_dpo
from utils.loss_functions import RewardLoss
from utils.utils import get_reward
from utils.preference_generation import determine_preference

import torch
from torch.utils.data import DataLoader
from transformers import get_cosine_schedule_with_warmup

from tqdm import tqdm
import pandas as pd
import numpy as np
import gc, os
import glob

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [4]:
# --- Hyper-parameters ---
# Central configuration cell: everything a reader might tune lives here.
wdir = '.'
# NOTE(review): MAX_LEN is not referenced by any cell visible in this notebook;
# `max_length` below is the value actually passed to the reward/eval helpers.
MAX_LEN = 1024
QUALITY = 'high'                        # quality tier of the preference data to train on
OUTPUT_DIR = f"{wdir}/models/Reward_{QUALITY}"   # where the best reward-model checkpoint is written
BATCH_SIZE = 16
GRAD_ACCUM = 2                          # micro-batches accumulated per optimizer update
LR = 1e-5
EPOCHS = 12

# --- Scheduler horizon ---
dataset_size = 10000                    # number of leading rows used for training; the rest is validation
effective_batch_size = BATCH_SIZE * GRAD_ACCUM   # 16 * 2 = 32 samples per optimizer update
# Optimizer updates per epoch, rounded up. Ceiling division avoids counting one
# extra update per epoch when dataset_size is an exact multiple of the
# effective batch size (the previous `// ... + 1` form did).
updates_per_epoch = (dataset_size + effective_batch_size - 1) // effective_batch_size
TOTAL_STEP = updates_per_epoch * EPOCHS  # 313 * 12 = 3756 optimizer updates (upper bound)
device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Training loop cadence ---
# Both values are expressed in micro-batches (forward/backward passes), which is
# the unit the training loop's step counter uses.
eval_steps = GRAD_ACCUM * 20            # validate every 20 optimizer updates
score_steps = GRAD_ACCUM * 200          # every 200 optimizer updates (not used by the cells visible here)
early_stopping_patience = 3             # consecutive non-improving validations before stopping

prompt_length = 20
max_length = 196                        # length limit handed to get_reward / evaluate_reward_model

# Seeded generator and scale forwarded to determine_preference when labelling pairs.
rng = np.random.default_rng(42)
scale = 1.0

In [5]:

# Build the reward model and its tokenizer from the same checkpoint name.
model_name = 'google/gemma-3-270m'

# Second positional argument is 1; presumably the size of the output head
# (a single scalar reward) -- confirm against CustomModel.
reward_model = CustomModel(model_name, 1)
reward_model = reward_model.to(device)

tok = get_tokenizer(model_name)

In [6]:
# Folder that holds the preference CSV for the selected quality tier.
path = f'{wdir}/data/'

# The pattern has no wildcard, so this matches at most the single file
# dpo_data_<QUALITY>.csv; the list form is kept so several files could be
# concatenated if the pattern is ever widened.
all_files = glob.glob(path + f"dpo_data_{QUALITY}.csv")
print(all_files)

# Fail early with a clear message instead of letting pd.concat raise an
# opaque error on an empty list.
if not all_files:
    raise FileNotFoundError(
        f"No file matching dpo_data_{QUALITY}.csv found in {path}"
    )

# Read every matched CSV and stack them row-wise into one frame.
list_of_dfs = [pd.read_csv(file) for file in all_files]
df = pd.concat(list_of_dfs, ignore_index=True)

# Label each row with a preference; `scale` and the seeded `rng` are forwarded
# as extra positional arguments to determine_preference.
df['pref'] = df.apply(
    determine_preference,
    axis=1,
    args=(scale, rng),
)
print(df.pref.value_counts())

# --- DATA PROCESS ---
df = transform_df_for_dpo(df)

['/content/drive/Othercomputers/My MacBook Pro/Oracle DPO/data/dpo_data_high.csv']
pref
1 > 2    6021
2 > 1    5979
Name: count, dtype: int64


In [7]:
# --- Create DataLoaders ---
# The first `dataset_size` rows are used for training, everything after for validation.
# The two zero tensors per dataset are placeholders sized to the split; their
# meaning is defined by CustomDataset (not shown here).
train_dataset = CustomDataset(
    df[:dataset_size],
    torch.zeros(dataset_size),
    torch.zeros(dataset_size),
)

n_eval = len(df) - dataset_size
eval_dataset = CustomDataset(
    df[dataset_size:],
    torch.zeros(n_eval),
    torch.zeros(n_eval),
)

# Identity collate: each batch is handed to the training loop as the raw list
# of dataset items rather than being stacked into tensors.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=lambda batch: batch,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=lambda batch: batch,
)

In [8]:
# --- Training Components ---
loss_fn = RewardLoss()

# AdamW receives every parameter returned by reward_model.parameters().
# NOTE(review): no filtering on requires_grad happens here -- confirm whether
# CustomModel freezes anything.
adamw_settings = dict(
    lr=LR,
    betas=(0.9, 0.95),
    eps=1e-8,
    weight_decay=0.01,
)
optimizer = torch.optim.AdamW(reward_model.parameters(), **adamw_settings)

# Cosine schedule with warm-up over the first 3% of the planned optimizer updates.
warmup_steps = int(0.03 * TOTAL_STEP)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=TOTAL_STEP,
)

In [None]:
# --- Early Stopping and Model Saving Variables ---
patience_counter = 0
# global_step counts micro-batches (one forward/backward pass each), NOT
# optimizer updates. eval_steps is defined in the same unit, so validation
# runs every eval_steps / GRAD_ACCUM optimizer updates.
global_step = 0
best_model_dir = os.path.join(OUTPUT_DIR, "best_model")

# Baseline validation loss before any update; a checkpoint is only written
# once a later evaluation beats it.
best_eval_loss = evaluate_reward_model(reward_model, eval_dataloader, loss_fn, tok, device, max_length)
print(f"\nStep {global_step}: Validation Loss = {best_eval_loss:.4f}")

# Start from clean gradients: if this cell was interrupted mid-accumulation and
# is re-run, global_step restarts at 0 while stale gradients would otherwise
# leak into the first optimizer update.
optimizer.zero_grad()

print("\n--- Starting Training ---")
for epoch in range(EPOCHS):

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        # Set train mode on every iteration; this undoes any mode change made
        # by the evaluation call below (presumably it switches to eval mode).
        reward_model.train()

        # --- Forward Pass ---
        # One reward per response in the pair; the batch is the raw list of
        # dataset items produced by the identity collate function.
        r1 = get_reward(reward_model, tok, batch, 'first_responses', device, max_length)
        r2 = get_reward(reward_model, tok, batch, 'second_responses', device, max_length)

        # Stack the per-item preference labels into one tensor.
        choices = torch.stack([item['choices'] for item in batch])

        loss = loss_fn(
            r1,
            r2,
            choices.to(device)
        )

        # --- Scale the Loss and Backpropagate ---
        # Dividing by GRAD_ACCUM makes the summed gradients an average over
        # the accumulated micro-batches.
        loss = loss / GRAD_ACCUM
        loss.backward()

        # --- Optimizer Step ---
        # Update once every GRAD_ACCUM micro-batches. The counter is global,
        # so accumulation continues across epoch boundaries.
        if (global_step + 1) % GRAD_ACCUM == 0:
            torch.nn.utils.clip_grad_norm_(reward_model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # --- Evaluation and Early Stopping Logic ---
        global_step += 1  # incremented for every micro-batch
        if global_step % eval_steps == 0:
            eval_loss = evaluate_reward_model(reward_model, eval_dataloader, loss_fn, tok, device, max_length)
            print(f"\nStep {global_step}: Validation Loss = {eval_loss:.4f}")

            if eval_loss < best_eval_loss:
                # New best: overwrite the single "best_model" checkpoint.
                print(f"\nValidation loss improved from {best_eval_loss} to {eval_loss}. Saving model...")
                reward_model.save_pretrained(best_model_dir)
                best_eval_loss = eval_loss
                patience_counter = 0  # Reset patience
            else:
                patience_counter += 1
                print(f"\nValidation loss did not improve. Patience: {patience_counter}/{early_stopping_patience}")

            if patience_counter >= early_stopping_patience:
                print("\nEarly stopping triggered.")
                break

        # Release Python garbage and cached CUDA blocks after every micro-batch.
        gc.collect()
        torch.cuda.empty_cache()

    # Propagate the early-stopping break out of the epoch loop as well.
    if patience_counter >= early_stopping_patience:
        break

print("\n--- Training Finished ---")


Step 0: Validation Loss = 1.1852

--- Starting Training ---


Epoch 1:   6%|▌         | 39/625 [00:32<07:57,  1.23it/s]


Step 40: Validation Loss = 1.1828

Validation loss improved from 1.185245222568512 to 1.182830538749695. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  13%|█▎        | 79/625 [01:36<07:29,  1.21it/s]


Step 80: Validation Loss = 1.1738

Validation loss improved from 1.182830538749695 to 1.1737848720550537. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  19%|█▉        | 119/625 [02:41<06:53,  1.22it/s]


Step 120: Validation Loss = 1.1588

Validation loss improved from 1.1737848720550537 to 1.1587619705200196. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  25%|██▌       | 159/625 [03:46<06:22,  1.22it/s]


Step 160: Validation Loss = 1.1398

Validation loss improved from 1.1587619705200196 to 1.1398430933952333. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  32%|███▏      | 199/625 [04:51<05:51,  1.21it/s]


Step 200: Validation Loss = 1.1152

Validation loss improved from 1.1398430933952333 to 1.1151994438171386. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  38%|███▊      | 239/625 [05:56<05:12,  1.24it/s]


Step 240: Validation Loss = 1.0886

Validation loss improved from 1.1151994438171386 to 1.0886326134204865. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  45%|████▍     | 279/625 [07:01<04:38,  1.24it/s]


Step 280: Validation Loss = 1.0597

Validation loss improved from 1.0886326134204865 to 1.059672040939331. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  51%|█████     | 319/625 [08:06<04:09,  1.23it/s]


Step 320: Validation Loss = 1.0359

Validation loss improved from 1.059672040939331 to 1.0359182860851288. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  57%|█████▋    | 359/625 [09:11<03:37,  1.22it/s]


Step 360: Validation Loss = 1.0116

Validation loss improved from 1.0359182860851288 to 1.011569625377655. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  64%|██████▍   | 399/625 [10:16<03:04,  1.22it/s]


Step 400: Validation Loss = 0.9873

Validation loss improved from 1.011569625377655 to 0.9872855415344238. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  70%|███████   | 439/625 [11:20<02:30,  1.24it/s]


Step 440: Validation Loss = 0.9641

Validation loss improved from 0.9872855415344238 to 0.9641086857318878. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  77%|███████▋  | 479/625 [12:25<02:00,  1.22it/s]


Step 480: Validation Loss = 0.9397

Validation loss improved from 0.9641086857318878 to 0.9396820120811462. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  83%|████████▎ | 519/625 [13:30<01:26,  1.22it/s]


Step 520: Validation Loss = 0.9166

Validation loss improved from 0.9396820120811462 to 0.9165822377204895. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  89%|████████▉ | 559/625 [14:35<00:53,  1.23it/s]


Step 560: Validation Loss = 0.8968

Validation loss improved from 0.9165822377204895 to 0.8968178353309632. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1:  96%|█████████▌| 599/625 [15:40<00:21,  1.21it/s]


Step 600: Validation Loss = 0.8791

Validation loss improved from 0.8968178353309632 to 0.8790699784755707. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 1: 100%|██████████| 625/625 [16:33<00:00,  1.59s/it]
Epoch 2:   2%|▏         | 14/625 [00:11<08:25,  1.21it/s]


Step 640: Validation Loss = 0.8650

Validation loss improved from 0.8790699784755707 to 0.8650254328250885. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:   9%|▊         | 54/625 [01:16<07:51,  1.21it/s]


Step 680: Validation Loss = 0.8482

Validation loss improved from 0.8650254328250885 to 0.8482384884357452. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  15%|█▌        | 94/625 [02:21<07:10,  1.23it/s]


Step 720: Validation Loss = 0.8363

Validation loss improved from 0.8482384884357452 to 0.8363179321289063. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  21%|██▏       | 134/625 [03:26<06:46,  1.21it/s]


Step 760: Validation Loss = 0.8262

Validation loss improved from 0.8363179321289063 to 0.8261508009433747. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  28%|██▊       | 174/625 [04:31<06:07,  1.23it/s]


Step 800: Validation Loss = 0.8161

Validation loss improved from 0.8261508009433747 to 0.8160845410823822. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  34%|███▍      | 214/625 [05:36<05:38,  1.21it/s]


Step 840: Validation Loss = 0.8063

Validation loss improved from 0.8160845410823822 to 0.8062970585823059. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  41%|████      | 254/625 [06:40<05:02,  1.23it/s]


Step 880: Validation Loss = 0.7954

Validation loss improved from 0.8062970585823059 to 0.7953723678588868. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  47%|████▋     | 294/625 [07:45<04:30,  1.22it/s]


Step 920: Validation Loss = 0.7839

Validation loss improved from 0.7953723678588868 to 0.7839443027973175. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  53%|█████▎    | 334/625 [08:50<04:00,  1.21it/s]


Step 960: Validation Loss = 0.7708

Validation loss improved from 0.7839443027973175 to 0.7707655649185181. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  60%|█████▉    | 374/625 [09:56<03:27,  1.21it/s]


Step 1000: Validation Loss = 0.7616

Validation loss improved from 0.7707655649185181 to 0.7616042547225952. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  66%|██████▌   | 414/625 [11:00<02:51,  1.23it/s]


Step 1040: Validation Loss = 0.7523

Validation loss improved from 0.7616042547225952 to 0.752272203207016. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  73%|███████▎  | 454/625 [12:06<02:22,  1.20it/s]


Step 1080: Validation Loss = 0.7428

Validation loss improved from 0.752272203207016 to 0.7428235683441162. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  79%|███████▉  | 494/625 [13:11<01:46,  1.23it/s]


Step 1120: Validation Loss = 0.7334

Validation loss improved from 0.7428235683441162 to 0.7334017810821534. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  85%|████████▌ | 534/625 [14:16<01:14,  1.23it/s]


Step 1160: Validation Loss = 0.7246

Validation loss improved from 0.7334017810821534 to 0.7246115810871124. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  92%|█████████▏| 574/625 [15:20<00:41,  1.22it/s]


Step 1200: Validation Loss = 0.7172

Validation loss improved from 0.7246115810871124 to 0.7171773688793183. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2:  98%|█████████▊| 614/625 [16:25<00:08,  1.23it/s]


Step 1240: Validation Loss = 0.7066

Validation loss improved from 0.7171773688793183 to 0.7066478977203369. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 2: 100%|██████████| 625/625 [17:06<00:00,  1.64s/it]
Epoch 3:   5%|▍         | 29/625 [00:23<08:06,  1.22it/s]


Step 1280: Validation Loss = 0.6958

Validation loss improved from 0.7066478977203369 to 0.6958098232746124. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  11%|█         | 69/625 [01:28<07:35,  1.22it/s]


Step 1320: Validation Loss = 0.6874

Validation loss improved from 0.6958098232746124 to 0.6874416098594666. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  17%|█▋        | 109/625 [02:33<07:01,  1.22it/s]


Step 1360: Validation Loss = 0.6767

Validation loss improved from 0.6874416098594666 to 0.6766528949737549. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  24%|██▍       | 149/625 [03:38<06:27,  1.23it/s]


Step 1400: Validation Loss = 0.6644

Validation loss improved from 0.6766528949737549 to 0.664355153799057. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  30%|███       | 189/625 [04:43<05:55,  1.22it/s]


Step 1440: Validation Loss = 0.6554

Validation loss improved from 0.664355153799057 to 0.6553771061897278. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  37%|███▋      | 229/625 [05:47<05:26,  1.21it/s]


Step 1480: Validation Loss = 0.6470

Validation loss improved from 0.6553771061897278 to 0.6470438721179962. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  43%|████▎     | 269/625 [06:52<04:51,  1.22it/s]


Step 1520: Validation Loss = 0.6402

Validation loss improved from 0.6470438721179962 to 0.6402103655338287. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  49%|████▉     | 309/625 [07:57<04:14,  1.24it/s]


Step 1560: Validation Loss = 0.6314

Validation loss improved from 0.6402103655338287 to 0.6314000465869903. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  56%|█████▌    | 349/625 [09:02<03:47,  1.21it/s]


Step 1600: Validation Loss = 0.6251

Validation loss improved from 0.6314000465869903 to 0.6251292107105255. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  62%|██████▏   | 389/625 [10:07<03:18,  1.19it/s]


Step 1640: Validation Loss = 0.6181

Validation loss improved from 0.6251292107105255 to 0.6181126239299775. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  69%|██████▊   | 429/625 [11:13<02:43,  1.20it/s]


Step 1680: Validation Loss = 0.6130

Validation loss improved from 0.6181126239299775 to 0.6130406496524811. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  75%|███████▌  | 469/625 [12:18<02:07,  1.23it/s]


Step 1720: Validation Loss = 0.6084

Validation loss improved from 0.6130406496524811 to 0.6083861331939697. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  81%|████████▏ | 509/625 [13:23<01:35,  1.22it/s]


Step 1760: Validation Loss = 0.6032

Validation loss improved from 0.6083861331939697 to 0.6031974120140076. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  88%|████████▊ | 549/625 [14:28<01:00,  1.25it/s]


Step 1800: Validation Loss = 0.6000

Validation loss improved from 0.6031974120140076 to 0.6000061695575714. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3:  94%|█████████▍| 589/625 [15:33<00:29,  1.22it/s]


Step 1840: Validation Loss = 0.5987

Validation loss improved from 0.6000061695575714 to 0.598684506893158. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 3: 100%|██████████| 625/625 [16:34<00:00,  1.59s/it]
Epoch 4:   1%|          | 4/625 [00:03<08:26,  1.23it/s]


Step 1880: Validation Loss = 0.5947

Validation loss improved from 0.598684506893158 to 0.5947074162960052. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:   7%|▋         | 44/625 [01:07<07:55,  1.22it/s]


Step 1920: Validation Loss = 0.5912

Validation loss improved from 0.5947074162960052 to 0.5911607253551483. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  13%|█▎        | 84/625 [02:12<07:14,  1.25it/s]


Step 1960: Validation Loss = 0.5900

Validation loss improved from 0.5911607253551483 to 0.5899518730640412. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  20%|█▉        | 124/625 [03:17<06:47,  1.23it/s]


Step 2000: Validation Loss = 0.5867

Validation loss improved from 0.5899518730640412 to 0.5867218179702759. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  26%|██▌       | 164/625 [04:22<06:12,  1.24it/s]


Step 2040: Validation Loss = 0.5837

Validation loss improved from 0.5867218179702759 to 0.5836510591506958. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  33%|███▎      | 204/625 [05:26<05:39,  1.24it/s]


Step 2080: Validation Loss = 0.5814

Validation loss improved from 0.5836510591506958 to 0.581360092639923. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  39%|███▉      | 244/625 [06:31<05:07,  1.24it/s]


Step 2120: Validation Loss = 0.5800

Validation loss improved from 0.581360092639923 to 0.5799989387989044. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  45%|████▌     | 284/625 [07:36<04:37,  1.23it/s]


Step 2160: Validation Loss = 0.5778

Validation loss improved from 0.5799989387989044 to 0.5778457551002503. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  52%|█████▏    | 324/625 [08:40<04:06,  1.22it/s]


Step 2200: Validation Loss = 0.5762

Validation loss improved from 0.5778457551002503 to 0.5762014937400818. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  58%|█████▊    | 364/625 [09:45<03:35,  1.21it/s]


Step 2240: Validation Loss = 0.5739

Validation loss improved from 0.5762014937400818 to 0.5738753635883331. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  65%|██████▍   | 404/625 [10:50<03:01,  1.22it/s]


Step 2280: Validation Loss = 0.5726

Validation loss improved from 0.5738753635883331 to 0.5726211800575256. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  71%|███████   | 444/625 [11:55<02:25,  1.24it/s]


Step 2320: Validation Loss = 0.5724

Validation loss improved from 0.5726211800575256 to 0.5724491498470307. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  77%|███████▋  | 484/625 [13:00<01:53,  1.24it/s]


Step 2360: Validation Loss = 0.5717

Validation loss improved from 0.5724491498470307 to 0.5717455012798309. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  84%|████████▍ | 524/625 [14:05<01:22,  1.23it/s]


Step 2400: Validation Loss = 0.5714

Validation loss improved from 0.5717455012798309 to 0.5714235010147095. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  90%|█████████ | 564/625 [15:09<00:49,  1.22it/s]


Step 2440: Validation Loss = 0.5671

Validation loss improved from 0.5714235010147095 to 0.567115920305252. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 4:  97%|█████████▋| 605/625 [16:47<03:27, 10.40s/it]


Step 2480: Validation Loss = 0.5689

Validation loss did not improve. Patience: 1/3


Epoch 4: 100%|██████████| 625/625 [17:03<00:00,  1.64s/it]
Epoch 5:   3%|▎         | 19/625 [00:15<08:07,  1.24it/s]


Step 2520: Validation Loss = 0.5662

Validation loss improved from 0.567115920305252 to 0.5661882257461548. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:   9%|▉         | 59/625 [01:20<07:45,  1.22it/s]


Step 2560: Validation Loss = 0.5642

Validation loss improved from 0.5661882257461548 to 0.5642106273174285. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  16%|█▌        | 100/625 [02:57<1:30:56, 10.39s/it]


Step 2600: Validation Loss = 0.5647

Validation loss did not improve. Patience: 1/3


Epoch 5:  22%|██▏       | 139/625 [03:29<06:36,  1.23it/s]


Step 2640: Validation Loss = 0.5632

Validation loss improved from 0.5642106273174285 to 0.5632181458473206. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  29%|██▉       | 180/625 [05:06<1:16:55, 10.37s/it]


Step 2680: Validation Loss = 0.5635

Validation loss did not improve. Patience: 1/3


Epoch 5:  35%|███▌      | 219/625 [05:38<05:30,  1.23it/s]


Step 2720: Validation Loss = 0.5616

Validation loss improved from 0.5632181458473206 to 0.561628082036972. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  41%|████▏     | 259/625 [06:43<05:01,  1.21it/s]


Step 2760: Validation Loss = 0.5594

Validation loss improved from 0.561628082036972 to 0.5593965933322906. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  48%|████▊     | 299/625 [07:48<04:24,  1.23it/s]


Step 2800: Validation Loss = 0.5588

Validation loss improved from 0.5593965933322906 to 0.5587833433151245. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  54%|█████▍    | 339/625 [08:52<03:49,  1.25it/s]


Step 2840: Validation Loss = 0.5586

Validation loss improved from 0.5587833433151245 to 0.5586131584644317. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  61%|██████    | 379/625 [09:57<03:23,  1.21it/s]


Step 2880: Validation Loss = 0.5577

Validation loss improved from 0.5586131584644317 to 0.5577333941459656. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  67%|██████▋   | 419/625 [11:02<02:48,  1.23it/s]


Step 2920: Validation Loss = 0.5564

Validation loss improved from 0.5577333941459656 to 0.5564272837638855. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  73%|███████▎  | 459/625 [12:07<02:17,  1.21it/s]


Step 2960: Validation Loss = 0.5560

Validation loss improved from 0.5564272837638855 to 0.5560234136581421. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  80%|███████▉  | 499/625 [13:12<01:41,  1.24it/s]


Step 3000: Validation Loss = 0.5549

Validation loss improved from 0.5560234136581421 to 0.5549192559719086. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5:  86%|████████▋ | 540/625 [14:49<14:42, 10.38s/it]


Step 3040: Validation Loss = 0.5555

Validation loss did not improve. Patience: 1/3


Epoch 5:  93%|█████████▎| 580/625 [15:54<07:48, 10.42s/it]


Step 3080: Validation Loss = 0.5552

Validation loss did not improve. Patience: 2/3


Epoch 5:  99%|█████████▉| 619/625 [16:26<00:04,  1.24it/s]


Step 3120: Validation Loss = 0.5538

Validation loss improved from 0.5549192559719086 to 0.553813348531723. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 5: 100%|██████████| 625/625 [17:03<00:00,  1.64s/it]
Epoch 6:   6%|▌         | 35/625 [01:00<1:42:09, 10.39s/it]


Step 3160: Validation Loss = 0.5550

Validation loss did not improve. Patience: 1/3


Epoch 6:  12%|█▏        | 74/625 [01:32<07:38,  1.20it/s]


Step 3200: Validation Loss = 0.5533

Validation loss improved from 0.553813348531723 to 0.5532972946166992. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 6:  18%|█▊        | 115/625 [03:10<1:28:13, 10.38s/it]


Step 3240: Validation Loss = 0.5541

Validation loss did not improve. Patience: 1/3


Epoch 6:  25%|██▍       | 154/625 [03:42<06:28,  1.21it/s]


Step 3280: Validation Loss = 0.5533

Validation loss improved from 0.5532972946166992 to 0.5532647149562836. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 6:  31%|███       | 194/625 [04:47<05:51,  1.23it/s]


Step 3320: Validation Loss = 0.5529

Validation loss improved from 0.5532647149562836 to 0.5529279625415802. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 6:  37%|███▋      | 234/625 [05:52<05:20,  1.22it/s]


Step 3360: Validation Loss = 0.5522

Validation loss improved from 0.5529279625415802 to 0.5522204844951629. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 6:  44%|████▍     | 274/625 [06:57<04:47,  1.22it/s]


Step 3400: Validation Loss = 0.5514

Validation loss improved from 0.5522204844951629 to 0.5513839502334594. Saving model...
Saving model weights to /content/drive/Othercomputers/My MacBook Pro/Oracle DPO/models/Reward_high/best_model


Epoch 6:  50%|█████     | 315/625 [08:34<53:35, 10.37s/it]


Step 3440: Validation Loss = 0.5517

Validation loss did not improve. Patience: 1/3


Epoch 6:  57%|█████▋    | 355/625 [09:38<46:43, 10.38s/it]


Step 3480: Validation Loss = 0.5543

Validation loss did not improve. Patience: 2/3


Epoch 6:  63%|██████▎   | 394/625 [10:43<06:17,  1.63s/it]


Step 3520: Validation Loss = 0.5556

Validation loss did not improve. Patience: 3/3

Early stopping triggered.

--- Training Finished ---



