In [1]:
import gc

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModel

from dataset import TextDataset
from model import DecisionModel, RewardModel

# Preferred GPU for this notebook. On a CUDA host that exposes fewer devices
# than GPU_INDEX + 1, fall back to GPU 0 instead of producing a device that
# only fails later when the model is moved; without CUDA, run on the CPU.
GPU_INDEX = 2
if torch.cuda.is_available():
    _gpu = GPU_INDEX if GPU_INDEX < torch.cuda.device_count() else 0
    device = torch.device(f"cuda:{_gpu}")
else:
    device = torch.device("cpu")

# Release cached CUDA blocks and collect Python garbage left over from earlier
# runs in this kernel. gc.collect() stays last so the cell still displays the
# number of collected objects.
torch.cuda.empty_cache()
gc.collect()

60

In [2]:
# Load the test split and replace the textual `chosen` label with a class id.
df_test = pd.read_csv("data/calcs_test_split.csv")

# Label -> class id. A label missing from this table raises KeyError below.
chosen_class = dict(sent_1=0, sent_2=1, tie=2)
df_test["chosen"] = df_test["chosen"].apply(chosen_class.__getitem__)

In [3]:
# Single source of truth for the pretrained backbone: the tokenizer and the
# encoder are both loaded from MODEL so they cannot drift apart.
MODEL = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Reuse the EOS token for padding (the GPT-2 tokenizer defines no pad token
# by default).
tokenizer.pad_token = tokenizer.eos_token

enc_model = AutoModel.from_pretrained(MODEL)

# RewardModel is built from the encoder plus the DecisionModel head.
dm = DecisionModel()
rm = RewardModel(enc_model, dm, device)
# Last expression of the cell: moves the model to `device` and displays it.
rm.to(device)

RewardModel(
  (enc_model): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (decision): DecisionModel(
    (fc1): Linear(in_features=768, out_features=64, bias=True)
    (fc2): Linear(in_features=64, out_

In [4]:
# Evaluation batches over the test frame, served in dataset order
# (shuffle=False is DataLoader's default, spelled out here for the reader).
batch_size = 8
test_dataset = TextDataset(df_test, tokenizer)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [5]:
# Restore the trained weights. map_location remaps the stored tensors onto
# `device`, so a checkpoint saved on a different GPU still loads here.
checkpoint = torch.load("ckpt/gpt2_rm_epoch5.pth", map_location=device)
rm.load_state_dict(checkpoint['model_state_dict'])

# Inference mode for the modules (e.g. disables dropout).
rm.eval()

all_preds = []         # arg-max class id per example (plain ints)
chosen = []            # reference class id per example
pred_model_probs = []  # per-example class probabilities (lists of floats)

# No gradients are needed for evaluation; skipping autograd bookkeeping keeps
# memory flat across batches.
with torch.no_grad():
    for batch in test_dataloader:
        # batch[2] is used as the reference labels -- TODO confirm against
        # TextDataset's item layout.
        chosen += batch[2].tolist()

        # The model also returns labels; they are not needed here.
        out, _labels = rm(batch)
        probs = F.softmax(out, dim=1)

        pred_model_probs += probs.to("cpu").tolist()
        # .tolist() yields Python ints directly instead of keeping one
        # 0-d device tensor per example alive until the end of the loop.
        all_preds += probs.argmax(dim=1).tolist()

In [6]:
# Class id that stands for "tie" (same encoding as the `chosen` labels).
TIE_CLASS = 2


def predict_with_tie_margin(probs, margin):
    """Turn per-example class probabilities into class ids with a tie rule.

    An example is labelled TIE_CLASS when the absolute gap between the
    probabilities of class 1 and class 0 is at most `margin`; otherwise the
    arg-max class over the whole probability row is used.

    Args:
        probs: sequence of per-example probability rows (indexable by class).
        margin: largest |p[1] - p[0]| that is still called a tie.

    Returns:
        A list of int class ids, one per row of `probs`.
    """
    return [
        TIE_CLASS if abs(row[1] - row[0]) <= margin else int(np.argmax(row))
        for row in probs
    ]


# NOTE(review): the margin is selected on the same test split it is scored
# on, so the numbers printed below are optimistic. Choose `best_diff` on a
# validation split if these figures are to be reported.
diffs = np.arange(0, 1, 0.05)
f1s = [
    f1_score(chosen, predict_with_tie_margin(pred_model_probs, diff), average='macro')
    for diff in diffs
]

# First margin that reaches the best macro-F1 (ties resolve to the smallest).
best_f1 = max(f1s)
best_diff = diffs[f1s.index(best_f1)]
print("Best f1:", best_f1 * 100)

custom_preds = predict_with_tie_margin(pred_model_probs, best_diff)

acc = [pred == target for pred, target in zip(custom_preds, chosen)]
print("Accuracy:", sum(acc) / len(custom_preds))

Best f1: 52.337753851574575
Accuracy: 0.5281045751633987
