In [1]:
import gc
import re
import pickle

import dill
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModel, GPT2Tokenizer
from torch.optim.lr_scheduler import ExponentialLR

# Prefer the second GPU, but only when it actually exists: asking for "cuda:1"
# on a single-GPU machine yields an invalid device. Fall back to the first GPU,
# and to the CPU when CUDA is not available at all.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

# Release cached GPU memory and collect garbage left over from earlier runs in this kernel.
torch.cuda.empty_cache()
gc.collect()

# from dataset import TextDataset, clean

840

In [2]:
df_og = pd.read_parquet("cspref/data/train-00000-of-00001.parquet")


def _first_line(text):
    """Return the part of ``text`` before its first newline (or ``text`` itself)."""
    if '\n' in text:
        return text.split('\n')[0]
    return text


# Light cleaning: keep only the first line of each candidate sentence.
for sentence_col in ('sent_1', 'sent_2'):
    df_og[sentence_col] = df_og[sentence_col].apply(_first_line)

# Work on a seeded 60% subsample of the rows.
n_sampled_rows = int(df_og.shape[0] * 0.6)
df = df_og.sample(n=n_sampled_rows, random_state=42)

# Integer class id for each possible value of the "chosen" column.
chosen_class = {"sent_1": 0, "sent_2": 1, "tie": 2}


def _to_class_id(choice):
    """Map a raw "chosen" value to its integer class id (KeyError if unknown)."""
    return chosen_class[choice]


df["chosen"] = df["chosen"].apply(_to_class_id)

class TextDataset(Dataset):
    """Pre-tokenized sentence-pair preference dataset.

    For every row of ``df`` two prompts are built,
    ``original_l1 + "+" + original_l2 + "->"`` followed by ``sent_1`` and by
    ``sent_2`` respectively. Both prompts are tokenized once, in ``__init__``,
    so ``__getitem__`` is a plain list lookup.

    Args:
        df: DataFrame with the columns ``original_l1``, ``original_l2``,
            ``sent_1``, ``sent_2`` and ``chosen``; ``chosen`` must be 0, 1 or 2.
        tokenizer: object exposing ``encode_plus(text, **kwargs)``.
        max_length: optional length forwarded to the tokenizer as
            ``max_length``. When ``None`` (the default) the argument is not
            passed at all, so ``padding="max_length"`` falls back to whatever
            maximum the tokenizer itself uses.

    Each item is the tuple ``(text1_enc, text2_enc, chosen, label)`` where
    ``label`` is the one-hot tensor for ``chosen``.
    """

    def __init__(self, df, tokenizer, max_length=None):
        self.df = df
        self.tokenizer = tokenizer
        self.tokenized_data = []
        # One-hot targets, indexed by the integer class id stored in "chosen".
        self.label_dict = {
            0: torch.tensor([1, 0, 0]),
            1: torch.tensor([0, 1, 0]),
            2: torch.tensor([0, 0, 1])
        }

        kwargs = {
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": "pt"
        }
        if max_length is not None:
            kwargs["max_length"] = max_length

        for idx in range(len(df)):
            # Fetch the row once instead of re-indexing the frame for every field.
            row = df.iloc[idx]
            prepend = row["original_l1"] + "+" + row["original_l2"] + "->"

            text1_enc = self.tokenizer.encode_plus(prepend + row["sent_1"], **kwargs)
            text2_enc = self.tokenizer.encode_plus(prepend + row["sent_2"], **kwargs)

            chosen = row["chosen"]
            label = self.label_dict[chosen]

            self.tokenized_data.append((text1_enc, text2_enc, chosen, label))

    def __len__(self):
        """Number of pre-tokenized examples (one per row of ``df``)."""
        return len(self.tokenized_data)

    def __getitem__(self, item):
        """Return the cached ``(text1_enc, text2_enc, chosen, label)`` tuple."""
        return self.tokenized_data[item]

In [3]:
class DecisionModel(nn.Module):
    """Scores two encoded sentences and classifies which one is preferred.

    A shared two-layer head (``fc1`` -> GELU -> ``fc2``) turns each input
    vector into a single scalar. The two scalars are concatenated, passed
    through ReLU and mapped by ``reward_to_class`` to class log-probabilities.

    Args:
        hidden_size: size of each input vector (default 768).
        reward_hidden: width of the hidden layer of the scoring head (default 64).
        num_classes: number of output classes (default 3).

    The defaults reproduce the original fixed architecture, and the layer
    names are unchanged, so existing state dicts still load.
    """

    def __init__(self, hidden_size=768, reward_hidden=64, num_classes=3):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, reward_hidden)
        self.fc2 = nn.Linear(reward_hidden, 1)
        self.reward_to_class = nn.Linear(2, num_classes)

    def _reward(self, x):
        """Map a ``(batch, hidden_size)`` tensor to a ``(batch, 1)`` score."""
        return self.fc2(F.gelu(self.fc1(x)))

    def forward(self, x1, x2):
        """Return ``(batch, num_classes)`` log-probabilities for the pair.

        Args:
            x1: ``(batch, hidden_size)`` encoding of the first sentence.
            x2: ``(batch, hidden_size)`` encoding of the second sentence.
        """
        rewards = torch.cat((self._reward(x1), self._reward(x2)), dim=1)
        logits = self.reward_to_class(F.relu(rewards))
        return F.log_softmax(logits, dim=1)


class RewardModel(nn.Module):
    """Encodes both sentences of a pair and feeds them to a decision head.

    Args:
        enc_model: encoder called as ``enc_model(input_ids=..., attention_mask=...,
            output_hidden_states=True)``; its result must expose ``hidden_states``.
        decision: module called as ``decision(enc1, enc2)``.
        device: device the inputs and the label are moved to before use.
    """

    def __init__(self, enc_model, decision, device):
        super().__init__()
        self.enc_model = enc_model
        self.decision = decision
        self.device = device

    def _encode(self, enc):
        """Return the final-layer hidden state at the last sequence position.

        ``enc`` is a mapping with ``input_ids`` and ``attention_mask`` whose
        dim 1 has size 1 (it is squeezed away before the encoder call).
        """
        input_ids = enc['input_ids'].squeeze(dim=1).to(self.device)
        attention_mask = enc['attention_mask'].squeeze(dim=1).to(self.device)
        encoded = self.enc_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # NOTE(review): position -1 is the last *position*, not the last real
        # token. If the inputs are right-padded to a fixed length this reads the
        # hidden state of a padding position. Left as-is because checkpoints
        # trained with this pooling depend on it -- confirm before changing.
        return encoded.hidden_states[-1][:, -1, :]

    def forward(self, x):
        """Run the pair through the encoder and the decision head.

        Args:
            x: sequence whose first two items are the encodings of the two
                sentences and whose last item is the label tensor.

        Returns:
            ``(output, label)``: the decision head's output and the label
            moved to ``self.device``.
        """
        first = self._encode(x[0])
        second = self._encode(x[1])

        output = self.decision(first, second)
        label = x[-1].to(self.device)

        return output, label

    
MODEL = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Reuse the end-of-sequence token for padding so fixed-length batches can be built.
tokenizer.pad_token = tokenizer.eos_token

# Load the encoder from the same name as the tokenizer so the two cannot drift apart.
enc_model = AutoModel.from_pretrained(MODEL)

# Assemble encoder + decision head and move the whole model to the target device.
dm = DecisionModel()
rm = RewardModel(enc_model, dm, device)
rm.to(device)

print("Not anymore")

Not anymore


### Test Set Accuracy

In [4]:
MODEL = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token

# Seeded 80/20 split; only the held-out 20% is tokenized here.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_dataset = TextDataset(test_df, tokenizer)
# Evaluation gains nothing from shuffling, and a fixed order keeps the collected
# predictions aligned with the rows of test_df.
test_dataloader = DataLoader(test_dataset, batch_size=6, shuffle=False)

test_dataloader.dataset.df.shape

(7514, 7)

In [5]:
# rm.load_state_dict(torch.load("models_w_tie/gpt2_rm_w_tie_40_4.pth"))
# map_location lets a checkpoint saved on another device load onto `device` (CPU included).
checkpoint = torch.load("ckpts_nl/checkpoint_gpt2_60_5.pth", map_location=device)
rm.load_state_dict(checkpoint['model_state_dict'])

rm.eval()

all_preds = []
true_labels = []
chosen = []
pred_model_probs = []

# Inference only: without no_grad every forward pass would keep an autograd graph alive.
with torch.no_grad():
    for batch in test_dataloader:
        chosen += batch[2].tolist()
        out, label = rm(batch)
        probs = F.softmax(out, dim=1)
        pred_model_probs += probs.to("cpu").tolist()
        # Store plain Python ints, like true_labels, instead of 0-d device tensors.
        all_preds += probs.argmax(dim=1).to("cpu").tolist()
        true_labels += label.argmax(dim=1).to("cpu").tolist()

In [6]:
# Accuracy: number of predictions equal to the true class, divided by the number of predictions.
n_correct = (torch.tensor(all_preds) == torch.tensor(true_labels)).sum()
n_correct / len(all_preds)

tensor(0.5474)

### Train Set Accuracy

In [5]:
# Rebuild the tokenizer so this section can be run without the test-set cells.
MODEL = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token

# Same seed and proportions as the test-set section, hence the same train/test partition of df.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Seed the 25% subsample and keep the loader order fixed, so the train accuracy
# reported below is the same on every run.
train_dataset = TextDataset(train_df.sample(frac=0.25, random_state=42), tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=6, shuffle=False)

In [12]:
# map_location lets a checkpoint saved on another device load onto `device` (CPU included).
checkpoint = torch.load("ckpts_nl/checkpoint_gpt2_60_5.pth", map_location=device)
rm.load_state_dict(checkpoint['model_state_dict'])
# Switch to eval mode here too, so this cell does not depend on the test-set cell having run.
rm.eval()

all_preds = []
true_labels = []
chosen = []
pred_model_probs = []

# Index of the last batch to evaluate; the loop stops after processing it.
LAST_TRAIN_BATCH_ID = 300

# Inference only: without no_grad every forward pass would keep an autograd graph alive.
with torch.no_grad():
    for batch_id, batch in enumerate(train_dataloader):
        chosen += batch[2].tolist()
        out, label = rm(batch)
        probs = F.softmax(out, dim=1)
        pred_model_probs += probs.to("cpu").tolist()
        # Store plain Python ints, like true_labels, instead of 0-d device tensors.
        all_preds += probs.argmax(dim=1).to("cpu").tolist()
        true_labels += label.argmax(dim=1).to("cpu").tolist()

        if batch_id == LAST_TRAIN_BATCH_ID:
            break

(torch.tensor(all_preds) == torch.tensor(true_labels)).sum() / len(all_preds)

tensor(0.5869)