In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch.nn as nn
import json
from tqdm import tqdm
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, AutoModelForCausalLM, GPT2DoubleHeadsModel
from torch.nn.utils.rnn import pad_sequence

# Seed PyTorch's RNG so that any stochastic behaviour later in the notebook
# (e.g. while the model is in train mode) repeats across kernel restarts.
SEED = 42
torch.manual_seed(SEED)

# Base checkpoint name shared by the tokenizer and the causal-LM weights.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Register the two extra special tokens this notebook relies on: '[PAD]' for
# batching sequences of different length and '[MASK]' as the placeholder for
# the word to predict. The model's embedding matrix is NOT resized here; it
# must be resized before these new ids are fed to the model.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'mask_token': '[MASK]'})

1

In [2]:
# Load the word list. Every entry is read below through its "word" and
# "definition" keys. JSON is read as UTF-8 explicitly so the result does not
# depend on the platform's default encoding.
with open('../data/finalWords.json', encoding='utf-8') as f:
    words = json.load(f)

model_name = 'gpt2'

# Headwords and their definitions, coerced to str.
new_word_list = [str(word_obj["word"]) for word_obj in words]
new_def_list = [str(word_obj["definition"]) for word_obj in words]

# Candidate tokens: every headword first, then every whitespace-separated
# piece of every definition, in file order.
vocab = tokenizer.get_vocab()
candidate_tokens = list(new_word_list)
for definition in new_def_list:
    candidate_tokens.extend(definition.split())

# Keep only candidates that are not already vocabulary keys. dict.fromkeys
# drops repeats while preserving first-seen order, so each new token is
# submitted to the tokenizer exactly once.
words_to_add = [token for token in dict.fromkeys(candidate_tokens) if token not in vocab]

num_new_words = len(new_word_list)
num_added = tokenizer.add_tokens(words_to_add)

# Grow the embedding matrix to the tokenizer's new length; as the last
# expression of the cell, the resized embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))

Embedding(54473, 768)

In [3]:
# Build (word1, definition1, word2, definition2, masked_example, target_ids)
# tuples from each pair of consecutive entries in `words`.
training_data = []
for first_entry, second_entry in zip(words, words[1:]):
    word1 = first_entry['word']
    definition_str1 = first_entry['definition']
    word1_ids = tokenizer.encode(word1, add_special_tokens=False, return_tensors='pt').squeeze()

    word2 = second_entry['word']
    definition_str2 = second_entry['definition']
    word2_ids = tokenizer.encode(word2, add_special_tokens=False, return_tensors='pt').squeeze()

    # The example sentence always comes from the second entry of the pair.
    example = second_entry['example']

    # Prefer word1 as the target when it occurs in the example, otherwise
    # fall back to word2; pairs where neither occurs are skipped.
    if word1 in example:
        target_word, target_ids = word1, word1_ids
    elif word2 in example:
        target_word, target_ids = word2, word2_ids
    else:
        continue

    # Every occurrence of the target word is replaced by the placeholder.
    masked_example = example.replace(target_word, '[MASK]')
    training_data.append(
        (word1, definition_str1, word2, definition_str2, masked_example, target_ids)
    )  # recheck

print(len(training_data))
print(training_data[0])

749
('Janky', 'Undesirable; less-than optimum.', 'brutal', 'anything that makes you sweat', "Man, this morning's calisthenics were [MASK].", tensor(50261))


In [4]:
def pad_data(data):
    word1 = [x[0] for x in data]
    sent1 = [x[1] for x in data]
    word2 = [x[2] for x in data]
    sent2 = [x[3] for x in data]
    example = [x[4] for x in data]
    labels = [x[5] for x in data]

    if any(isinstance(l, list) for l in labels):
        labels = [item for sublist in labels for item in sublist]

    combined_text = []
    for w1, s1, w2, s2, ex in zip(word1, sent1, word2, sent2, example):
        combined_text.append(w1 + " " + s1 + " " + w2 + " " + s2 + " " + ex)
        
    encoding = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True)

    input_ids = torch.LongTensor(encoding['input_ids'])
    attention_mask = torch.LongTensor(encoding['attention_mask'])
    labels = torch.LongTensor(labels)
    return (input_ids, attention_mask, labels)

In [5]:
# Define the training parameters
batch_size = 5
num_epochs = 3
learning_rate = 5e-5

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Id of the '[MASK]' placeholder; looked up once because it is loop-invariant.
mask_token_id = tokenizer.convert_tokens_to_ids('[MASK]')

# Fine-tune the model on the training data
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0  # counts only batches that actually produced an optimizer step
    for i in tqdm(range(0, len(training_data), batch_size)):
        batch_data = training_data[i:i+batch_size]
        input_ids, attention_mask, labels = pad_data(batch_data)

        # Labels can only be paired with rows when there is exactly one per row.
        if labels.dim() != 1 or labels.shape[0] != input_ids.shape[0]:
            continue

        # The logit at position t scores the token at position t+1, so the
        # [MASK] indicator is shifted one step left (first column dropped) to
        # line up with logits[:, :-1].
        mask_positions = (input_ids == mask_token_id)[:, 1:]

        # Keep only rows that contain exactly one [MASK]. Rows with zero or
        # several placeholders cannot be matched to a single label; filtering
        # per row (instead of discarding the whole batch on a fixed count)
        # also lets the final, shorter batch contribute.
        usable_rows = mask_positions.sum(dim=1) == 1
        if not usable_rows.any():
            continue

        optimizer.zero_grad()

        # Forward pass on the usable rows; the last position is sliced off
        # because it has no next token to predict.
        logits = model(
            input_ids[usable_rows],
            attention_mask=attention_mask[usable_rows],
        ).logits[:, :-1, :]

        # Boolean indexing yields one vocabulary-sized row per usable input
        # row, in row order, so it aligns with labels[usable_rows].
        mask_logits = logits[mask_positions[usable_rows]]

        loss = criterion(mask_logits, labels[usable_rows])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Accumulate only losses that were actually computed for this batch.
        epoch_loss += loss.item()
        num_batches += 1
    # max(..., 1) avoids a ZeroDivisionError when every batch was skipped.
    print("Epoch {} loss: {:.4f}".format(epoch+1, epoch_loss / max(num_batches, 1)))

  7%|▋         | 11/150 [00:08<01:43,  1.35it/s]


KeyboardInterrupt: 

In [36]:
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

# Locations of the fine-tuned weights and of the extended tokenizer.
model_path = "../models/paradigm-model"
tokenizer_path = "../models/paradigm-token"

# Persist the current `model` and `tokenizer` objects to disk.
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

# Reload both from disk. The weights are loaded into GPT2DoubleHeadsModel;
# per the warning this cell prints, its multiple_choice_head parameters are
# not in the saved checkpoint and are therefore newly initialized.
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
model = GPT2DoubleHeadsModel.from_pretrained(model_path)

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at ../models/paradigm-model and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Build two-option multiple-choice items from consecutive entries of `words`.
# Option 0 pairs the first word and its definition with the second entry's
# example; option 1 pairs the second word and its definition with that same
# example. Index 1 is always recorded as the answer.
questions = []
answers = []
for first_entry, second_entry in zip(words, words[1:]):
    word1 = first_entry['word']
    definition_str1 = first_entry['definition']

    word2 = second_entry['word']
    definition_str2 = second_entry['definition']
    example = second_entry['example']

    # In the distractor's copy of the example, any occurrence of word1 is
    # replaced by word2.
    masked_example = example.replace(word1, word2)

    # Each option ends with the literal " [CLS]" marker text.
    options = [
        word1 + " " + definition_str1 + " " + masked_example + " [CLS]",
        word2 + " " + definition_str2 + " " + example + " [CLS]",
    ]
    questions.append(options)
    answers.append(1)
print(questions[2])



['brutal anything that makes you sweat That skanky ho-bag wants to borrow your homework. [CLS]', 'skanky Anything of or pertaining to a $10,000 hooker. That skanky ho-bag wants to borrow your homework. [CLS]']


In [None]:
# Register "[CLS]" as the tokenizer's cls_token; the call's return value is
# kept in num_added_tokens.
# NOTE(review): the original author flagged that this token should also be
# trained — nothing in this cell does so.
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
# Resize the model's token embeddings to the tokenizer's current length so the
# new token id has an embedding row; the call's return value is kept in
# embedding_layer.
embedding_layer = model.resize_token_embeddings(len(tokenizer))

In [None]:
# Multiple-choice evaluation: for every question, score both options with the
# model's multiple-choice head and report the fraction of questions for which
# the head picks option 0.
count0 = 0.0
total = 0.0

# Filler id used to right-pad the shorter option so both fit in one tensor.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

# Inference only: disable dropout and skip gradient bookkeeping.
model.eval()
with torch.no_grad():
    for choices in questions:
        encoded_choices = [tokenizer.encode(s) for s in choices]
        # Index of the first [CLS] token in each option; the multiple-choice
        # head reads the hidden state at this position.
        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        # Options usually differ in length. Instead of skipping those
        # questions, pad on the right: GPT-2 attention is causal, so tokens
        # placed after [CLS] cannot influence the hidden state read at [CLS].
        max_len = max(len(tokens) for tokens in encoded_choices)
        padded_choices = [tokens + [pad_id] * (max_len - len(tokens)) for tokens in encoded_choices]

        input_ids = torch.tensor(padded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)

        if outputs.mc_logits.argmax().item() == 0:
            count0 += 1
        total += 1

# Guard against an empty question list instead of raising ZeroDivisionError.
print(count0 / total if total else float('nan'))

print(len(questions))