In [25]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import json
import numpy as np
import torch
from transformers import AdamW
from torch.utils.data import Dataset, DataLoader

In [26]:
#instantiate model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = '[PAD]'
model = GPT2LMHeadModel.from_pretrained(model_name)

initialize new words

In [34]:
#add tokens 

# Load the JSON file containing the new words and their definitions
with open("data/finalWords.json", "r") as f:
    new_words = json.load(f)

# Get the current vocabulary size
original_vocab_size = len(tokenizer)

# Add the new words to the tokenizer
new_word_list = [word_obj["word"] for word_obj in new_words]
num_new_words = len(new_word_list)
num_added = tokenizer.add_tokens(new_word_list)
model.resize_token_embeddings(len(tokenizer))

#Compute the distribution from which we’ll sample:
params = model.state_dict()
embeddings = params['transformer.wte.weight']
pre_expansion_embeddings = embeddings[:-num_new_words,:]
mu = torch.mean(pre_expansion_embeddings, dim=0)
n = pre_expansion_embeddings.size()[0]
sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
dist = torch.distributions.multivariate_normal.MultivariateNormal(
        mu, covariance_matrix=1e-5*sigma)

#load in our new embeddings into the model:
new_embeddings = torch.stack(tuple((dist.sample() for _ in range(num_new_words))), dim=0)
embeddings[-num_new_words:,:] = new_embeddings
params['transformer.wte.weight'][-num_new_words:,:] = new_embeddings
model.load_state_dict(params)


<All keys matched successfully>

In [35]:
# Define a custom dataset to load the new words and their definitions
class NewWordsDataset(Dataset):
    def __init__(self, new_words):
        self.new_words = new_words
    
    def __len__(self):
        return len(self.new_words)
    
    def __getitem__(self, idx):
        item = self.new_words[idx]
        return item['definition'], item['word']

# Load the JSON file containing the new words and their definitions
with open("data/finalWords.json", "r") as f:
    new_words = json.load(f)

# Define a DataLoader for the new words dataset
new_words_dataset = NewWordsDataset(new_words)
new_words_dataloader = DataLoader(new_words_dataset, batch_size=4, shuffle=True)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [23]:
# Freeze all layers except for the classification layer
for name, param in model.named_parameters():
    if 'transformer' in name:
        param.requires_grad = False

# Define a new classification layer for the new words
num_new_words = len(new_word_list)
new_word_embeddings = model.transformer.wte.weight[-num_new_words:]
classification_layer = torch.nn.Linear(model.config.hidden_size, num_new_words)

# Define the optimizer and loss function
optimizer = AdamW(classification_layer.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

model.to(device)
classification_layer.to(device)



Linear(in_features=768, out_features=100, bias=True)

In [33]:
# Define the training loop
# Train the model
for epoch in range(10):
    model.train()
    train_loss = 0
    for batch_idx, (definitions, words) in enumerate(new_words_dataloader):
        definitions = list(definitions)
        words = list(words)

        # Tokenize the input and output sequences
        input_ids = []
        attention_masks = []
        output_ids = []
        for i in range(len(definitions)):
            inputs = tokenizer.encode_plus(definitions[i], add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
            input_ids.append(inputs['input_ids'])
            attention_masks.append(inputs['attention_mask'])
            
            outputs = tokenizer.encode(words[i], add_special_tokens=False, return_tensors='pt')
            output_ids.append(outputs)
            
        # Convert lists to tensors
        input_ids = torch.stack(input_ids, dim=0).to(device)
        attention_masks = torch.stack(attention_masks, dim=0).to(device)
        output_ids = torch.stack(output_ids, dim=0).to(device)
        
        # Compute the model's logits and loss
        logits = model(input_ids=input_ids, attention_mask=attention_masks, labels=output_ids).logits
        loss = loss_fn(logits.view(-1, logits.size(-1)), output_ids.view(-1))
        
        # Backward propagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        
    print('Epoch: {}, Training Loss: {:.3f}'.format(epoch+1, train_loss / len(new_words_dataset)))


ValueError: Expected input batch_size (2044) to match target batch_size (0).

In [None]:
model.save_pretrained('fine-tuned-gpt2')
tokenizer.save_pretrained('fine-tuned-gpt2')