In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader

#instantiate model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



initialize new words

In [12]:
#add tokens 

# Load the JSON file containing the new words and their definitions
with open("data/finalWords.json", "r") as f:
    new_words = json.load(f)

# Get the current vocabulary size
original_vocab_size = len(tokenizer)

# Add the new words to the tokenizer
new_word_list = [word_obj["word"] for word_obj in new_words]
num_added = tokenizer.add_tokens(new_word_list)
model.resize_token_embeddings(len(tokenizer))

# Compute mean and variance of existing embeddings
embedding_weights = model.transformer.wte.weight
mean = torch.mean(embedding_weights, dim=0)
variance = torch.mean((embedding_weights - mean) ** 2, dim=0)

# Generate embeddings for new words
num_new_words = len(new_word_list)
new_embeddings = torch.empty((num_new_words, embedding_weights.size()[1]), requires_grad=True)
for i in range(num_new_words):
    new_embeddings[i] = torch.from_numpy(
        np.random.normal(mean.detach().numpy(), np.sqrt(variance.detach().numpy()), size=(embedding_weights.size()[1],))
    )

# Add new embeddings to embedding matrix
model.resize_token_embeddings(len(tokenizer))
embedding_weights = model.transformer.wte.weight.detach().clone()
embedding_weights[-num_new_words:] = new_embeddings
model.transformer.wte.weight.data = embedding_weights


RuntimeError: a view of a leaf Variable that requires grad is being used in an in-place operation.

In [None]:
#Compute the distribution from which we’ll sample:
params = model.state_dict()
embeddings = params['transformer.wte.weight']
pre_expansion_embeddings = embeddings[:-(len(tokenizer)-original_vocab_size),:]
mu = torch.mean(pre_expansion_embeddings, dim=0)
n = pre_expansion_embeddings.size()[0]
sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
dist = torch.distributions.multivariate_normal.MultivariateNormal(
        mu, covariance_matrix=1e-5*sigma)


#load in our new embeddings into the model:
new_embeddings = torch.stack(tuple((dist.sample() for _ in range(3))), dim=0)
embeddings[-3:,:] = new_embeddings
params['transformer.wte.weight'][-3:,:] = new_embeddings
model.load_state_dict(params)

In [None]:
#load the corpus
with open('definitions.txt', 'r') as f:
    corpus = f.read()
    
#tokenize the corpus
tokenized_corpus = tokenizer.encode(corpus)

In [None]:
#split the tokenized corpus into chunks of tokens
max_length = 128
stride = 64

inputs = []
labels = []
for i in range(0, len(tokenized_corpus) - max_length, stride):
    inputs.append(tokenized_corpus[i:i+max_length])
    labels.append(tokenized_corpus[i+1:i+1+max_length])

#convert the inputs and labels into tensors
inputs = torch.tensor(inputs)
labels = torch.tensor(labels)

In [1]:
#fine-tune the model on custom corpus
class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

batch_size = 16
num_epochs = 10
learning_rate = 2e-5

dataset = CustomDataset(inputs, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, labels=labels)
        loss = loss_fn(outputs.logits.view(-1, outputs.logits.shape[-1]), labels.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch {} loss: {:.4f}'.format(epoch+1, running_loss/len(dataloader)))

NameError: name 'Dataset' is not defined

In [None]:
model.save_pretrained('fine-tuned-gpt2')
tokenizer.save_pretrained('fine-tuned-gpt2')