In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelWithLMHead
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import json
import random
from transformers import AutoTokenizer, GPT2DoubleHeadsModel

In [3]:
# Load the JSON file
with open('../data/finalWords.json') as f:
    words = json.load(f)

model_name = 'gpt2'
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#tokenizer.pad_token = '[PAD]'

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
# Get the current vocabulary size
original_vocab_size = len(tokenizer)
#print(tokenizer.convert_ids_to_tokens(tokenizer('bitches')['input_ids']))
#print(tokenizer.convert_ids_to_tokens(tokenizer('mission')['input_ids']))
# Add the new words to the tokenizer
new_word_list = [str(word_obj["word"]) for word_obj in words]
num_new_words = len(new_word_list)
#print(new_word_list[0])
#print(tokenizer.convert_ids_to_tokens(tokenizer((new_word_list[0]))['input_ids']))
num_added = tokenizer.add_tokens(new_word_list)
#print(tokenizer.convert_ids_to_tokens(tokenizer((new_word_list[0]))['input_ids']))
model.resize_token_embeddings(len(tokenizer))

#Compute the distribution from which we’ll sample:
params = model.state_dict()
embeddings = params['transformer.wte.weight']
pre_expansion_embeddings = embeddings[:-num_new_words,:]
mu = torch.mean(pre_expansion_embeddings, dim=0)
n = pre_expansion_embeddings.size()[0]
sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
dist = torch.distributions.multivariate_normal.MultivariateNormal(
        mu, covariance_matrix=1e-5*sigma)

#load in our new embeddings into the model:
new_embeddings = torch.stack(tuple((dist.sample() for _ in range(num_new_words))), dim=0)
embeddings[-num_new_words:,:] = new_embeddings
params['transformer.wte.weight'][-num_new_words:,:] = new_embeddings
model.load_state_dict(params)
#print(tokenizer.convert_ids_to_tokens(tokenizer("Man, this morning's calisthenics were bitches. [CLS]")['input_ids']))
#print(tokenizer.convert_ids_to_tokens(tokenizer("Man, this morning's calisthenics were mission. [CLS]")['input_ids']))


Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
# Initialize the quiz questions and answers
quiz_questions = []
quiz_answers = []

# Loop through each word in the JSON file
for word in words:
    # Get the example sentence for this word
    example = word['example']
    # Get a list of 3 random words to use as distractors
    distractors = random.sample([w['word'] for w in words if w['word'] != word['word']], 3)
    # Replace the word in the example sentence with each distractor and the correct answer
    options = [example.replace(word['word'], word_choice) + " [CLS]" for word_choice in distractors + [word['word']]]
    # Shuffle the options
    random.shuffle(options)
    # Add the question and answer to the quiz arrays
    quiz_questions.append(options)
    quiz_answers.append(options.index(example + " [CLS]"))



In [5]:

# Add a [CLS] to the vocabulary (we should train it also!)
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
# Update the model embeddings with the new vocabulary size
embedding_layer = model.resize_token_embeddings(len(tokenizer))

questions = quiz_questions
answers = quiz_answers

count = 0.0
total = 0.0
for i, choices in enumerate(questions):
    #print(choices)
    encoded_choices = [tokenizer.encode(s) for s in choices]
    cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
    #check if some choices are longer than the others
    max_len = max([len(tokens) for tokens in encoded_choices])
    check = True
    for tokens in encoded_choices:
        if len(tokens) < max_len:
            check = False
    if check:
        #print(cls_token_location)
        #print(encoded_choices)
        input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_logits = outputs.logits
        mc_logits = outputs.mc_logits
        if mc_logits.argmax().item() == answers[i]:
            count += 1
        total += 1

print('total: ', total)
print('correct: ', count)
print('accuracy: ', count/total)

total:  5713.0
correct:  2853.0
accuracy:  0.4993873621564852


Experimentation with multiple choice

In [75]:
import torch
from transformers import AutoTokenizer, GPT2DoubleHeadsModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2DoubleHeadsModel.from_pretrained("gpt2")

# Add a [CLS] to the vocabulary (we should train it also!)
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
# Update the model embeddings with the new vocabulary size
embedding_layer = model.resize_token_embeddings(len(tokenizer))

choices = ["Hello, my snake is cute [CLS]", "Hello, my frog is cute [CLS]", "Hello, my puppy is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_logits = outputs.logits
mc_logits = outputs.mc_logits
print(mc_logits.argmax().item())
print(choices[mc_logits.argmax().item()])

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0
Hello, my snake is cute [CLS]
