In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
import torch
import transformers
import datasets
from IPython.display import Markdown, display
import re
from sklearn.metrics.pairwise import cosine_similarity
import logging

# Raise the "transformers" logger to ERROR so lower-severity messages from
# that logger are not emitted.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Switch on PyTorch's cuDNN benchmark flag for the whole session.
torch.backends.cudnn.benchmark = True

In [3]:
# Fetch the raw WikiText-2 configuration of the "wikitext" dataset.
dataset = datasets.load_dataset(path='wikitext', name='wikitext-2-raw-v1')

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Load the tokenizer that ships with the GPT-Neo checkpoint, plus a ready-made
# text-generation pipeline for the same checkpoint.
tokenizer = transformers.GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B')
# Pad with the existing end-of-text token. Registering a brand-new '[PAD]'
# token would append an id past the end of the vocabulary, and nothing in this
# notebook resizes a model's embedding table to make room for it.
tokenizer.pad_token = tokenizer.eos_token
generator = transformers.pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

In [None]:
def tokenize_function(example):
    """Tokenize raw text into fixed-length features for causal-LM training.

    Uses the notebook-level ``tokenizer`` to truncate/pad every text to 150
    tokens, and adds a ``labels`` column so a Trainer has a target to compute
    the language-modelling loss against.

    Args:
        example: Mapping with a ``'text'`` entry holding either one string or
            a list of strings (the batched form used by ``Dataset.map``).

    Returns:
        Dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        ``labels`` mirrors ``input_ids`` except that positions whose attention
        mask is 0 (padding) are set to -100, the index the loss skips.
    """
    def _mask_padding(ids, mask):
        # Keep real tokens as targets; blank out padded positions.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    tokenized = tokenizer(example['text'], truncation=True, max_length=150, padding='max_length')
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    if isinstance(example['text'], str):
        # Single example: the tokenizer returns flat lists.
        labels = _mask_padding(input_ids, attention_mask)
    else:
        # Batched call: one list per row.
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Drop rows whose text is empty or whitespace-only before tokenizing: such
# rows would turn into pure-padding examples that carry nothing to learn from.
non_empty_dataset = dataset.filter(lambda row: bool(row['text'].strip()))
tokenized_dataset = non_empty_dataset.map(tokenize_function, batched=True)

In [None]:
training_args = transformers.TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log, evaluate and checkpoint on the same 1000-step cadence.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` — confirm against the
    # installed version.
    logging_steps=1000,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_steps=1000,
)

In [None]:
# Define the model configuration
model_config = transformers.GPT2Config.from_pretrained(
    'EleutherAI/gpt-neo-1.3B', output_hidden_states=True)

In [None]:
# Instantiate the GPT-2 model and trainer
model = transformers.GPT2LMHeadModel(config=model_config)
trainer = transformers.Trainer(model=model, args=training_args,
                               train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['validation'])

In [None]:
# Run the Trainer: trains `model` on the tokenized train split using the
# settings in `training_args`.
trainer.train()

In [5]:
# Generate output

# Sampling is random: fix the seed first so that re-running the notebook on
# the same setup yields the same generated text.
transformers.set_seed(42)

prompt = "How are clouds formed?"
response = generator(prompt, max_length=250, do_sample=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'How are clouds formed?\n\nWhat is the relationship between raindrop fall and rainfall?\n\nHow quickly and easily can the atmosphere be moistened after the cloud forms?\n\nHow much does a cloud reflect and absorb light, and what is the amount of total absorption and reflection that occurs?\n\nThere is a lot of research done on the topic of cloud microphysics, a topic that can be a real headache for everyone, from climate scientists to atmospheric scientists. I’m not going to talk about this topic since the topic is well-known, but I am going to give some basic information on the topic.\n\nWhat is Cloud Microphysics?\n\nSince raindrops form rain drops there are a number of questions that can be answered:\n\nWhere do raindrops form?\n\nHow do these raindrops fall?\n\nWhen do raindrops form?\n\nHow fast can they fall?\n\nWhat is the relationship between raindrop fall and'}]

In [46]:
# Text produced by the generation pipeline for the prompt.
response1 = response[0]["generated_text"]

# Hand-written answer to the same prompt, used as the comparison text.
response2 = (
    "Clouds form when the invisible water vapor in the air condenses into "
    "visible water droplets or ice crystals. "
    "For this to happen, the parcel of air must be saturated, i.e. unable to "
    "hold all the water it contains in vapor form, so it starts to condense "
    "into a liquid or solid form."
)

In [51]:
def highlight_errors(prompt_tokens, response_tokens):
    """Render ``response_tokens`` with everything after the shared prefix marked.

    The two token sequences are compared position by position from the start.
    Response tokens inside the common leading run are emitted as ``token + ' '``;
    every response token from the first divergence onwards is emitted as
    ``' [[token]] '``.

    Args:
        prompt_tokens: Sequence of string tokens used as the reference.
        response_tokens: Sequence of string tokens to render.

    Returns:
        The concatenated rendering of ``response_tokens`` (empty string when
        ``response_tokens`` is empty).
    """
    # Count how many leading positions hold the same token in both sequences.
    shared_prefix = 0
    for expected, actual in zip(prompt_tokens, response_tokens):
        if expected != actual:
            break
        shared_prefix += 1

    # Plain rendering inside the shared prefix, bracketed rendering after it.
    pieces = [
        token + ' ' if position < shared_prefix else " [[" + token + "]] "
        for position, token in enumerate(response_tokens)
    ]
    return ''.join(pieces)

In [56]:
def identify_improvements(response_1, response_2, similarity_threshold=0.7):
    """Score how close two texts are and locate where the second departs from the first.

    Both texts are encoded with ``bert-base-cased``; each embedding is the mean
    over dim 1 of the model's first output, and the two embeddings are compared
    with cosine similarity. Separately, both texts are split into word pieces
    and compared position by position from the start.

    Args:
        response_1: Reference text (a single string).
        response_2: Text compared against the reference (a single string).
        similarity_threshold: Not used inside this function; kept so existing
            callers that pass it keep working.

    Returns:
        Tuple ``(similarity, error_part, highlighted_response)``:
        the cosine similarity of the two embeddings, the space-joined word
        pieces of ``response_2`` from the first mismatching position onwards,
        and the result of ``highlight_errors`` on the two word-piece lists.
    """
    bert_tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
    bert_model = transformers.AutoModel.from_pretrained("bert-base-cased")

    encoded_1 = bert_tokenizer(
        response_1, return_tensors='pt', padding=True, truncation=True)
    encoded_2 = bert_tokenizer(
        response_2, return_tensors='pt', padding=True, truncation=True)

    # Inference only: skip autograd bookkeeping, and pass the whole encoding so
    # the attention mask travels with the input ids.
    with torch.no_grad():
        embedding_1 = bert_model(**encoded_1)[0].mean(dim=1)
        embedding_2 = bert_model(**encoded_2)[0].mean(dim=1)

    similarity = cosine_similarity(
        embedding_1.numpy(), embedding_2.numpy())[0][0]

    tokens_1 = bert_tokenizer.tokenize(response_1)
    tokens_2 = bert_tokenizer.tokenize(response_2)

    # Length of the leading run where both token lists agree.
    error_index = 0
    for token_1, token_2 in zip(tokens_1, tokens_2):
        if token_1 != token_2:
            break
        error_index += 1
    error_part = " ".join(tokens_2[error_index:])

    highlighted_response = highlight_errors(tokens_1, tokens_2)

    return similarity, error_part, highlighted_response

In [60]:
# Similarity score at or above which the generated text is reported as
# answering the prompt.
SIMILARITY_THRESHOLD = 0.7

# response2 (the hand-written answer) is the reference; response1 (the
# generated text) is what gets scored against it.
similarity, error, highlight_response = identify_improvements(
    response2, response1)

if similarity >= SIMILARITY_THRESHOLD:
    print(f"\nThe response answers the prompt with a score of {similarity:.4f}.")
else:
    print(f"\nThe response does not answer the prompt well (score {similarity:.4f}).")


The response answers the prompt with a score of %s. 0.8818619
