<a href="https://colab.research.google.com/github/shahabday/BAMline4CT/blob/main/DSR_41_HuggingFace_GPT2_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation with Hugging Face.

# Part 1: Dealing with the dataset.

---

## Check if a GPU is available.

If not, activate it in Runtime -> Change Runtime Type.

In [None]:
!pip install -U transformers[torch] datasets accelerate

In [None]:
import tensorflow as tf
import glob
import os
import shutil
import tqdm
import random
import matplotlib.pyplot as plt
import torch
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

# Everything below runs on PyTorch, so ask PyTorch (not TensorFlow) whether it
# can see a CUDA GPU. Displays True when a GPU is usable, False otherwise.
torch.cuda.is_available()

## Load the dataset. First from the internet. Then from the hard drive.


In [None]:
dataset_file = "dataset.txt"

# How many files to load.
file_number = 100

# Clone the repo.
!git clone https://github.com/vilmibm/lovecraftcorpus

# Find all the files.
paths = glob.glob("lovecraftcorpus/*.txt")

# Do not use all.
paths = paths[:file_number]
print(sorted(paths))

# Merge.
with open(dataset_file, "w") as output_file:
    for path in paths:
        for line in open(path, "r"):
            for split in line.split("\n"):
                split = split.strip()
                if split != "":
                    print(split, file=output_file)

# Delete repo.
!rm -rf lovecraftcorpus

# Done.
print("Corpus downloaded.")

In [None]:
# Load the merged corpus file through the "text" loader of `datasets`.
# The cells below read it as raw_datasets["train"][i]["text"].
raw_datasets = load_dataset("text", data_files=[dataset_file])
# Last expression of the cell: displays a summary of the loaded dataset.
raw_datasets

Let us look at the first few examples.

In [None]:
# Print the first lines of the corpus as raw text. Slicing returns a dict of
# columns and, unlike indexing row by row, does not fail when the dataset
# holds fewer than ten rows.
for text in raw_datasets["train"][:10]["text"]:
    print(text)

# Part 2: Training GPT-2.

---

## Train the tokenizer.

In [None]:
# Untrained byte-pair-encoding tokenizer that splits on whitespace first.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Named `bpe_trainer` so it is not confused with the Hugging Face `Trainer`
# instance that is created further down under the name `trainer`.
bpe_trainer = BpeTrainer(vocab_size=5000, special_tokens=["[UNK]", "[PAD]"])

def batch_iterator(batch_size=1000):
    """Yield the training corpus as lists of raw text lines.

    Args:
        batch_size: Maximum number of lines per yielded list.

    Yields:
        Lists of strings taken from the "text" column of the train split.
    """
    for i in range(0, len(raw_datasets["train"]), batch_size):
        yield raw_datasets["train"][i : i + batch_size]["text"]

tokenizer.train_from_iterator(batch_iterator(), trainer=bpe_trainer, length=len(raw_datasets["train"]))
tokenizer.save("tokenizer.json")

# Wrap the trained tokenizer for use with transformers. The special tokens
# must be named explicitly, otherwise the wrapper does not know which entries
# of the vocabulary act as unknown and padding token.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
)

In [None]:
# Display the vocabulary size reported by the tokenizer built in the cell above.
tokenizer.vocab_size

In [None]:
# Show only the first entries (ordered by token id) instead of dumping the
# whole vocabulary into the notebook output.
sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])[:50]

In [None]:
from transformers import GPT2Tokenizer

# Swap in the pretrained "gpt2" tokenizer. From here on `tokenizer` no longer
# refers to the tokenizer that was trained in the cells above.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as padding token; the tokenization step
# further down requests padding and therefore needs one.
tokenizer.pad_token = tokenizer.eos_token

## Tokenize some samples.

Tokenize a single sample and inspect the resulting token ids and decoded tokens.

In [None]:
# Pick one line of the corpus and show it as raw text.
token_sequence = raw_datasets["train"][3]["text"]
print(token_sequence)

# Encode the text into vocabulary indices.
encoding = tokenizer(token_sequence)
indices = encoding["input_ids"]
print(indices)

# Decode every index on its own to see how the text was cut into tokens.
tokens = [tokenizer.decode([token_id]) for token_id in indices]
print(tokens)
print(len(tokens))

In [None]:
# Count the tokens of every line to see how long the training samples are.
lengths = [
    len(tokenizer(sample["text"])["input_ids"])
    for sample in tqdm.tqdm(raw_datasets["train"])
]

plt.hist(lengths, bins=50)
plt.title("Tokens per line")
plt.xlabel("Number of tokens")
plt.ylabel("Number of lines")
plt.show()

## Train the model.

Prepare the tokenization function.

In [None]:
sequence_length = 256

def tokenize_function(example):
    """Turn raw text into token ids, truncated to `sequence_length` tokens.

    No padding is applied here on purpose. Padding inside a batched `map`
    would pad every row to the longest row of its map chunk, and because only
    "input_ids" is returned the matching attention mask would be lost. The
    data collator pads each training batch to its own longest sequence and
    builds the attention mask at that point.

    Args:
        example: A single row or a batch of rows with a "text" entry.

    Returns:
        A dict with the single key "input_ids".
    """
    tokenized_example = tokenizer(
        example["text"],
        truncation=True,
        max_length=sequence_length,
    )
    return {
        "input_ids": tokenized_example["input_ids"]
    }

# Check a sample.
token_sequence = raw_datasets["train"][0]
print(token_sequence)
tokenized = tokenize_function(token_sequence)
assert list(tokenized.keys()) == ["input_ids"], list(tokenized.keys())
print(tokenized)

Create the tokenized dataset.

In [None]:
# Tokenize the whole dataset in batches and drop the original columns so that
# only the output of tokenize_function remains.
columns_to_drop = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=columns_to_drop,
)

# Check a sample.
tokenized = tokenized_datasets["train"][0]
tokenized_keys = list(tokenized.keys())
assert tokenized_keys == ["input_ids"], tokenized_keys
print(tokenized)

Instantiate a data collator.

In [None]:
# mlm=False turns masked-language-modeling off. The collated batches are
# checked further below to carry "input_ids", "attention_mask" and "labels".
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Create the model.

In [None]:
# Start from the pretrained "gpt2" checkpoint; this is the model that gets
# fine-tuned below.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Last expression of the cell: displays the module structure.
model

## Test the data collator and the model.

In [None]:
# Collate one tokenized sample into a batch of size one.
inputs = [tokenized_datasets["train"][2]]
inputs = data_collator(inputs)
assert list(inputs.keys()) == ["input_ids", "attention_mask", "labels"], list(inputs.keys())
print("input_ids:", inputs["input_ids"])
print("")

# Run the batch on whatever device the model currently lives on, so the cell
# also works when it is re-run after training has moved the model to the GPU.
# No gradients are needed for this inspection.
with torch.no_grad():
    outputs = model(**inputs.to(model.device))
assert list(outputs.keys()) == ["loss", "logits", "past_key_values"], list(outputs.keys())
print("logits:", outputs["logits"])

# Raw scores over the vocabulary for the first position of the first sample.
plt.plot(outputs["logits"].cpu().detach().numpy()[0][0])
plt.title("Logits")
plt.show()
plt.close()

# The same scores turned into a probability distribution.
activations = torch.nn.functional.softmax(outputs["logits"], dim=-1)
plt.plot(activations.cpu().detach().numpy()[0][0])
plt.title("Activations")
plt.show()
plt.close()

---

## Run the training.


In [None]:
# Directory handed to the Trainer as output_dir and reused below for saving
# the tokenizer and the model. (No timestamp is added to the name.)
output_path = "output"

# Create the trainer.
print("Creating trainer...")
training_args = TrainingArguments(
    output_dir=output_path,
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=72,
    prediction_loss_only=False,
    # Disabled option, kept for reference:
    #report_to="none"
)
# NOTE: from here on `trainer` is the Hugging Face Trainer; the name was used
# for the BPE tokenizer trainer earlier in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

# Train the model.
trainer.train()

# Save the tokenizer next to the model so both can be reloaded from one place.
tokenizer.save_pretrained(output_path)

# Save the model.
model.save_pretrained(output_path)

In [None]:
# Repeat the sanity check on the same sample, now with the fine-tuned model.
inputs = [tokenized_datasets["train"][2]]
inputs = data_collator(inputs)
assert list(inputs.keys()) == ["input_ids", "attention_mask", "labels"], list(inputs.keys())
print("input_ids:", inputs["input_ids"])
print("")

# After trainer.train() the model may still be in training mode (dropout
# active), so switch to evaluation mode for inference. Follow the model's
# device instead of hard-coding "cuda:0" so this also runs on CPU-only runtimes.
model.eval()
with torch.no_grad():
    outputs = model(**inputs.to(model.device))
assert list(outputs.keys()) == ["loss", "logits", "past_key_values"], list(outputs.keys())
print("logits:", outputs["logits"])

# Raw scores over the vocabulary for the first position of the first sample.
plt.plot(outputs["logits"].cpu().detach().numpy()[0][0])
plt.title("Logits")
plt.show()
plt.close()

# The same scores turned into a probability distribution.
activations = torch.nn.functional.softmax(outputs["logits"], dim=-1)
plt.plot(activations.cpu().detach().numpy()[0][0])
plt.title("Activations")
plt.show()
plt.close()

## How to generate.

In [None]:
# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Inference only: make sure dropout is switched off.
model.eval()

# Encode the conditioning tokens. Calling the tokenizer (instead of .encode)
# also returns the attention mask that generate() expects.
prompt = "The most merciful thing in the world, I think, is the inability of the human mind to correlate all its contents."
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = encoded_prompt["input_ids"]
print(input_ids)

# Generate more tokens. max_length counts the prompt tokens as well.
generated_ids = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    max_length=100,
    do_sample=True,
    temperature=0.5,
    # Set explicitly so generate() does not have to guess a padding token.
    pad_token_id=tokenizer.eos_token_id,
)
generated_sequence = tokenizer.decode(generated_ids[0], clean_up_tokenization_spaces=True)
print(generated_sequence)

# Thank you!