In [3]:
import os
import re
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer from Hugging Face Transformers library
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def paragraph_separater(teller):
    paragraphs = re.split(r'\n{2,}', teller) ### this is a paragraph seperator
    return paragraphs

def separated_text_groups(paragraphs, group_size=10):  ### group them as you wish but I advise 10
    grouped_paragraphs = []
    for i in range(0, len(paragraphs), group_size):
        group = ' '.join(paragraphs[i:i + group_size])
        grouped_paragraphs.append(group)
    return grouped_paragraphs

def dataset_loader(text_paths, tokenizer):  ###this will open your book(s) and tokenizer
    encoded_texts = []
    for text_path in text_paths:
        with open(text_path, "r", encoding="utf-8") as f:
            teller = f.read()
        paragraphs = paragraph_separater(teller)  ###tokenizers as above are up to HuggingFace Transformers Library
        for paragraph in paragraphs:
            encoded_text = tokenizer(paragraph, return_tensors="tf", max_length=128, truncation=True, padding="max_length")
            encoded_texts.append(encoded_text)
    
    options = tf.data.Options()
    options.experimental_deterministic = True
    dataset = dataset.with_options(options)


    num_examples = len(encoded_texts)

    def generator():
        for encoded_text in encoded_texts:
            yield {
                "input_ids": encoded_text["input_ids"][0],
                "attention_mask": encoded_text["attention_mask"][0],
                "labels": encoded_text["input_ids"][0],
            }

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature={
            "input_ids": tf.TensorSpec(shape=(128,), dtype=tf.int32),
            "attention_mask": tf.TensorSpec(shape=(128,), dtype=tf.int32),
            "labels": tf.TensorSpec(shape=(128,), dtype=tf.int32),
        }
    )

    options = tf.data.Options()
    options.experimental_deterministic = False
    dataset = dataset.with_options(options)

    return dataset, num_examples

# Load dataset
dir_path = "C:/Users/serda/OneDrive\Bureau\Online Education/Certification/Projet final/Projetbooks/test_clean_10books" ### or if you use google drive link to find a book in the 'clean' folder in drive
text_paths = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".txt")] ### Attention this will train all files .txt write explicit name of the book is better.
train_dataset, num_examples = dataset_loader(text_paths, tokenizer)

# Configure training
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5) # learning rate -try as you wish
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)

# Fine-tune the model
batch_size = 32 ### try from 8 -to -128 and observe the loss
num_epochs = 3 # attention this may take really long! 3 not enough but it is up to your time!
steps_per_epoch = num_examples // batch_size  ### to compute how many steps rested in following
model.fit(train_dataset.batch(batch_size), epochs=num_epochs, steps_per_epoch=steps_per_epoch)


# Save the fine-tuned model
model.save_pretrained("./gpt2_finetuned")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


UnboundLocalError: cannot access local variable 'dataset' where it is not associated with a value

In [4]:
import os
import re
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer from Hugging Face Transformers library
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def paragraph_separater(teller):
    paragraphs = re.split(r'\n{2,}', teller) ### this is a paragraph seperator
    return paragraphs

def separated_text_groups(paragraphs, group_size=10):  ### group them as you wish but I advise 10
    grouped_paragraphs = []
    for i in range(0, len(paragraphs), group_size):
        group = ' '.join(paragraphs[i:i + group_size])
        grouped_paragraphs.append(group)
    return grouped_paragraphs

def dataset_loader(text_paths, tokenizer):  ###this will open your book(s) and tokenizer
    encoded_texts = []
    for text_path in text_paths:
        with open(text_path, "r", encoding="utf-8") as f:
            teller = f.read()
        paragraphs = paragraph_separater(teller)  ###tokenizers as above are up to HuggingFace Transformers Library
        for paragraph in paragraphs:
            encoded_text = tokenizer(paragraph, return_tensors="tf", max_length=128, truncation=True, padding="max_length")
            encoded_texts.append(encoded_text)
    
    options = tf.data.Options()
    options.experimental_deterministic = True
    dataset = dataset.with_options(options)


    num_examples = len(encoded_texts)

    def generator():
        for encoded_text in encoded_texts:
            yield {
                "input_ids": encoded_text["input_ids"][0],
                "attention_mask": encoded_text["attention_mask"][0],
                "labels": encoded_text["input_ids"][0],
            }

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature={
            "input_ids": tf.TensorSpec(shape=(128,), dtype=tf.int32),
            "attention_mask": tf.TensorSpec(shape=(128,), dtype=tf.int32),
            "labels": tf.TensorSpec(shape=(128,), dtype=tf.int32),
        }
    )

    options = tf.data.Options()
    options.experimental_deterministic = False
    dataset = dataset.with_options(options)

    return dataset, num_examples

# Load dataset
dir_path = "C:/Users/serda/OneDrive\Bureau\Online Education/Certification/Projet final/Projetbooks/test_clean_10books" ### or if you use google drive link to find a book in the 'clean' folder in drive
text_paths = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith(".txt")] ### Attention this will train all files .txt write explicit name of the book is better.
train_dataset, num_examples = dataset_loader(text_paths, tokenizer)

# Configure training
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5) # learning rate -try as you wish
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)

# Fine-tune the model
batch_size = 32 ### try from 8 -to -128 and observe the loss
num_epochs = 3 # attention this may take really long! 3 not enough but it is up to your time!
steps_per_epoch = num_examples // batch_size  ### to compute how many steps rested in following
model.fit(train_dataset.batch(batch_size), epochs=num_epochs, steps_per_epoch=steps_per_epoch)


# Save the fine-tuned model
model.save_pretrained("./gpt2_finetuned")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


UnboundLocalError: cannot access local variable 'dataset' where it is not associated with a value