In [4]:
import os
import re
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

# Sentence boundary: a single whitespace character that directly follows
# ".", "?" or "!", unless the preceding text looks like a dotted abbreviation
# ("e.g.", "i.e.") or a capitalised two-letter title ("Mr.", "Dr.").
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')


def sentence_tokenizer(text):
    """Split *text* into sentences.

    Args:
        text: The raw text to split.

    Returns:
        A list of sentence strings in their original order. Fragments that
        are empty or consist only of whitespace (e.g. produced by trailing
        whitespace after the final sentence, or by empty input) are dropped.
        Remaining fragments are returned unmodified, so a sentence may keep
        leading whitespace when several whitespace characters separated it
        from the previous one.
    """
    return [
        sentence
        for sentence in _SENTENCE_BOUNDARY.split(text)
        if sentence.strip()
    ]

def dataset_loader(text_paths, tokenizer, max_length=128):
    """Build a sentence-level language-modelling dataset from text files.

    Each file is read as UTF-8, split into sentences with
    ``sentence_tokenizer`` and every non-blank sentence is tokenized to
    exactly ``max_length`` token ids (truncated or padded as needed).

    Args:
        text_paths: Iterable of paths to UTF-8 text files.
        tokenizer: Callable tokenizer that accepts a list of strings together
            with ``max_length``, ``truncation`` and ``padding`` keyword
            arguments and returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries (one row per input string).
            It must have a padding token configured.
        max_length: Fixed sequence length of every example. Defaults to 128.

    Returns:
        A ``(dataset, num_examples)`` tuple. ``dataset`` is an unbatched
        ``tf.data.Dataset`` yielding dicts with int32 ``input_ids``,
        ``attention_mask`` and ``labels`` tensors of shape ``(max_length,)``;
        ``labels`` is a copy of ``input_ids``. ``num_examples`` is the number
        of examples the dataset yields in one pass.
    """
    input_ids = []
    attention_masks = []
    for text_path in text_paths:
        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read()

        # A blank fragment would tokenize to a pure-padding example that
        # carries no training signal, so it is skipped.
        sentences = [s for s in sentence_tokenizer(text) if s.strip()]
        if not sentences:
            continue

        # One batched call per file, returning plain Python lists: this avoids
        # allocating (and keeping alive) a separate set of TF tensors for
        # every single sentence.
        encoded = tokenizer(
            sentences,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )
        input_ids.extend(encoded["input_ids"])
        attention_masks.extend(encoded["attention_mask"])

    num_examples = len(input_ids)

    def generator():
        for ids, mask in zip(input_ids, attention_masks):
            yield {
                "input_ids": ids,
                "attention_mask": mask,
                "labels": ids,
            }

    # All three features share the same fixed-length int32 signature.
    feature_spec = tf.TensorSpec(shape=(max_length,), dtype=tf.int32)
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature={
            "input_ids": feature_spec,
            "attention_mask": feature_spec,
            "labels": feature_spec,
        },
    )

    # Element order does not need to be reproducible for training, so let
    # tf.data relax its ordering guarantee.
    options = tf.data.Options()
    options.experimental_deterministic = False
    dataset = dataset.with_options(options)

    return dataset, num_examples

# Instantiate the tokenizer and the model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# The EOS token doubles as the padding token so that padding="max_length"
# can be used by the dataset loader.
tokenizer.pad_token = tokenizer.eos_token
model = TFGPT2LMHeadModel.from_pretrained("gpt2")

# Load dataset
dir_path = r"C:\Users\serda\OneDrive\Bureau\Online Education\Certification\Projet final\Projetbooks\clean"
text_paths = [os.path.join(dir_path, "cleaned_annakarenina.txt")]
train_dataset, num_examples = dataset_loader(text_paths, tokenizer)

# Training schedule
batch_size = 32
num_epochs = 3
steps_per_epoch = num_examples // batch_size
if steps_per_epoch == 0:
    # With fewer examples than one batch the repeated pipeline below would
    # never produce a batch, so fail loudly instead of hanging.
    raise ValueError(
        f"Need at least {batch_size} training examples, got {num_examples}."
    )


def _mask_padding_labels(example):
    """Replace the label at every padded position with -100.

    -100 is the label value the model's built-in loss ignores. The attention
    mask is used to find padding because the pad token id equals the EOS id.
    """
    example = dict(example)
    labels = example["labels"]
    example["labels"] = tf.where(
        example["attention_mask"] == 1,
        labels,
        tf.cast(-100, labels.dtype),
    )
    return example


# Keras does not rewind a dataset between epochs when steps_per_epoch is
# given, so the pipeline must repeat; otherwise the data runs out right after
# the first epoch. drop_remainder keeps each epoch aligned to exactly
# `steps_per_epoch` full batches.
train_batches = (
    train_dataset
    .map(_mask_padding_labels)
    .batch(batch_size, drop_remainder=True)
    .repeat()
)

# Configure training
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# No `loss` argument on purpose: the model then uses its built-in causal-LM
# loss computed from the "labels" entry, which shifts the labels by one
# position (next-token prediction) and skips the -100 positions. A plain
# Keras SparseCategoricalCrossentropy would compare each position with its
# own input token and also score the padding.
model.compile(optimizer=optimizer)

# Fine-tune the model
model.fit(train_batches, epochs=num_epochs, steps_per_epoch=steps_per_epoch)

# Save the fine-tuned model
model.save_pretrained("./gpt2_finetuned")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Epoch 1/3
Epoch 2/3
