In [6]:
# Import the libraries that will be used
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling


In [9]:
# Download the "tiny_shakespeare" dataset. The progress output recorded with this
# cell shows train, validation and test splits containing one example each.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own loading
# script to run locally — confirm the source is trusted before re-running.
dataset = load_dataset("tiny_shakespeare", trust_remote_code=True)


Downloading data: 100%|██████████| 1.12M/1.12M [00:00<00:00, 3.24MB/s]
Generating train split: 100%|██████████| 1/1 [00:00<00:00,  1.93 examples/s]
Generating validation split: 100%|██████████| 1/1 [00:00<00:00, 499.02 examples/s]
Generating test split: 100%|██████████| 1/1 [00:00<00:00, 444.45 examples/s]


In [10]:
# Fetch the pretrained "gpt2" checkpoint: the language-model head first,
# then the tokenizer that belongs to the same checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [11]:
# Set padding token: the tokenization step pads sequences to a fixed length,
# which needs a pad token, so the end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Tokenize
def tokenize_function(examples, block_size=128):
    """Tokenize a batch of texts and slice it into fixed-length training blocks.

    Every text is tokenized in full and then cut into consecutive windows of
    ``block_size`` token ids, so a long document yields many rows instead of
    only its first ``block_size`` tokens. The final window of each text is
    right-padded with the tokenizer's pad token id, and the padded positions
    are marked with 0 in ``attention_mask``. An empty text yields no rows.

    The number of returned rows can differ from the number of input rows, so
    this must be applied with ``batched=True`` and with the original columns
    removed (as the ``dataset.map`` call in this notebook does).

    Args:
        examples: Batch mapping whose "text" entry is a list of strings.
        block_size: Number of token ids in every returned row (must be >= 1).

    Returns:
        A dict with "input_ids" and "attention_mask", each a list of lists of
        length ``block_size``.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    pad_id = tokenizer.pad_token_id
    input_ids, attention_mask = [], []
    # No truncation here: the whole text is kept and chunked below.
    for ids in tokenizer(examples["text"])["input_ids"]:
        for start in range(0, len(ids), block_size):
            chunk = list(ids[start:start + block_size])
            n_pad = block_size - len(chunk)  # > 0 only for the last chunk of a text
            input_ids.append(chunk + [pad_id] * n_pad)
            attention_mask.append([1] * len(chunk) + [0] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask}

In [13]:
# Run tokenize_function over every split in batched mode and drop the raw
# "text" column so only the tokenizer's output columns remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map: 100%|██████████| 1/1 [00:02<00:00,  2.43s/ examples]
Map: 100%|██████████| 1/1 [00:00<00:00,  6.16 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00,  5.24 examples/s]


In [14]:
# Batch builder for training. mlm=False switches off masked-language-model
# masking, because GPT-2 is not trained with masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Split the dataset
# Hold out 10% of the tokenized training rows for evaluation.
# The result is bound to a new name (instead of overwriting tokenized_datasets)
# so re-running this cell does not split an already-split dataset, and a fixed
# seed makes the partition reproducible across kernel restarts.
# NOTE: train_test_split needs more than one row in the "train" split.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir="./gpt2-shakespeare",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # --- Optimisation ---
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # --- Batch sizes (per device) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- Logging / evaluation / checkpoint cadence, all in optimizer steps ---
    logging_steps=100,
    # NOTE(review): newer transformers releases may name this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    # --- Precision: request fp16 only when a CUDA device is available ---
    fp16=torch.cuda.is_available(),
)

In [None]:
# Wire the model, its configuration and the data together in a Trainer.
trainer = Trainer(
    # What is trained, and how.
    model=model,
    args=training_args,
    # Text processing and batch assembly.
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Training rows and the held-out evaluation rows.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


In [None]:
# Train the model: runs fine-tuning with the Trainer configured above
# (3 epochs; evaluation and checkpointing every 500 steps per training_args).
trainer.train()

In [None]:
 # Save the fine-tuned model to the same directory used as the Trainer's output_dir.
trainer.save_model("./gpt2-shakespeare")

In [None]:

# Generate text using the fine-tuned model
def generate_text(prompt, max_length=50, do_sample=True):
    """Continue ``prompt`` with the notebook's ``model`` and return the decoded text.

    Args:
        prompt: Text that generation starts from.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (the cap on the generated sequence length, prompt included).
        do_sample: When True, sample using top_k=50, top_p=0.95 and
            temperature=0.7. When False, those sampling settings are omitted
            and decoding is deterministic.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped.
    """
    model.eval()
    if torch.cuda.is_available():
        model.to("cuda")

    # Move the inputs to wherever the model actually lives instead of assuming
    # "cuda"; this also carries the attention mask along with the input ids.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # top_k / top_p / temperature only take effect when sampling is enabled,
    # so they are passed together with do_sample=True and left out otherwise.
    sampling_kwargs = (
        {"top_k": 50, "top_p": 0.95, "temperature": 0.7} if do_sample else {}
    )

    output = model.generate(
        **encoded,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=do_sample,
        # The pad token is the EOS token in this notebook; state it explicitly
        # so generate() does not have to guess.
        pad_token_id=tokenizer.eos_token_id,
        **sampling_kwargs,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


Test the fine-tuned model with a sample prompt.

In [None]:

# Example prompt: a short saying by "Haji Qdir Koyee", used as the start of the generation.
# NOTE(review): the prompt text appears to contain typos ("englightenment", "a merely a");
# it is left unchanged here because it is the literal model input — confirm the intended wording.
prompt = "If life lacks light and englightenment, It is a merely a dream:"
generated_text = generate_text(prompt)
# print() is given two arguments, so a space follows the newline before the generated text.
print("Generated Text:\n", generated_text)