In [1]:
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, pipeline

In [19]:
# Load the text file
with open("sentences.txt", "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

# Create a Hugging Face dataset
dataset = Dataset.from_dict({"text": lines})

Dataset({
    features: ['text'],
    num_rows: 1238
})


In [20]:
# Split the dataset into 90% train and 10% validation
dataset = dataset.train_test_split(test_size=0.1)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1114
    })
    test: Dataset({
        features: ['text'],
        num_rows: 124
    })
})


In [21]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to the end-of-sequence token

In [22]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Add labels (required for GPT-2 training)
def add_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)

Map:   0%|          | 0/1114 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

Map:   0%|          | 0/1114 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

In [23]:
# Set the dataset format to PyTorch
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [24]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Adjust for the tokenizer size

Embedding(50257, 768)

In [25]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",    # Evaluate at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    fp16=True,  # Use mixed precision if available
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,  # Pass the tokenizer for logging convenience
)

  trainer = Trainer(


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.757391
2,No log,0.742824
3,No log,0.741616


TrainOutput(global_step=420, training_loss=0.9109887259347098, metrics={'train_runtime': 16.9451, 'train_samples_per_second': 197.225, 'train_steps_per_second': 24.786, 'total_flos': 218309492736000.0, 'train_loss': 0.9109887259347098, 'epoch': 3.0})

In [28]:
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

('./fine_tuned_gpt2\\tokenizer_config.json',
 './fine_tuned_gpt2\\special_tokens_map.json',
 './fine_tuned_gpt2\\vocab.json',
 './fine_tuned_gpt2\\merges.txt',
 './fine_tuned_gpt2\\added_tokens.json')

In [16]:
# Generate text
text = "Why must humans fight amongst themselves?"

def generate_text(text, max_attempts=100):
    generator = pipeline("text-generation", model="./fine_tuned_gpt2", tokenizer="./fine_tuned_gpt2", device="cuda")
    
    prompt = f"Q: {text}\nA:"
    
    for _ in range(max_attempts):
        output = generator(
        prompt, # Prompt
        max_length=100,                  # Max tokens to generate
        truncation=True,
        num_return_sequences=1,         # Number of outputs
        temperature=1.5,                # Randomness (lower = more deterministic)
        top_k=50,                       # Consider top 50 tokens
        top_p=0.95,                     # Nucleus sampling
        do_sample=True                  # Enable sampling
        )
        output = output[0]["generated_text"][len(prompt):]
        if len(output) != 0:
            print(output)
            break
        
generate_text(text)

 To solve the question would imply that the solution does not seem obvious enough
