In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` from a plain-text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer handed to ``TextDataset`` to encode the file.
        block_size: Block length forwarded to ``TextDataset``. Defaults to
            128, the value that used to be hard-coded here.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

In [None]:
def load_data_collator(tokenizer):
    """Return a ``DataCollatorForLanguageModeling`` with masked-LM turned off.

    Args:
        tokenizer: Tokenizer the collator is constructed with.

    Returns:
        The collator, created with ``mlm=False``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
def train(
    model_name="gpt2",
    train_file="/content/space_exploration_train.txt",
    val_file="/content/space_exploration_val.txt",
    output_dir="./results",
):
    """Fine-tune a pre-trained GPT-2 checkpoint and save the model and tokenizer.

    Called with no arguments it behaves as before: it fine-tunes ``"gpt2"`` on
    the space-exploration text files and writes to ``./results``.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        train_file: Text file used to build the training dataset.
        val_file: Text file used to build the evaluation dataset.
        output_dir: Directory handed to ``TrainingArguments``; the final model
            and tokenizer are saved in its ``fine_tuned_gpt2`` subdirectory.

    Returns:
        None. The results are written to disk.
    """
    import os

    # Load pre-trained model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Add padding token to tokenizer (the EOS token is reused as padding)
    tokenizer.pad_token = tokenizer.eos_token

    # Load datasets
    train_dataset = load_dataset(train_file, tokenizer)
    val_dataset = load_dataset(val_file, tokenizer)

    # Load data collator
    data_collator = load_data_collator(tokenizer)

    # Set up training arguments
    # NOTE(review): eval_steps presumably only takes effect when a step-based
    # evaluation strategy is configured as well; none is set here and the
    # recorded run logged only a training-loss column. Confirm, and enable the
    # strategy if validation during training is wanted.
    # NOTE(review): the recorded run ended at checkpoint-231, which suggests
    # fewer total steps than warmup_steps=500. Verify the schedule is intended.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        eval_steps=400,
        save_steps=800,
        warmup_steps=500,
        prediction_loss_only=True,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model together with the tokenizer that was actually
    # used. Reloading a fresh tokenizer from the base checkpoint here would
    # drop the pad-token setting made above.
    model_path = os.path.join(output_dir, "fine_tuned_gpt2")
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

In [None]:
# Kick off fine-tuning; train() returns nothing and writes its results to disk.
train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Step,Training Loss


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def generate_text(model, tokenizer, prompt, max_length=100):
    """Generate a sampled continuation of ``prompt`` and return it as text.

    Args:
        model: Language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Calling the tokenizer (instead of ``encode``) also yields the attention
    # mask, which ``generate`` cannot infer when pad and EOS share one id.
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)

    # Fall back to EOS when the tokenizer carries no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate text
    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        pad_token_id=pad_token_id,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # top_k / top_p / temperature only apply when sampling is enabled;
        # without do_sample=True decoding is greedy and they are ignored.
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )

    # Decode the generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Load the fine-tuned model and tokenizer back from disk; this is the same
# directory path that train() saves to.
model_path = "./results/fine_tuned_gpt2"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set the model to evaluation mode before generating
model.eval()

# Generate a continuation of a sample prompt and print it
prompt = "Astronauts are exploring Mars. They"
generated_text = generate_text(model, tokenizer, prompt)
print(f"Generated text:\n{generated_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated text:
Astronauts are exploring Mars. They encounter intense gravity but manage to overcome it. During their mission, they discover signs of microbial life. This finding could revolutionize our understanding of the universe.
Astrophysauts have colonized Jupiter. Their mission is terraforming Saturn. It encounters toxic atmosphere but manages to manage it successfully. After several years, the mission concludes that it is habitable. The finding may revolutionise our knowledge of Mars and could lead to new understanding about the cosmos.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def evaluate_model(model, tokenizer, test_file):
    """Compute the perplexity of ``model`` over a line-delimited text file.

    Every non-blank line is tokenized (truncated to 512 tokens) and scored on
    its own. The per-line mean losses are combined as an average weighted by
    the number of predicted tokens, and the exponential of that average is
    returned.

    Args:
        model: Language model that returns an object with a ``loss`` attribute
            when called with ``labels``; must expose ``eval`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        test_file: Path of a UTF-8 text file with one sample per line.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If the file contains no line with at least two tokens,
            so there is nothing to score.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines carry no content to score.
            if not line.strip():
                continue

            inputs = tokenizer(
                line, return_tensors='pt', truncation=True, max_length=512
            ).to(model.device)

            # With labels == input_ids the model shifts the labels internally,
            # so its mean loss covers seq_len - 1 predictions, not seq_len.
            # A one-token line has no prediction and would yield an undefined loss.
            n_predicted = inputs['input_ids'].size(1) - 1
            if n_predicted < 1:
                continue

            with torch.no_grad():
                outputs = model(**inputs, labels=inputs['input_ids'])

            total_loss += outputs.loss.item() * n_predicted
            total_tokens += n_predicted

    if total_tokens == 0:
        raise ValueError(
            f"No scorable text found in {test_file!r}: "
            "every line is blank or shorter than two tokens."
        )

    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    return perplexity.item()

# Load the fine-tuned model and tokenizer back from disk; this is the same
# directory path that train() saves to.
model_path = "./results/fine_tuned_gpt2"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Evaluate on the test set and report perplexity.
# NOTE(review): this is a relative path, unlike the absolute /content/... paths
# used for the train/val files, so it resolves against the current working
# directory. Confirm that is intended.
test_file = "space_exploration_test.txt"
perplexity = evaluate_model(model, tokenizer, test_file)
print(f"Perplexity: {perplexity}")

Perplexity: 1.2726107835769653


In [None]:
# Archive the whole results directory recursively (the saved fine-tuned model
# plus any Trainer checkpoint folders under it) into a single zip file.
!zip -r /content/results.zip /content/results

  adding: content/results/ (stored 0%)
  adding: content/results/fine_tuned_gpt2/ (stored 0%)
  adding: content/results/fine_tuned_gpt2/merges.txt (deflated 53%)
  adding: content/results/fine_tuned_gpt2/tokenizer_config.json (deflated 54%)
  adding: content/results/fine_tuned_gpt2/config.json (deflated 51%)
  adding: content/results/fine_tuned_gpt2/vocab.json (deflated 68%)
  adding: content/results/fine_tuned_gpt2/generation_config.json (deflated 24%)
  adding: content/results/fine_tuned_gpt2/special_tokens_map.json (deflated 74%)
  adding: content/results/fine_tuned_gpt2/model.safetensors (deflated 7%)
  adding: content/results/checkpoint-231/ (stored 0%)
  adding: content/results/checkpoint-231/rng_state.pth (deflated 25%)
  adding: content/results/checkpoint-231/scheduler.pt (deflated 56%)
  adding: content/results/checkpoint-231/optimizer.pt (deflated 8%)
  adding: content/results/checkpoint-231/trainer_state.json (deflated 54%)
  adding: content/results/checkpoint-231/config.jso