In [1]:
# ============================================
# 🌟 GPT-2 Text Generation Project
# Prodigy Infotech Internship Task (Final Version)
# By: [Your Name]
# ============================================

# 🧩 STEP 1 — Install all dependencies (fixed version)
!pip install -U pip setuptools wheel
!pip install tokenizers==0.13.3 --only-binary=:all:
!pip install transformers==4.31.0 datasets torch --quiet

# 🧩 STEP 2 — Import all libraries
import os
os.environ["WANDB_DISABLED"] = "true"  # disable Weights & Biases pop-up

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

# 🧠 STEP 3 — Load GPT-2 model & tokenizer
# Load the pretrained GPT-2 checkpoint and its matching tokenizer by name.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Fix padding token issue: reuse the end-of-sequence token as the pad token so
# that padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
# generate() takes its defaults from generation_config, not from config; with
# only the line above, the run still logged "Setting `pad_token_id` to
# `eos_token_id`" at generation time. Set it here as well.
model.generation_config.pad_token_id = model.config.eos_token_id

# 📝 STEP 4 — Create your small dataset
# You can replace these lines with your own text data
# Tiny in-memory training corpus: one sentence per example.
text_data = [
    "Artificial intelligence is changing the world of technology.",
    "Machine learning helps computers learn from experience.",
    "Natural language processing enables communication with computers.",
    "Data science combines math and coding to solve real problems.",
    "AI will make automation smarter and more efficient in the future.",
]

# Wrap the sentences in a Dataset with a single "text" column.
dataset = Dataset.from_dict(dict(text=text_data))

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens. Returns
    whatever the tokenizer call returns.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 64,
    }
    return tokenizer(examples["text"], **encode_options)

# Causal language-modelling collator (mlm=False selects the non-masked objective).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Tokenize the whole corpus in batched mode; every example is used for training.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
train_dataset = tokenized_datasets

# ⚙️ STEP 5 — Set training arguments
# Hyperparameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    output_dir="./results",          # where the Trainer writes its outputs
    overwrite_output_dir=True,       # reuse the directory across re-runs
    num_train_epochs=2,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=5,
    # Disable external experiment trackers explicitly. Relying on the
    # WANDB_DISABLED environment variable is deprecated (the run logged a
    # warning saying it will be removed in v5 and to use report_to instead).
    report_to="none",
)

# 🏋️‍♂️ STEP 6 — Train the model
# Bundle the model, its training configuration, the tokenized corpus and the
# batch collator into a Trainer, then run the fine-tuning loop.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

print("🚀 Training started... Please wait.")
trainer.train()

# ✨ STEP 7 — Generate text
# Sample a continuation of a short prompt from the fine-tuned model.
prompt = "Artificial intelligence"

# The model has just been trained; switch to inference mode so dropout does
# not perturb the sampled tokens.
model.eval()

# Training may have moved the model to an accelerator. Put the prompt tensors
# on the same device, otherwise generate() fails with a device mismatch.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_length=80,       # total length cap, prompt tokens included
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    # Pass the pad id explicitly; without it generate() falls back to the EOS
    # id and logs a warning on every call.
    pad_token_id=tokenizer.pad_token_id,
)

print("\n🧠 Generated Text:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# 💾 STEP 8 — Save model
# Persist the fine-tuned weights and the tokenizer files side by side so the
# folder can later be loaded as a single checkpoint.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print(
    "\n✅ Training complete! Model saved in 'fine_tuned_model' folder.",
    "🎉 You can now use it to generate custom text!",
    sep="\n",
)


[31mERROR: Ignored the following yanked versions: 0.20.4[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement tokenizers==0.13.3 (from versions: 0.14.0, 0.14.1, 0.15.0, 0.15.1, 0.15.2, 0.19.0, 0.19.1, 0.20.0, 0.20.1rc1, 0.20.1, 0.20.2, 0.20.3rc0, 0.20.3, 0.20.4rc0, 0.21.0rc0, 0.21.0, 0.21.1rc0, 0.21.1, 0.21.2rc0, 0.21.2, 0.21.4, 0.22.0, 0.22.1rc0, 0.22.1)[0m[31m
[0m[31mERROR: No matching distribution found for tokenizers==0.13.3[0m[31m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tokenizers [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


🚀 Training started... Please wait.


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
5,3.6366


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🧠 Generated Text:

Artificial intelligence and robotics is already in its infancy, and that is not likely to change.

"It will make a lot of difference in the future. It's not going to be perfect. It's going to be better than what we've done," said Rene Fournier, an analyst at the University of Michigan's Hoover Institution in Palo Alto, California.

The goal

✅ Training complete! Model saved in 'fine_tuned_model' folder.
🎉 You can now use it to generate custom text!


In [None]:
# Mount Google Drive at /content/drive via the google.colab helper; this import
# is only available inside a Google Colab runtime.
# NOTE(review): nothing visible in this notebook reads from or writes to the
# mount — presumably meant for copying ./fine_tuned_model to Drive; confirm.
from google.colab import drive
drive.mount('/content/drive')