In [None]:
%pip install torch transformers datasets

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader
from datasets import load_dataset
from torch.optim import AdamW
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

In [None]:
# Load the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

model = GPT2LMHeadModel.from_pretrained("distilgpt2")
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

# Assign a padding token if not already present
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as the pad_token

# Tokenize the input text and set up labels
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()  # Set the labels as input_ids
    return tokenized_inputs

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Convert dataset to PyTorch tensors
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Create DataLoader
train_dataset = tokenized_datasets["train"]

def collate_fn(batch):
    return tokenizer.pad(batch, padding=True, return_tensors="pt")

train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=1,  # Disable multiprocessing to debug
    pin_memory=True
)

# Load the model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Set the model to training mode
model.train()

# Training loop
epochs = 3
scaler = GradScaler()  # Initialize the scaler for mixed precision

# Set gradient accumulation steps (adjust to simulate larger batches)
accumulation_steps = 4  # Simulates larger batch size

for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)

    optimizer.zero_grad()  # Reset the gradients before starting

    for step, batch in enumerate(loop):
        inputs = {key: val.to(device) for key, val in batch.items()}

        with torch.cuda.amp.autocast():  # Enable mixed precision
            outputs = model(**inputs)
            loss = outputs.loss / accumulation_steps  # Scale loss for accumulation

        scaler.scale(loss).backward()  # Backpropagate loss

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)  # Update weights
            scaler.update()
            optimizer.zero_grad()  # Reset gradients

print("Training complete!")


  scaler = GradScaler()  # Initialize the scaler for mixed precision
  with torch.cuda.amp.autocast():  # Enable mixed precision
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 2295/2295 [47:35<00:00,  1.24s/it]
100%|██████████| 2295/2295 [42:19<00:00,  1.11s/it]
100%|██████████| 2295/2295 [42:27<00:00,  1.11s/it]

Training complete!





In [None]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Load the model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set pad token to eos token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Move model to the correct device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare input text and move to device
input_text = "How are you? Tell me about Tesla Motors!"
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Create attention mask to differentiate between padding and actual data
attention_mask = torch.ones(inputs.shape, device=device)

# Generate text with repetition penalty
with torch.no_grad():
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=150,  # Increased length to avoid truncation
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.8,  # Slightly adjusted temperature
        top_k=50,         # Limit next token choices to top-k
        top_p=0.9,        # Use nucleus sampling
        repetition_penalty=1.2  # Penalty to discourage repetition
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)



How are you? Tell me about Tesla Motors!
I'm a big fan of the company and I've been working on it for years. It's one of my favorite cars ever made, but there is something special that makes this car so unique to us: The way we build our vehicles in such an innovative manner allows them not only to be built with high quality materials (like aluminum), they can also have their own custom parts available from suppliers like BMW or Mercedes-Benz as well – all without having to worry too much over what will happen when these components go into production…and then get shipped out by truck every year."
