In [1]:
import torch
import os
import evaluate
from torch.utils.data import Dataset, DataLoader, IterableDataset
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, get_scheduler
from huggingface_hub import login

login("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")

# huggingface cache
os.environ["HF_HUB_CACHE"] = "D:/huggingface_cache"
os.environ["HF_DATASET_CACHE"] = "D:/huggingface_cache/datasets"
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface_cache/models"

# PyTorch cache
os.environ["TORCH_HOME"] = "D:/torch_cache"

In [2]:
# loading gpt2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

max_steps = 50000
learning_rate = 1e-4

cuda


In [3]:
# Loading wikitext dataset
dataset = load_dataset("lfsm/multimodal_wiki", split="train", streaming=True, trust_remote_code=True)

In [5]:
# dataset loading class
class Dataset(IterableDataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __iter__(self):
        for example in self.dataset:
            encoding = self.tokenizer(example["text"], truncation=True, return_tensors="pt")
            yield {
                "input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
            }

NameError: name 'train_stream' is not defined

In [5]:
# Wrap dataset in PyTorch Dataset class
custom_dataset = Dataset(dataset, tokenizer)
train_dataloader = DataLoader(custom_dataset, batch_size=4, num_workers=4, pin_memory=True)

In [6]:
def estimate_loss_and_accuracy(model, eval_dataset) -> tuple[float, float]:

    # Estimate the loss and accuracy of a model on an evaluation dataset.

    model.eval()  # evaluation mode
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0

    # compute loss with gradient calculation
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs = batch["input_ids"].to(model.device)
            labels = inputs.clone()

            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item() * inputs.size(0)

            predictions = torch.argmax(logits, dim=-1)
            mask = labels != -100  # assumes masked labels
            correct = ((predictions == labels) & mask).sum().item()
            total_correct += correct
            total_tokens += mask.sum().item()
            
            num_batches += 1

    model.train()  # training mode
    
    # Calculate average metrics
    avg_loss = total_loss / num_batches if num_batches > 0 else float("inf")
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
    
    return avg_loss, accuracy

In [7]:
# tokenize dataset
def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize_batch, batched=True, remove_columns=["text"])

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [8]:
# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# training arguments
training_args = TrainingArguments(
    output_dir="F:/GPT2_Tuning/training_output",
    eval_strategy="no",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="F:/GPT2_Tuning/logs",
    logging_steps=1000,
    save_steps=2000,
    warmup_steps=1000,
    save_total_limit=3,
    max_steps=max_steps,
    fp16=True,
    optim="adamw_torch",
    report_to="none",
)

In [9]:
# custom learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=max_steps
)

# Load metric
accuracy_metric = evaluate.load("accuracy")

# Convert logits → predicted token IDs
def preprocess_logits_for_metrics(logits, labels):
    return torch.argmax(logits, dim=-1)

# Compute accuracy
def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    if predictions.shape != labels.shape:
        labels = labels[:, :predictions.shape[1]]
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [10]:
# initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler)
)

# Start training and estimate loss after each epoch
for epoch in range(training_args.num_train_epochs):
    print(f"Epoch {epoch+1}/{training_args.num_train_epochs}")
    trainer.train()
    
    # Estimate loss
    eval_loss = estimate_loss_and_accuracy(model, tokenized_datasets["train"])
    print(f"Estimated Loss after Epoch {epoch+1}: {eval_loss:.4f}")

NameError: name 'train_dataset' is not defined

In [None]:
# Perplexity from eval loss
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

# Generate text from a prompt
prompt = "In a distant future, humanity has"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output_ids = model.generate(
    inputs["input_ids"],
    max_length=100,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("\nGenerated Text:\n")
print(generated_text)

In [15]:
checkpoint_path = "F:/GPT2_Tuning/gpt2_training_checkpoints/checkpoint-50000"

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained(checkpoint_path)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set model to evaluation mode
model.eval()

def generate_response(prompt, max_length=100):
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    
    with torch.no_grad():  # Disable gradient calculation for inference
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            max_length=max_length,
            temperature=0.3,
            top_k=50,         # Limits to top-k words
            top_p=0.95,       # Nucleus sampling
            repetition_penalty=1.8,  # Prevents repetitive outputs
            do_sample=True
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the model
while True:
    prompt = input("Enter your question (Type 'exit' to quit): ")
    
    if prompt.lower() == "exit":
        print("Chatbot session ended.")
        break
    
    response = generate_response(prompt)
    print("Chatbot Response:", response)

Enter your question (Type 'exit' to quit):  hello


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Chatbot Response: hello. I’m a big fan of the old school, classic and modern style.. My favorite is that you can wear it with jeans or t-shirts for your summer party!. And my favourite part about this dress? It has an ankle strap to hold everything in place so no need getting tangled up on stuff! The straps are made from super soft cotton fabric which makes them perfect as long sleeves. They also come complete w/stitched seams so they don't get messy when worn.


Enter your question (Type 'exit' to quit):  what dress are you talking about?


Chatbot Response: what dress are you talking about?. I’ve been a fan of the classic, but now it is time to change. This week we have our first look at this gorgeous new collection from The Dormouse.. It was inspired by my favorite book “The Little Mermaid: A Memoirs Of My Life and Years With Love (1944-1949) – which has become one among many books that inspire me today! You can see more images in these gallery below!. Here


Enter your question (Type 'exit' to quit):  what dress are you going to wear?


Chatbot Response: what dress are you going to wear?. Dress up your wedding with a matching outfit for the perfect look.. This is an easy way of showing off that style and make it more memorable. You can choose from different styles, such as black or white dresses which will be worn with all kinds accessories including flowers in front!. The best part about this piece though – it's made outof cotton so if there’s anything else on hand then we don't have any choice but do our own research


Enter your question (Type 'exit' to quit):  exit


Chatbot session ended.
