In [1]:
import torch
import os
from torch.utils.data import Dataset, DataLoader, IterableDataset
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, get_scheduler

# Path for the huggingface cache
os.environ["HF_HOME"] = "D:/huggingface_cache"
os.environ["HF_DATASET_CACHE"] = "D:/huggingface_cache/datasets"
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface_cache/models"

# Path for the PyTorch cache
os.environ["TORCH_HOME"] = "D:/torch_cache"

In [2]:
# loading gpt2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

max_steps = 50000  
learning_rate = 5e-4

cuda


In [3]:
# Loading OpenWebText dataset
dataset = load_dataset("Skylion007/openwebtext", split="train", streaming=True, trust_remote_code=True)

In [4]:
# dataset loading class
class OpenWebTextDataset(IterableDataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    # def __len__(self):
    #     return len(self.dataset)

    # def __getitem__(self, idx):
    #     text = self.dataset[idx]["text"]
    #     encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

    #     return {
    #         "input_ids": encoding["input_ids"].squeeze(0),
    #         "attention_mask": encoding["attention_mask"].squeeze(0),
    #     }
    def __iter__(self):
        for example in self.dataset:
            encoding = self.tokenizer(example["text"], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
            yield {
                "input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
            }

In [5]:
# Wrap dataset in PyTorch Dataset class
custom_dataset = OpenWebTextDataset(dataset, tokenizer)
train_dataloader = DataLoader(custom_dataset, batch_size=4, num_workers=4, pin_memory=True)

In [6]:
# estimate loss function
def estimate_loss(model, eval_dataset):
    model.eval()
    loss = 0.0
    num_batches = 0

    for batch in eval_dataset:
        inputs = torch.tensor(batch["input_ids"]).unsqueeze(0).to(model.device)
        labels = inputs.clone()
        
        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            loss += outputs.loss.item()
            num_batches += 1

    model.train()
    return loss / num_batches if num_batches > 0 else float("inf")

In [7]:
# tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

In [8]:
# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# training arguments
training_args = TrainingArguments(
    output_dir="F:\\GPT2_Tuning\\gpt2_training_checkpoints",
    eval_strategy="no",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="F:\\GPT2_Tuning\\logs",
    logging_steps=1000,
    save_steps=5000,
    warmup_steps=1000,
    save_total_limit=3,
    max_steps=max_steps,
    fp16=True,
    optim="adamw_torch",
    report_to="none",
)

In [9]:
# custom learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=max_steps
)

In [10]:
# data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
# trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=custom_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler)
)

# Start training and estimate loss after each epoch
for epoch in range(training_args.num_train_epochs):
    print(f"Epoch {epoch+1}/{training_args.num_train_epochs}")
    trainer.train()
    
    # Estimate loss
    eval_loss = estimate_loss(model, tokenized_datasets["train"])
    print(f"Estimated Loss after Epoch {epoch+1}: {eval_loss:.4f}")

  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch 1/3


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1000,3.2559
2000,3.4155
3000,3.4024
4000,3.3871
5000,3.3953
6000,3.3784


KeyboardInterrupt: 

In [1]:
checkpoint_path = "F:/GPT2_Tuning/gpt2_training_checkpoints/"

# Loading fine-tuned model
model = GPT2LMHeadModel.from_pretrained(checkpoint_path)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set model to evaluation mode
model.eval()

def generate_response(prompt, max_length=500):
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            max_length=max_length,
            temperature=0.3,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.8,
            do_sample=True
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# driver code
while True:
    prompt = input("Enter your question (Type 'exit' to quit): ")
    
    if prompt.lower() == "exit":
        print("Chatbot session ended.")
        break
    
    response = generate_response(prompt)
    print("Chatbot Response:", response)

NameError: name 'GPT2LMHeadModel' is not defined