In [1]:
import warnings
warnings.filterwarnings("ignore")
from transformers import logging
logging.set_verbosity_error()

# Preprocessing of data

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# loading dataset and tokenizer
from datasets import load_dataset

wiki = load_dataset(
    "wikitext", "wikitext-2-raw-v1",
    split={
        "train": "train[:15000]",
        "validation": "validation[:3200]",
        "test": "test[:4000]"
    }
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token    #padding
wiki

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})

In [3]:
# tokenizing and splitting data
def preprocess(example):
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)
    tokens["labels"] = tokens["input_ids"]
    return tokens

wiki_tokenized = wiki.map(preprocess, batched=True, remove_columns=["text"])
wiki_tokenized.set_format("torch")

train_dataset = wiki_tokenized["train"]
eval_dataset = wiki_tokenized["validation"]
test_dataset = wiki_tokenized["test"]

print(train_dataset, eval_dataset, test_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 15000
}) Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3200
}) Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4000
})


# Warm up time check

In [4]:
# import time
# from transformers import GPT2LMHeadModel, TrainingArguments, EarlyStoppingCallback, Trainer

# model = GPT2LMHeadModel.from_pretrained("gpt2")

# # warmup time 

# training_args = TrainingArguments(
#     output_dir="./gpt2-finetuned",
#     per_device_train_batch_size=4,
#     # gradient_accumulation_steps=2, 
#     num_train_epochs=1,
#     max_steps=20,                      # limit training to 20 steps for estimate
#     logging_steps=5,
#     fp16=True,
#     disable_tqdm=False
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
# )

# start = time.time()
# trainer.train()
# end = time.time()

# time_per_step = (end - start) / 20
# total_steps = int(len(train_dataset) / 4)  # total steps in 1 epoch
# estimated_total = time_per_step * total_steps

# print(f"\nEstimated total training time: {estimated_total / 60:.2f} minutes")

print('Estimated time ~150 min. \nCommented and restarted kernel to save CUDA memory \n(may continue without commenintg for GPU: min 8GB, RTX 3050)')

Estimated time ~150 min. 
Commented and restarted kernel to save CUDA memory 
(may continue without commenintg for GPU: min 8GB, RTX 3050)


# Memory handling before training

In [4]:
import torch

# CUDA capability checking
if torch.cuda.is_available():
    device = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(device)
    compute_capability = torch.cuda.get_device_capability(device)

    print(f"GPU: {gpu_name}")
    print(f"Compute Capability: {compute_capability}")
    

    if compute_capability[0] >= 7:      
        print("FP16 is supported on this GPU.")
    else:
        print("FP16 is not supported.")
else:
    print("CUDA not available.")

GPU: NVIDIA GeForce RTX 2050
Compute Capability: (8, 6)
FP16 is supported on this GPU.


In [5]:
# Check GPU memory after cleanup

del wiki_tokenized                # deleting not required variables to save memory

import gc
gc.collect()
import torch
torch.cuda.empty_cache()

free_mem, total_mem = torch.cuda.mem_get_info()
free_mem_mb = free_mem / (1024 ** 2) 
print(f"Free (available) GPU memory: {free_mem_mb:.2f} MB")

Free (available) GPU memory: 2834.45 MB


# Actual model train

In [None]:
import torch
import time
from transformers import (
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    TrainerCallback,
)

# Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Custom callback to log free CUDA memory
class CUDAMemoryLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()                                # CUDA cache clear comes with trade-off > more frequent memory call up increased learning time
            free_mem, total_mem = torch.cuda.mem_get_info()
            free_mb = free_mem / (1024 ** 2)
            print(f"[Step {state.global_step}] Free CUDA memory: {free_mb:.2f} MB")

# Training parametres
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps=2,       #commented to reduce learning time > more preferred over clearing cache
    num_train_epochs=1,
    evaluation_strategy="epoch",           # evaluate every epoch
    logging_strategy="steps",
    logging_steps=600,                    #checking details after 600 steps  
    learning_rate=5e-5,
    warmup_steps=200,                      # default: lr_scheduler = linear,  ~10% of total steps (1 epoch 15k/4 ~3500 steps)
    weight_decay=0.01,                     # default optimizer AdamW, wt decay ~ regularization
    save_strategy="epoch",
    save_total_limit=2,                     # best two model will be saved after all epochs, bad ones automatically gets deleted to save space when new better version comes
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,               # eval_loss for less it should be less thus 'False'
    fp16=True,                             # 16 bit, mixed precision
    prediction_loss_only=False,
    disable_tqdm=False                     # show tqdm progress bar
)

# Final Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,          
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2),        # early stopping, though uselss for 1 epoch
        CUDAMemoryLoggerCallback()                               #custom memory log callback
    ]
)

# Train
trainer.train()

 16%|█▌        | 600/3750 [14:41<1:22:53,  1.58s/it]

[Step 600] Free CUDA memory: 678.45 MB
{'loss': 1.2485, 'learning_rate': 4.447887323943662e-05, 'epoch': 0.16}


 32%|███▏      | 1200/3750 [29:37<1:05:55,  1.55s/it]

[Step 1200] Free CUDA memory: 678.45 MB
{'loss': 0.8523, 'learning_rate': 3.602816901408451e-05, 'epoch': 0.32}


 48%|████▊     | 1800/3750 [40:20<35:30,  1.09s/it]  

[Step 1800] Free CUDA memory: 678.45 MB
{'loss': 0.8582, 'learning_rate': 2.7577464788732394e-05, 'epoch': 0.48}


 64%|██████▍   | 2400/3750 [50:29<21:42,  1.04it/s]

[Step 2400] Free CUDA memory: 678.45 MB
{'loss': 0.7768, 'learning_rate': 1.9126760563380284e-05, 'epoch': 0.64}


 80%|████████  | 3000/3750 [1:01:48<12:35,  1.01s/it] 

[Step 3000] Free CUDA memory: 678.45 MB
{'loss': 0.825, 'learning_rate': 1.067605633802817e-05, 'epoch': 0.8}


 96%|█████████▌| 3600/3750 [1:12:06<02:40,  1.07s/it]

[Step 3600] Free CUDA memory: 678.45 MB
{'loss': 0.7731, 'learning_rate': 2.2253521126760562e-06, 'epoch': 0.96}


                                                     
100%|██████████| 3750/3750 [1:17:42<00:00,  1.03s/it]

[Step 3750] Free CUDA memory: 678.45 MB
{'eval_loss': 0.8296507000923157, 'eval_runtime': 181.2336, 'eval_samples_per_second': 17.657, 'eval_steps_per_second': 4.414, 'epoch': 1.0}


100%|██████████| 3750/3750 [1:17:52<00:00,  1.25s/it]

[Step 3750] Free CUDA memory: 678.45 MB
{'train_runtime': 4672.4649, 'train_samples_per_second': 3.21, 'train_steps_per_second': 0.803, 'train_loss': 0.8849034159342448, 'epoch': 1.0}





TrainOutput(global_step=3750, training_loss=0.8849034159342448, metrics={'train_runtime': 4672.4649, 'train_samples_per_second': 3.21, 'train_steps_per_second': 0.803, 'train_loss': 0.8849034159342448, 'epoch': 1.0})

# Evaluation

In [19]:
import torch
import numpy as np
import math
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# fine tuned model loading
model_path = "./gpt2-finetuned/checkpoint-3750"
model = GPT2LMHeadModel.from_pretrained(model_path).to("cuda")

model.eval()

def evaluate(model, dataset, top_k=5):
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=4)
    losses = []
    topk_correct = 0
    total_preds = 0

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to("cuda")
        labels = batch["labels"].to("cuda")

        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=labels)
            shift_logits = outputs.logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()

            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction="none")
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            losses.append(loss.mean().item())

            # Top-k accuracy
            topk = torch.topk(shift_logits, k=top_k, dim=-1).indices
            match = (topk == shift_labels.unsqueeze(-1)).any(-1)
            topk_correct += match.sum().item()
            total_preds += match.numel()

    perplexity = math.exp(np.mean(losses))
    topk_acc = topk_correct / total_preds
    return perplexity, topk_acc

# evaluation process
perplexity, topk_acc = evaluate(model, test_dataset)
print(f"\nPerplexity: {perplexity:.2f}")
print(f"Top-{5} Accuracy: {topk_acc*100:.2f}%")


100%|██████████| 1000/1000 [56:44<00:00,  3.40s/it] 


Perplexity: 2.26
Top-5 Accuracy: 90.53%





# Deployment

In [8]:
import gradio as gr

# fine tuned model 
model_path = "./gpt2-finetuned/checkpoint-3750"
model = GPT2LMHeadModel.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained("gpt2")

model.eval()
model.to('cuda')

# Prediction function
def generate_next_word(prompt, max_length=30):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_length=inputs.input_ids.shape[1] + max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated

# Gradio Interface
gr.Interface(
    fn=generate_next_word,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter a prompt...", label="Input Prompt"),
        gr.Slider(5, 100, value=30, step=1, label="Max Output Length")
    ],
    outputs="text",
    title="GPT-2 Next Word Prediction",
    description="Enter a prompt and see how the model completes it.",
    theme="default"
).launch()


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




# LSTM comparision

In [7]:
wiki    #already loaded

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4000
    })
})

In [8]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast
from tqdm import tqdm
import time

# using GPT2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>"})

# encoding texts to match GPT inputs
def tokenize_and_pad(texts, max_len=256):
    all_ids = []
    for t in texts:
        if t.strip():
            ids = tokenizer(t, truncation=True, padding="max_length", max_length=max_len, return_tensors=None)["input_ids"]
            all_ids.extend(ids)
    return all_ids

#loading dtaset 
train_data = tokenize_and_pad(wiki["train"]["text"])
eval_data = tokenize_and_pad(wiki["validation"]["text"])
test_data  = tokenize_and_pad(wiki["test"]["text"])

VOCAB_SIZE = tokenizer.vocab_size + 1  # account for added pad_token

# Dataset class
class WordDataset(Dataset):
    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data) - self.seq_len

    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx:idx+self.seq_len]),
                torch.tensor(self.data[idx+1:idx+self.seq_len+1]))

# hyperparameters
SEQ_LEN = 16
BATCH_SIZE = 16
EMBED_DIM = 128
HIDDEN_DIM = 128
EPOCHS = 1
PATIENCE = 1
TOP_K = 5

# Dataloaders
dataset = WordDataset(train_data, SEQ_LEN)
eval_dataset = WordDataset(eval_data, SEQ_LEN)
test_dataset = WordDataset(test_data, SEQ_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# LSTM architecture
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out

# Training setup
model = LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).to("cuda")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Early stopping
best_eval_loss = float('inf')
patience_counter = 0

In [9]:
from tqdm import tqdm

print("\nStarting training...")
overall_start_time = time.time()

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    epoch_start_time = time.time()

    model.train()
    total_loss = 0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc="Training", ncols=100)
    for step, (x, y) in progress_bar:
        x, y = x.to("cuda"), y.to("cuda")
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

        if step % 60000 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

    avg_train_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Avg Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x, y in eval_loader:
            x, y = x.to("cuda"), y.to("cuda")
            logits = model(x)
            loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
            val_loss += loss.item()
    avg_val_loss = val_loss / len(eval_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")

    epoch_duration = time.time() - epoch_start_time
    print(f"Epoch {epoch+1} completed in {epoch_duration:.2f} seconds")

    # Early stopping check
    if avg_val_loss < best_eval_loss:
        best_eval_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print("Early stopping triggered.")
            break

print(f"\nTraining completed in {(time.time() - overall_start_time) / 60:.2f} minutes.")



Starting training...

Epoch 1/1


Training:   0%|                                    | 4/154847 [00:06<51:54:15,  1.21s/it, loss=9.65]

Epoch 1, Step 0, Loss: 10.8989


Training:  39%|█████████████▌                     | 60005/154847 [22:22<33:53, 46.64it/s, loss=1.25]

Epoch 1, Step 60000, Loss: 1.1385


Training:  78%|████████████████████████▊       | 120008/154847 [1:23:17<12:47, 45.41it/s, loss=1.35]

Epoch 1, Step 120000, Loss: 1.6403


Training: 100%|████████████████████████████████| 154847/154847 [1:36:03<00:00, 26.87it/s, loss=1.18]


Epoch 1, Avg Training Loss: 1.6883
Epoch 1, Validation Loss: 2.6409
Epoch 1 completed in 5930.72 seconds

Training completed in 98.85 minutes.


In [None]:
# Load best model for evaluation

# model.load_state_dict(torch.load("best_model.pt")) 1 epoch only used

model.eval()

# Test evaluation
print("\nEvaluating on test set...")
with torch.no_grad():
    total_loss = 0
    topk_correct = 0
    total_preds = 0
    for x, y in tqdm(test_loader):
        x, y = x.to("cuda"), y.to("cuda")
        logits = model(x)
        loss = criterion(logits.view(-1, VOCAB_SIZE), y.view(-1))
        total_loss += loss.item()

        topk = torch.topk(logits, k=TOP_K, dim=-1).indices
        match = (topk == y.unsqueeze(-1)).any(-1)
        topk_correct += match.sum().item()
        total_preds += match.numel()

    avg_loss = total_loss / len(test_loader)
    perplexity = np.exp(avg_loss)
    topk_acc = topk_correct / total_preds
    print(f"Evaluation Completed:\nPerplexity: {perplexity:.2f}\nTop-{TOP_K} Accuracy: {topk_acc * 100:.2f}%")


Evaluating on test set...


100%|██████████| 42319/42319 [05:42<00:00, 123.50it/s]

Evaluation Completed:
Perplexity: 13.81
Top-5 Accuracy: 75.35%





In [12]:
print('GPT model: \nPerplexity: 2.26\nTop-5 Accuracy: 90.53%')

GPT model: 
Perplexity: 2.26
Top-5 Accuracy: 90.53%
