In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GearDataset(Dataset):
    """Line-oriented text dataset: each non-blank line of the input is one sample.

    Args:
        data: An open text file (anything exposing ``readlines()``) or any
            iterable of strings. The content is read eagerly in ``__init__``,
            so a file handle may be closed once the dataset has been built.

    Lines are stripped of surrounding whitespace, and lines that are empty
    after stripping are dropped so that no empty string is handed to the
    tokenizer as a training sample.
    """

    def __init__(self, data):
        # Keep supporting file objects, but also accept plain iterables of strings.
        raw_lines = data.readlines() if hasattr(data, "readlines") else data
        # Strip once up front and discard blank lines.
        self.data = [line.strip() for line in raw_lines if line.strip()]

    def __len__(self):
        """Return the number of non-blank samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the stripped text of sample ``idx``."""
        return self.data[idx]

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '<pad>'})  # Register a dedicated padding token
config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

# '<pad>' was added to the tokenizer as a new token, so the tokenizer can now
# emit an id the pretrained embedding matrix has no row for. Resize the
# embeddings to the tokenizer's length so padded batches do not index out of range.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

In [4]:
# Training hyperparameters
epochs = 10
batch_size = 4
learning_rate = 5e-5

# Gradient scaler for automatic mixed precision (AMP). It is only enabled when
# a CUDA device is available; when disabled, its scale/step/update calls fall
# back to the plain backward pass and optimizer step.
scaler = GradScaler(enabled=torch.cuda.is_available())

In [5]:
def fine_tune_gpt2_mixed_precision(dataset, tokenizer, model, epochs, batch_size, learning_rate,
                                   max_length=None, scaler=None):
    """Fine-tune a causal language model on ``dataset`` with automatic mixed precision.

    The model is moved to CUDA when available (otherwise CPU), put in training
    mode, and optimized in place with AdamW. The average loss is printed after
    every epoch; nothing is returned.

    Args:
        dataset: Dataset whose items are strings (one training text each).
        tokenizer: Tokenizer called on a batch of strings with
            ``return_tensors="pt"``, ``padding=True`` and ``truncation=True``;
            its output must contain ``input_ids`` and ``attention_mask``.
        model: Model called as ``model(**inputs, labels=labels)`` whose output
            exposes a ``.loss`` attribute.
        epochs: Number of passes over the dataset.
        batch_size: Number of texts per optimizer step.
        learning_rate: AdamW learning rate.
        max_length: Optional cap on tokens per sample, forwarded to the
            tokenizer. ``None`` keeps the tokenizer's own truncation limit.
            Shorter sequences mean smaller batches in memory.
        scaler: Optional ``GradScaler`` to use. When ``None`` a fresh one is
            created, enabled only if training runs on CUDA.

    Raises:
        ValueError: If ``dataset`` contains no samples.
    """
    if len(dataset) == 0:
        raise ValueError("dataset is empty; nothing to train on")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_amp = device.type == "cuda"  # AMP autocast/scaling is only meaningful on CUDA here
    model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    if scaler is None:
        # Do not rely on a notebook-level global; keep the function self-contained.
        scaler = GradScaler(enabled=use_amp)

    # Build the loader once; with shuffle=True each new iteration reshuffles.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0.0

        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)
            labels = inputs["input_ids"].clone()
            # Ignore padded positions in the loss. Using the attention mask
            # (rather than comparing against pad_token_id) stays correct even
            # if the pad token shares its id with a real token such as EOS.
            labels[inputs["attention_mask"] == 0] = -100

            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss

            # Scale the loss before backward, then step and update the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

In [6]:
# Load the dataset
pdf_path = '/home/rithwik/paper/dataset/shigley_cleaned_text.txt'
# GearDataset reads every line in its constructor, so the file can be closed
# as soon as the dataset is built; the context manager guarantees that.
with open(pdf_path, 'r', encoding='utf-8') as text:
    gear_dataset = GearDataset(text)

# Train with mixed precision
fine_tune_gpt2_mixed_precision(gear_dataset, tokenizer, model, epochs, batch_size, learning_rate)


Epoch 1/10:   0%|          | 0/1 [00:03<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 18.62 MiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.86 GiB is allocated by PyTorch, and 23.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)