In [1]:
import joblib
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Define the ChatDataset class (must be defined before loading dataset)

In [2]:
class ChatDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (input, target) text pairs from a DataFrame.

    Every item is a dict of three 1-D tensors of length ``max_length``:

    * ``input_ids`` / ``attention_mask`` -- tokenization of the ``'input'`` column.
    * ``labels`` -- token ids of the ``'target'`` column, with padded positions
      replaced by ``IGNORE_INDEX`` so they do not contribute to the loss.

    Args:
        dataframe: Table with ``'input'`` and ``'target'`` columns; rows are
            read positionally through ``.iloc``.
        tokenizer: Callable that, given text plus the truncation/padding
            keyword arguments used below, returns a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension of 1.
        max_length: Length every sequence is truncated / padded to.

    NOTE(review): ``labels`` are built from a *different* text than
    ``input_ids``. A causal LM head that shifts labels internally (as the
    Hugging Face GPT-2 LM head is documented to do) expects labels aligned
    token-for-token with ``input_ids`` -- confirm this pairing is intended.
    """

    # Label value that PyTorch's cross-entropy skips by default (ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=64):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the tokenized ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        row = self.data.iloc[idx]  # one positional lookup instead of one per column

        input_encoding = self._encode(row['input'])
        target_encoding = self._encode(row['target'])

        # squeeze(0) removes only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also drop the sequence axis when max_length == 1.
        target_ids = target_encoding['input_ids'].squeeze(0)
        target_padding = target_encoding['attention_mask'].squeeze(0) == 0

        # Mask via the attention mask rather than the pad token id, so real
        # tokens are kept even if the pad token doubles as another token.
        labels = target_ids.masked_fill(target_padding, self.IGNORE_INDEX)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

#### Load the dataset with joblib (the pickle must have been saved with this same ChatDataset class)

In [3]:
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load pickles produced by a trusted run of this project.
# NOTE(review): presumably the pickle holds a ChatDataset instance, which would
# be why the class has to be defined in the kernel before this cell -- confirm.
dataset = joblib.load("data/processed_dataset.pkl")

#### Load GPT-2 model and tokenizer

In [4]:
# The tokenizer is read from a local directory; the model weights come from the
# "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2_tokenizer/")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Size the model's token embedding matrix to the tokenizer's vocabulary.
# Left as the cell's final expression so the resulting Embedding is displayed.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)



Embedding(50257, 768)

In [5]:
###

In [6]:
# Seed PyTorch's global RNG so the DataLoader's shuffle order (and any other
# torch-driven randomness during training) is repeatable on a fresh
# Restart-and-Run-All.
SEED = 42
torch.manual_seed(SEED)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define DataLoader
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define training parameters
epochs = 3

#### Training Loop

In [None]:
# Fine-tune for `epochs` passes over `train_dataloader`, printing the mean
# batch loss after each pass.
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0

    # Loop over batches
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", ncols=100):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients BEFORE the forward/backward pass. If a previous run of
        # this cell was interrupted between backward() and zero_grad(), the
        # leftover gradients would otherwise be added to this step's gradients.
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Accumulate loss for the epoch
        epoch_loss += loss.item()

    # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss/max(len(train_dataloader), 1)}")

Epoch 1/3:  51%|█████████████████████████▍                        | 240/472 [36:21<20:22,  5.27s/it]

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side, so the
# directory can be passed to from_pretrained() for both.
output_dir = "models/fine_tuned_gpt2/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)