In [1]:
# Imports & Setup

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.utils.data import DataLoader
import joblib
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [4]:
# Step 2: Load dataset
from data.chat_dataset import ChatDataset  # make sure this file exists

dataset = joblib.load("data/processed_dataset.pkl")
print(f"Total samples: {len(dataset)}")

train_dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

Total samples: 51


In [5]:
# Step 3: Load model & tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2_tokenizer/")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Step 4: Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [7]:
# Step 5: Training loop
epochs = 3  # change as needed

model.train()
for epoch in range(epochs):
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", ncols=100)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())

Epoch 1/3: 100%|█████████████████████████████████████████| 26/26 [01:03<00:00,  2.42s/it, loss=1.27]
Epoch 2/3: 100%|██████████████████████████████████████████| 26/26 [00:52<00:00,  2.03s/it, loss=1.5]
Epoch 3/3: 100%|█████████████████████████████████████████| 26/26 [00:49<00:00,  1.91s/it, loss=1.17]


In [8]:
# Step 6: Save trained model

model.save_pretrained("models/gpt2_trained/")
tokenizer.save_pretrained("models/gpt2_trained/")

print("✅ Training complete and model saved!")

✅ Training complete and model saved!
