In [8]:
import os
import torch
from torch.utils.data import Dataset
from transformers import (
    GPT2Config, GPT2LMHeadModel,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling, GPT2TokenizerFast
)
from tqdm import tqdm

# === Step 1: Setup Paths ===
chunk_dir = "token_chunks"
model_output_dir = "gpt2_custom_model"
block_size = 512
chunks_to_train = 20  # Only train on first 20 chunks

# === Step 2: Load Tokenizer ===
tokenizer = GPT2TokenizerFast(
    vocab_file="tokenizer_gpt2_custom/vocab.json",
    merges_file="tokenizer_gpt2_custom/merges.txt",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>"
)

# === Step 3: Define Dataset ===
class ChunkDataset(Dataset):
    def __init__(self, token_ids, block_size=512):
        self.examples = [token_ids[i:i+block_size] for i in range(0, len(token_ids) - block_size, block_size)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        x = torch.tensor(self.examples[idx])
        return {"input_ids": x, "labels": x.clone()}

# === Step 4: Define GPT2 Model from Scratch ===
config = GPT2Config(
    vocab_size=30000,
    n_positions=512,
    n_ctx=512,
    n_embd=256,
    n_layer=4,
    n_head=4,
)
model = GPT2LMHeadModel(config)

# === Step 5: Training Arguments ===
training_args = TrainingArguments(
    output_dir=model_output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    prediction_loss_only=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_args, data_collator=data_collator)

# === Step 6: Load and Train on First 20 Chunks ===
chunk_files = sorted([f for f in os.listdir(chunk_dir) if f.endswith(".pt")])[:chunks_to_train]
print(f"🧠 Training on {len(chunk_files)} chunks...")

all_tokens = []

for i, file in enumerate(tqdm(chunk_files, desc="📦 Loading Chunks")):
    file_path = os.path.join(chunk_dir, file)
    tokens = torch.load(file_path)
    all_tokens.extend(tokens)

dataset = ChunkDataset(all_tokens, block_size)
trainer.train_dataset = dataset

print(f"\n🚀 Starting training with {len(dataset)} samples...")
trainer.train()

# === Step 7: Save the Model ===
trainer.save_model(model_output_dir)
print(f"✅ Training complete! Model saved to: {model_output_dir}")


🧠 Training on 20 chunks...


📦 Loading Chunks: 100%|██████████| 20/20 [08:03<00:00, 24.20s/it]



🚀 Starting training with 78124 samples...


Step,Training Loss
100,9.4422
200,8.4779
300,7.9131
400,7.607
500,7.5761
600,7.5775
700,7.4394
800,7.4267
900,7.469
1000,7.3473


KeyboardInterrupt: 

In [2]:
import os
import torch
from torch.utils.data import Dataset
from transformers import (
    GPT2Config, GPT2LMHeadModel,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling,
    GPT2TokenizerFast
)
from tqdm import tqdm

# === Step 1: Paths and Settings ===
chunk_dir = "token_chunks"
model_output_dir = "gpt2_custom_model"
checkpoint_path = os.path.join(model_output_dir, "checkpoint-56000")
tokenizer_path = "tokenizer_gpt2_custom"
block_size = 512
chunks_to_use = 20

# === Step 2: Load Tokenizer ===
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)

# === Step 3: Dataset Loader ===
class ChunkDataset(Dataset):
    def __init__(self, token_ids, block_size=512):
        self.examples = [token_ids[i:i+block_size] for i in range(0, len(token_ids) - block_size, block_size)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        x = torch.tensor(self.examples[idx])
        return {"input_ids": x, "labels": x.clone()}

# === Step 4: Load Model Config & Model ===
config_path = os.path.join(model_output_dir, "config.json")
config = GPT2Config.from_json_file(config_path)
model = GPT2LMHeadModel(config)

# === Step 5: Training Arguments ===
training_args = TrainingArguments(
    output_dir=model_output_dir,
    overwrite_output_dir=False,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    prediction_loss_only=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_args, data_collator=data_collator)

# === Step 6: Train on 20 Chunks from Checkpoint ===
chunk_files = sorted([f for f in os.listdir(chunk_dir) if f.endswith(".pt")])[:chunks_to_use]
print(f"🧠 Resuming training on {len(chunk_files)} chunks from checkpoint-56000...")

resume_done = False

for i, file in enumerate(chunk_files):
    file_path = os.path.join(chunk_dir, file)
    print(f"\n📦 Loading chunk {i+1}/{len(chunk_files)}: {file}")
    token_ids = torch.load(file_path)
    dataset = ChunkDataset(token_ids.tolist(), block_size)
    trainer.train_dataset = dataset

    if not resume_done and os.path.exists(checkpoint_path):
        print(f"🔁 Resuming from checkpoint: {checkpoint_path}")
        trainer.train(resume_from_checkpoint=checkpoint_path)
        resume_done = True
    else:
        print(f"🚀 Training on chunk {i+1} with {len(dataset)} samples")
        trainer.train()

    print(f"✅ Finished training chunk {i+1}")

# === Step 7: Save Final Model ===
trainer.save_model(model_output_dir)
print(f"\n✅ Training complete! Model saved to: {model_output_dir}")



There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


🧠 Resuming training on 20 chunks from checkpoint-56000...

📦 Loading chunk 1/20: chunk_0.pt
🔁 Resuming from checkpoint: gpt2_custom_model\checkpoint-56000


Step,Training Loss


✅ Finished training chunk 1

📦 Loading chunk 2/20: chunk_1.pt
🚀 Training on chunk 2 with 3906 samples


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,5.8317
200,5.8558
300,5.8075
400,5.778
500,5.7929
600,5.7933
700,5.8606
800,5.7963
900,5.8016
1000,5.8184


✅ Finished training chunk 2

📦 Loading chunk 3/20: chunk_10.pt
🚀 Training on chunk 3 with 3906 samples


Step,Training Loss
100,5.8767
200,5.7956
300,5.7968
400,5.8732
500,5.7273
600,5.7998
700,5.816
800,5.8546
900,5.768
1000,5.789


✅ Finished training chunk 3

📦 Loading chunk 4/20: chunk_100.pt
🚀 Training on chunk 4 with 3906 samples


Step,Training Loss
100,5.9087
200,5.8086
300,5.8164
400,5.8349
500,5.8659
600,5.8087
700,5.8268
800,5.7997
900,5.7512
1000,5.7489


✅ Finished training chunk 4

📦 Loading chunk 5/20: chunk_1000.pt
🚀 Training on chunk 5 with 3906 samples


Step,Training Loss
100,5.7447
200,5.8272
300,5.7759
400,5.8308
500,5.763
600,5.8021
700,5.7557
800,5.8031
900,5.7813
1000,5.7398


✅ Finished training chunk 5

📦 Loading chunk 6/20: chunk_1001.pt
🚀 Training on chunk 6 with 3906 samples


Step,Training Loss
100,5.7249
200,5.7931
300,5.7964
400,5.7621
500,5.7913
600,5.7079
700,5.7853
800,5.7767
900,5.7789
1000,5.6953


✅ Finished training chunk 6

📦 Loading chunk 7/20: chunk_1002.pt
🚀 Training on chunk 7 with 3906 samples


Step,Training Loss
100,5.7787
200,5.6772
300,5.7466
400,5.7261
500,5.6989
600,5.7729
700,5.7415
800,5.6761
900,5.73
1000,5.716


✅ Finished training chunk 7

📦 Loading chunk 8/20: chunk_1003.pt
🚀 Training on chunk 8 with 3906 samples


Step,Training Loss
100,5.7674
200,5.6956
300,5.6815
400,5.7163
500,5.7694
600,5.6803
700,5.7636
800,5.6625
900,5.6838
1000,5.7585


✅ Finished training chunk 8

📦 Loading chunk 9/20: chunk_1004.pt
🚀 Training on chunk 9 with 3906 samples


Step,Training Loss
100,5.6523
200,5.6547
300,5.7945
400,5.6412
500,5.661
600,5.7349
700,5.6868
800,5.6862
900,5.6659
1000,5.6107


✅ Finished training chunk 9

📦 Loading chunk 10/20: chunk_1005.pt
🚀 Training on chunk 10 with 3906 samples


Step,Training Loss
100,5.5829
200,5.7265
300,5.5907
400,5.6719
500,5.6811
600,5.6829
700,5.7723
800,5.7154
900,5.6577
1000,5.6797


✅ Finished training chunk 10

📦 Loading chunk 11/20: chunk_1006.pt
🚀 Training on chunk 11 with 3906 samples


Step,Training Loss
100,5.65
200,5.6012
300,5.6404
400,5.5677
500,5.5766
600,5.669
700,5.6351
800,5.7001
900,5.7235
1000,5.6798


✅ Finished training chunk 11

📦 Loading chunk 12/20: chunk_1007.pt
🚀 Training on chunk 12 with 3906 samples


Step,Training Loss
100,5.5913
200,5.6684
300,5.6757
400,5.6816
500,5.6374
600,5.6379
700,5.6124
800,5.5917
900,5.5884
1000,5.6254


✅ Finished training chunk 12

📦 Loading chunk 13/20: chunk_1008.pt
🚀 Training on chunk 13 with 3906 samples


Step,Training Loss
100,5.7167
200,5.6138
300,5.6003
400,5.6208
500,5.5959
600,5.613
700,5.626
800,5.5887
900,5.5541
1000,5.577


✅ Finished training chunk 13

📦 Loading chunk 14/20: chunk_1009.pt
🚀 Training on chunk 14 with 3906 samples


Step,Training Loss
100,5.6002
200,5.6319
300,5.6503
400,5.6485
500,5.6532
600,5.6317
700,5.6041
800,5.6181
900,5.6524
1000,5.6382


✅ Finished training chunk 14

📦 Loading chunk 15/20: chunk_101.pt
🚀 Training on chunk 15 with 3906 samples


Step,Training Loss
100,5.6625
200,5.5659
300,5.6495
400,5.6427
500,5.5932
600,5.6303
700,5.5698
800,5.6363
900,5.5708
1000,5.6678


✅ Finished training chunk 15

📦 Loading chunk 16/20: chunk_1010.pt
🚀 Training on chunk 16 with 3906 samples


Step,Training Loss
100,5.5981
200,5.6917
300,5.5904
400,5.6388
500,5.6335
600,5.5545
700,5.564
800,5.6163
900,5.5915
1000,5.5735


✅ Finished training chunk 16

📦 Loading chunk 17/20: chunk_1011.pt
🚀 Training on chunk 17 with 3906 samples


Step,Training Loss
100,5.5564
200,5.5212
300,5.5429
400,5.5128
500,5.5806
600,5.6325
700,5.5257
800,5.5693
900,5.5077
1000,5.5462


✅ Finished training chunk 17

📦 Loading chunk 18/20: chunk_1012.pt
🚀 Training on chunk 18 with 3906 samples


Step,Training Loss
100,5.5763
200,5.5706
300,5.6219
400,5.4348
500,5.5171
600,5.5005
700,5.5603
800,5.5688
900,5.5956
1000,5.5062


✅ Finished training chunk 18

📦 Loading chunk 19/20: chunk_1013.pt
🚀 Training on chunk 19 with 3906 samples


Step,Training Loss
100,5.5722
200,5.5923
300,5.5562
400,5.5394
500,5.5517
600,5.5046
700,5.5308
800,5.5312
900,5.525
1000,5.5081


✅ Finished training chunk 19

📦 Loading chunk 20/20: chunk_1014.pt
🚀 Training on chunk 20 with 3906 samples


Step,Training Loss
100,5.5799
200,5.5541
300,5.5127
400,5.5405
500,5.5672
600,5.5287
700,5.5137
800,5.498
900,5.5777
1000,5.5557


✅ Finished training chunk 20

✅ Training complete! Model saved to: gpt2_custom_model
