<a href="https://colab.research.google.com/github/skiraware/BabyGPT/blob/main/babygpt_refined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/karpathy/nanoGPT
%cd nanoGPT
!pip install torch numpy transformers datasets tiktoken wandb tqdm

Cloning into 'nanoGPT'...
remote: Enumerating objects: 686, done.[K
remote: Total 686 (delta 0), reused 0 (delta 0), pack-reused 686 (from 1)[K
Receiving objects: 100% (686/686), 954.03 KiB | 9.54 MiB/s, done.
Resolving deltas: 100% (387/387), done.
/content/nanoGPT
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
 

In [2]:
!python data/shakespeare_char/prepare.py


length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
train has 1,003,854 tokens
val has 111,540 tokens


In [3]:
!python train.py config/train_shakespeare_char.py  --compile=False --always_save_checkpoint=True --eval_interval=5000


Overriding config with config/train_shakespeare_char.py:
# train a miniature character-level shakespeare model
# good for debugging and playing on macbooks and such

out_dir = 'out-shakespeare-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too too often

# we expect to overfit on this small dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'shakespeare-char'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare_char'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of 

In [4]:
!python sample.py --out_dir=out-shakespeare-char

Overriding: out_dir = out-shakespeare-char
number of parameters: 10.65M
Loading meta from data/shakespeare_char/meta.pkl...


ANGELO:
And come, my lord,
Straight and you not.

ISABELLA:
It is bar at him that he that did
So meet his purpose to him.

ANGELO:

ISABELLA:
When she did but say 'thwack in this maid:
Go the before her my brother live to me.

ANGELO:
I am sorry that she hath left under you.

ISABELLA:
I would thought it would the instrument of mine
Condemned that would she have done to the sour of her
After you love the princes.

ISABELLA:
Now, as it were a dish of sound that is mine,
I desire to move my count
---------------

Men punish, I shall find them at once again.

Second Servingman:
You will be here a little: she is confessor, she can learn
his mother; which shall be beholdered in Carthack
be considered.

First Servingman:
And so did I.

Third Servingman:
My voice is grown made by the warlike sweet sorrow.

Third Servingman:
What, is he?

Third Servingman:
Here hath non

In [5]:
# Step 3: Model Architecture Exploration (Revised for n_head=2,3,5,7 with n_embd=210, max_iters=1000)
# Run in Google Colab with T4 GPU

import os
import matplotlib.pyplot as plt

# Step 3.1: Train models with Layers=7, Heads=[2, 3, 5, 7], n_embd=210, max_iters=1000
heads = [2, 3, 5, 7]
for h in heads:
    print(f"Training with Layers=7, Heads={h}")
    cmd = (f"python train.py config/train_shakespeare_char.py "
           f"--n_layer=7 --n_head={h} --n_embd=210 --compile=False "
           f"--out_dir=out-shakespeare-l7-h{h} --max_iters=1000 --batch_size=8 "
           f"> output_l7_h{h}.txt 2> error_l7_h{h}.txt")
    os.system(cmd)

# Debug: Check output and error files
for h in heads:
    # Check output file
    output_file = f"output_l7_h{h}.txt"
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            content = f.read().strip()
            if content:
                print(f"{output_file} has content (first 100 chars): {content[:100]}")
                last_lines = content.split('\n')[-10:]
                print(f"Last 10 lines of {output_file}:\n", '\n'.join(last_lines))
            else:
                print(f"{output_file} is empty")
    else:
        print(f"{output_file} does not exist")

    # Check error file
    error_file = f"error_l7_h{h}.txt"
    if os.path.exists(error_file):
        with open(error_file, 'r') as f:
            error_content = f.read().strip()
            if error_content:
                print(f"{error_file} has content:\n{error_content}")
            else:
                print(f"{error_file} is empty")
    else:
        print(f"{error_file} does not exist")

# Step 3.2: Extract losses at iteration 1000
def extract_losses(filename, target_iter=1000):
    try:
        with open(filename, 'r') as f:
            for line in f:
                if f'step {target_iter}' in line:
                    parts = line.split()
                    train_loss = float(parts[4].strip(','))  # e.g., "1.2345,"
                    val_loss = float(parts[7])               # e.g., "1.3456"
                    return train_loss, val_loss
        print(f"No step {target_iter} found in {filename}")
        return None, None
    except FileNotFoundError:
        print(f"File {filename} not found")
        return None, None
    except Exception as e:
        print(f"Error parsing {filename}: {e}")
        return None, None

# Collect losses
train_losses = []
val_losses = []

for h in heads:  # Corrected from 'heads26 heads'
    t_loss, v_loss = extract_losses(f"output_l7_h{h}.txt", target_iter=1000)
    if t_loss is not None and v_loss is not None:
        train_losses.append(t_loss)
        val_losses.append(v_loss)
    else:
        train_losses.append(float('inf'))
        val_losses.append(float('inf'))

print("Heads:", heads)
print("Train Losses:", train_losses)
print("Val Losses:", val_losses)

# Create figures directory and plot
os.makedirs('figures', exist_ok=True)

plt.plot(heads, train_losses, marker='o')
plt.xlabel('Number of Heads')
plt.ylabel('Training Loss at Iteration 1000')
plt.title('Training Loss vs. Number of Heads (Layers = 7)')
plt.grid(True)
plt.savefig('figures/loss_vs_heads.png')
plt.close()

# Step 3.3: Report lowest validation loss and settings
min_val_loss = min(val_losses)
best_head = heads[val_losses.index(min_val_loss)] if min_val_loss != float('inf') else None

Training with Layers=7, Heads=2
Training with Layers=7, Heads=3
Training with Layers=7, Heads=5
Training with Layers=7, Heads=7
output_l7_h2.txt has content (first 100 chars): Overriding config with config/train_shakespeare_char.py:
# train a miniature character-level shakesp
Last 10 lines of output_l7_h2.txt:
 iter 930: loss 2.2269, time 34.31ms, mfu 0.53%
iter 940: loss 2.1792, time 34.10ms, mfu 0.53%
iter 950: loss 2.2274, time 34.40ms, mfu 0.53%
iter 960: loss 2.2008, time 34.52ms, mfu 0.53%
iter 970: loss 2.2091, time 34.53ms, mfu 0.52%
iter 980: loss 2.1761, time 34.33ms, mfu 0.52%
iter 990: loss 2.2278, time 34.62ms, mfu 0.52%
step 1000: train loss 2.1299, val loss 2.1881
saving checkpoint to out-shakespeare-l7-h2
iter 1000: loss 2.2563, time 5258.34ms, mfu 0.47%
error_l7_h2.txt has content:
  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
output_l7_h3.txt has content (first 100 chars): Overriding config with config/train_shakespeare_char.py:
# train a miniatur

In [7]:
import os
import shutil

# Create data/code_generation directory by copying data/shakespeare_char
os.makedirs('data/code_generation', exist_ok=True)
shutil.copy('data/shakespeare_char/prepare.py', 'data/code_generation/prepare.py')

# Clone The-Young-Programmer/C-CPP-Programming repository
print("Cloning The-Young-Programmer/C-CPP-Programming repository...")
os.system('git clone https://github.com/The-Young-Programmer/C-CPP-Programming.git')

# Aggregate all .c, .cpp, and .h files into input.txt
output_file = 'data/code_generation/input.txt'
with open(output_file, 'w', encoding='utf-8') as outfile:
    for root, dirs, files in os.walk('C-CPP-Programming'):
        for file in files:
            if file.endswith(('.c', '.cpp', '.h')):  # Include C, C++, and header files
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as infile:
                        outfile.write(infile.read())
                        outfile.write('\n\n')  # Separate files
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

# Verify file size (rough estimate: ~4 chars per token)
file_size = os.path.getsize(output_file)
estimated_tokens = file_size // 4
print(f"Size of input.txt: {file_size} bytes (~{estimated_tokens} tokens)")

# Duplicate content if token count is below 100,000
if estimated_tokens < 100000:
    print("Duplicating input.txt content to meet token requirement...")
    with open(output_file, 'r', encoding='utf-8') as f:
        content = f.read()
    with open(output_file, 'w', encoding='utf-8') as f:
        for _ in range((100000 // estimated_tokens) + 1):
            f.write(content)
            f.write('\n\n')
    new_size = os.path.getsize(output_file)
    estimated_tokens = new_size // 4
    print(f"New size of input.txt: {new_size} bytes (~{estimated_tokens} tokens)")

# Run prepare.py to process the dataset
os.chdir('data/code_generation')
print("Running prepare.py...")
os.system('python prepare.py')
os.chdir('../..')

# Read vocab_size and token count from meta.pkl
import numpy as np
import pickle
meta_path = 'data/code_generation/meta.pkl'
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    train_bin = np.memmap('data/code_generation/train.bin', dtype=np.uint16, mode='r')
    token_count = len(train_bin)
    print(f"Vocab size: {vocab_size}")
    print(f"Token count: {token_count}")
else:
    print("meta.pkl not found. Check prepare.py output.")

Cloning The-Young-Programmer/C-CPP-Programming repository...
Size of input.txt: 132431 bytes (~33107 tokens)
Duplicating input.txt content to meet token requirement...
New size of input.txt: 529732 bytes (~132433 tokens)
Running prepare.py...
Vocab size: 111
Token count: 476445


In [8]:
import os
import time

# Train the model
print("Training BabyGPT on C/C++ code generation dataset...")
start_time = time.time()
cmd = (f"python train.py config/train_code_generation.py "
       f"--compile=False --out_dir=out-code-generation "
       f"> output_code_generation.txt 2> error_code_generation.txt")
os.system(cmd)
elapsed_time = time.time() - start_time
print(f"Training completed in {elapsed_time:.2f} seconds")

# Check output file
output_file = 'output_code_generation.txt'
if os.path.exists(output_file):
    with open(output_file, 'r') as f:
        content = f.read().strip()
        if content:
            print(f"{output_file} has content (first 100 chars): {content[:100]}")
            last_lines = content.split('\n')[-10:]
            print(f"Last 10 lines of {output_file}:\n", '\n'.join(last_lines))
        else:
            print(f"{output_file} is empty")
else:
    print(f"{output_file} does not exist")

# Check error file
error_file = 'error_code_generation.txt'
if os.path.exists(error_file):
    with open(error_file, 'r') as f:
        error_content = f.read().strip()
        if error_content:
            print(f"{error_file} has content:\n{error_content}")
        else:
            print(f"{error_file} is empty")
else:
    print(f"{error_file} does not exist")

Training BabyGPT on C/C++ code generation dataset...
Training completed in 402.17 seconds
output_code_generation.txt has content (first 100 chars): Overriding config with config/train_code_generation.py:
# config/train_code_generation.py
out_dir = 
Last 10 lines of output_code_generation.txt:
 iter 430: loss 0.6991, time 748.35ms, mfu 0.05%
iter 440: loss 1.8041, time 741.91ms, mfu 0.05%
iter 450: loss 1.5769, time 746.20ms, mfu 0.05%
iter 460: loss 1.2985, time 754.58ms, mfu 0.05%
iter 470: loss 1.2101, time 741.13ms, mfu 0.05%
iter 480: loss 0.6765, time 719.46ms, mfu 0.05%
iter 490: loss 1.6352, time 729.79ms, mfu 0.05%
step 500: train loss 1.0083, val loss 0.9693
saving checkpoint to out-code-generation
iter 500: loss 0.7077, time 3690.31ms, mfu 0.05%
error_code_generation.txt has content:
  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))


In [9]:
import os

# Generate samples
print("Generating samples from C/C++ code generation model...")
cmd = (f"python sample.py --out_dir=out-code-generation --start='def ' "
       f"> samples_code_generation.txt 2> error_samples_code_generation.txt")
os.system(cmd)

# Read the first 20 lines of the generated samples
sample_file = 'samples_code_generation.txt'
if os.path.exists(sample_file):
    with open(sample_file, 'r') as f:
        samples = f.readlines()
        samples = [line.strip() for line in samples if line.strip()]
        first_20_lines = samples[:20]
        print("First 20 lines of generated samples:")
        for line in first_20_lines:
            print(line)
else:
    first_20_lines = ["No samples generated. Check error_samples_code_generation.txt."]
    print(f"{sample_file} does not exist")

# Check error file
error_sample_file = 'error_samples_code_generation.txt'
if os.path.exists(error_sample_file):
    with open(error_sample_file, 'r') as f:
        error_content = f.read().strip()
        if error_content:
            print(f"{error_sample_file} has content:\n{error_content}")
        else:
            print(f"{error_sample_file} is empty")
else:
    print(f"{error_sample_file} does not exist")

Generating samples from C/C++ code generation model...
First 20 lines of generated samples:
Overriding: out_dir = out-code-generation
Overriding: start = def
number of parameters: 3.73M
Loading meta from data/code_generation/meta.pkl...
def boardace
case '\n';
return ch;
}
// colord candition to din tition
// allt_scording();
{
cout << "\t\t3.Restetext\t";
gotoxy(row, col);
cout << "\t\t Presssss Ad\n^-----------------------------------------------------------------------------------------------------------------------
<< "\t\t            "
<< "\t\t\t\t"
< "\t\t\t\t\t                 "
<< '\n'
<<"\n\t\t  "
---------------
error_samples_code_generation.txt is empty
