# Assignment 6, Part A: Implement nanoGPT



In [1]:
# Clone the nanoGPT repository
!git clone https://github.com/karpathy/nanoGPT.git
%cd nanoGPT

# Install required packages
!pip install numpy transformers datasets tiktoken wandb tqdm

# Upgrade PyTorch to version 2.0.1 with CUDA 11.8 support
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 --index-url https://download.pytorch.org/whl/cu118



Cloning into 'nanoGPT'...
remote: Enumerating objects: 682, done.[K
remote: Total 682 (delta 0), reused 0 (delta 0), pack-reused 682 (from 1)[K
Receiving objects: 100% (682/682), 952.47 KiB | 2.83 MiB/s, done.
Resolving deltas: 100% (385/385), done.
/content/nanoGPT
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downlo

In [2]:
import torch

# Show which PyTorch build this kernel actually loaded.
installed_torch_version = torch.__version__
print(installed_torch_version)

2.0.1+cu118


In [3]:
# Download Shakespeare's works
!wget https://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

# Create a directory for the data
!mkdir data/shakespeare

# Move the text file to the data directory
!mv shakespeare.txt data/shakespeare/input.txt


--2024-12-01 02:32:14--  https://www.gutenberg.org/files/100/100-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5618733 (5.4M) [text/plain]
Saving to: ‘shakespeare.txt’


2024-12-01 02:32:15 (9.75 MB/s) - ‘shakespeare.txt’ saved [5618733/5618733]

mkdir: cannot create directory ‘data/shakespeare’: File exists


In [4]:
# Create the prepare.py script
%%writefile data/shakespeare/prepare.py

import os
import pickle
import numpy as np

# Define input and output paths
input_file_path = 'data/shakespeare/input.txt'
train_output_file_path = 'data/shakespeare/train.bin'
val_output_file_path = 'data/shakespeare/val.bin'

# Read the input text
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()

# Get all unique characters
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(f"Unique characters: {vocab_size}")

# Create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

# Encode the entire text data
data_size = len(data)
print(f"Data has {data_size} characters.")

# Convert data to integers
encoded_data = np.array([stoi[c] for c in data], dtype=np.uint16)

# Split data into training and validation sets
n = int(0.9 * len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

# Save the data to .bin files
train_data.tofile(train_output_file_path)
val_data.tofile(val_output_file_path)

# Save the mapping for decoding
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open('data/shakespeare/meta.pkl', 'wb') as f:
    pickle.dump(meta, f)


Overwriting data/shakespeare/prepare.py


In [5]:
# Run the data preparation script; it prints the vocabulary size and character count
# and writes train.bin, val.bin and meta.pkl under data/shakespeare/
!python data/shakespeare/prepare.py

Unique characters: 100
Data has 5359439 characters.


In [6]:
# Create the shakespeare.py config file
%%writefile config/shakespeare.py

# Configuration for training nanoGPT on Shakespeare's works

out_dir = 'out-shakespeare'
eval_interval = 500
eval_iters = 200
log_interval = 100

always_save_checkpoint = False

wandb_log = False  # Set to True if using Weights & Biases
wandb_project = 'shakespeare'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare'
batch_size = 64
block_size = 256  # Context length

# Model configuration
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3
max_iters = 500
lr_decay_iters = 500  # Make equal to max_iters
min_lr = 1e-4
beta2 = 0.99

warmup_iters = 100

Writing config/shakespeare.py


In [7]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
if not torch.cuda.is_available():
    print("CUDA is not available. No GPU detected.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. GPU device name: {gpu_name}")

CUDA is available. GPU device name: Tesla T4


## Start Training

In [8]:
# Start training: train.py is launched with config/shakespeare.py as its only argument
# (the log shows it overriding its settings with the contents of that file)
!python train.py config/shakespeare.py

Overriding config with config/shakespeare.py:

# Configuration for training nanoGPT on Shakespeare's works

out_dir = 'out-shakespeare'
eval_interval = 500
eval_iters = 200
log_interval = 100

always_save_checkpoint = False

wandb_log = False  # Set to True if using Weights & Biases
wandb_project = 'shakespeare'
wandb_run_name = 'mini-gpt'

dataset = 'shakespeare'
batch_size = 64
block_size = 256  # Context length

# Model configuration
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3
max_iters = 500
lr_decay_iters = 500  # Make equal to max_iters
min_lr = 1e-4
beta2 = 0.99

warmup_iters = 100

tokens per iteration will be: 655,360
found vocab_size = 100 (inside data/shakespeare/meta.pkl)
Initializing a new model from scratch
number of parameters: 10.66M
num decayed parameter tensors: 26, with 10,753,536 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True
compiling the model... (takes a ~minute)
step 0: train loss 4.611