In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import matplotlib.pyplot as plt
import torch.optim as optim
import itertools

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

# Hyperparameter grid
# Every combination of the lists below is tried. Key order matters: the
# grid-search loop unpacks each combination positionally in this order.
hyperparameter_grid = dict(
    batch_size=[16, 32, 64],
    block_size=[64, 128, 256],
    max_iters=[200],
    learning_rate=[1e-3, 1e-4, 1e-5],
    n_embd=[256, 384, 512],
    n_head=[4, 8, 12],
    n_layer=[6, 10, 12],
)

# Settings shared by every combination.
dropout = 0.2      # probability handed to the nn.Dropout layers
eval_iters = 50    # batches per loss estimate; also the evaluation interval
# Function to create model with given hyperparameters
def create_model(vocab_size, n_embd, n_head, n_layer, *,
                 context_size=None, dropout_p=None, target_device=None):
    """Build a decoder-only transformer language model and move it to a device.

    The keyword-only arguments default to the module-level ``block_size``,
    ``dropout`` and ``device``, so existing four-argument calls behave as
    before; passing them explicitly makes the factory independent of those
    globals.

    Args:
        vocab_size: Number of token ids; sizes the token embedding and the
            output logits.
        n_embd: Width of the embeddings and of every block's output.
        n_head: Number of attention heads in each block.
        n_layer: Number of stacked blocks.
        context_size: Longest sequence the model accepts (rows of the
            position embedding and side of the causal mask).
        dropout_p: Probability used by every ``nn.Dropout`` layer.
        target_device: Device the finished model is moved to.

    Returns:
        The constructed ``GptLanguageModel``.
    """
    # Resolve the configuration once. A global is only read when the matching
    # keyword was not supplied.
    ctx_len = block_size if context_size is None else context_size
    p_drop = dropout if dropout_p is None else dropout_p
    dev = device if target_device is None else target_device

    class Head(nn.Module):
        """A single causal self-attention head."""

        def __init__(self, head_size):
            super().__init__()
            self.key = nn.Linear(n_embd, head_size, bias=False)
            self.query = nn.Linear(n_embd, head_size, bias=False)
            self.value = nn.Linear(n_embd, head_size, bias=False)
            # Lower-triangular mask: position t may only attend to positions <= t.
            self.register_buffer('tril', torch.tril(torch.ones(ctx_len, ctx_len)))
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            _, T, _ = x.shape
            k = self.key(x)
            q = self.query(x)
            # Dot-product scores scaled by 1/sqrt(head_size).
            wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
            # Hide future positions before normalising.
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
            wei = F.softmax(wei, dim=-1)
            wei = self.dropout(wei)
            v = self.value(x)
            return wei @ v

    class MultiHeadAttention(nn.Module):
        """Several heads run side by side, concatenated and projected to n_embd."""

        def __init__(self, num_heads, head_size):
            super().__init__()
            self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
            # The input width is head_size * num_heads, which can be smaller
            # than n_embd when n_embd is not a multiple of num_heads.
            self.proj = nn.Linear(head_size * num_heads, n_embd)
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            out = torch.cat([h(x) for h in self.heads], dim=-1)
            return self.dropout(self.proj(out))

    class FeedForward(nn.Module):
        """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

        def __init__(self, n_embd):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(n_embd, 4 * n_embd),
                nn.ReLU(),
                nn.Linear(4 * n_embd, n_embd),
                nn.Dropout(p_drop),
            )

        def forward(self, x):
            return self.net(x)

    class Block(nn.Module):
        """Attention then feed-forward, each followed by residual add + LayerNorm."""

        def __init__(self, n_embd, n_head):
            super().__init__()
            head_size = n_embd // n_head
            self.sa = MultiHeadAttention(n_head, head_size)
            self.ffws = FeedForward(n_embd)
            self.ln1 = nn.LayerNorm(n_embd)
            self.ln2 = nn.LayerNorm(n_embd)

        def forward(self, x):
            # The layer norm is applied after each residual addition.
            x = self.ln1(x + self.sa(x))
            x = self.ln2(x + self.ffws(x))
            return x

    class GptLanguageModel(nn.Module):
        """Token + position embeddings, a stack of blocks, and a logit head."""

        def __init__(self, vocab_size):
            super().__init__()
            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
            self.position_embedding_table = nn.Embedding(ctx_len, n_embd)
            self.block = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
            self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
            self.lm_head = nn.Linear(n_embd, vocab_size)

            self.apply(self._init_weights)

        def _init_weights(self, module):
            """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

        def forward(self, index, targets=None):
            """Return ``(logits, loss)`` for a 2-D tensor of token ids.

            Without ``targets`` the logits keep shape (B, T, vocab) and the
            loss is ``None``. With ``targets`` the logits are flattened to
            (B * T, vocab) and the cross-entropy loss is returned with them.
            """
            B, T = index.shape

            tok_emb = self.token_embedding_table(index)
            # Build the position ids on the input's own device rather than a
            # module-level global.
            pos_emb = self.position_embedding_table(
                torch.arange(T, device=index.device))

            x = tok_emb + pos_emb  # b,t,c
            x = self.block(x)
            x = self.ln_f(x)
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B * T, C)
                targets = targets.view(B * T)
                loss = F.cross_entropy(logits, targets)
            return logits, loss

        def generate(self, index, max_new_tokens):
            """Append ``max_new_tokens`` sampled tokens to ``index`` and return it."""
            for _ in range(max_new_tokens):
                # The position table and causal mask only cover ctx_len
                # positions, so condition on the most recent ctx_len tokens;
                # feeding the whole sequence fails once it grows past that.
                index_cond = index[:, -ctx_len:]
                logits, _ = self(index_cond)
                logits = logits[:, -1, :]  # distribution for the next token only
                probs = F.softmax(logits, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)
                index = torch.cat((index, index_next), dim=1)
            return index

    return GptLanguageModel(vocab_size).to(dev)

# Define data functions
def get_random_chunk(split):
    """Read a random window of a split's text file and return it as token ids.

    ``'train'`` selects ``train_split.txt``; any other value selects
    ``val_split.txt``. The window is ``block_size * batch_size - 1`` bytes
    long; it is decoded as UTF-8 with undecodable bytes dropped, carriage
    returns are removed, and the text is mapped through the global ``encode``.

    Returns:
        A 1-D ``torch.long`` tensor of token ids.
    """
    if split == 'train':
        filename = "train_split.txt"
    else:
        filename = "val_split.txt"

    span = block_size * batch_size
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Pick a start offset that leaves room for a whole window.
            start_pos = random.randint(0, len(mm) - span)
            mm.seek(start_pos)
            raw = mm.read(span - 1)

    text = raw.decode('utf-8', errors='ignore').replace('\r', '')
    return torch.tensor(encode(text), dtype=torch.long)

def get_batch(split):
    """Draw ``batch_size`` (input, target) windows from a fresh random chunk.

    Each target window is its input window shifted one token to the right.

    Returns:
        ``(x, y)``: two stacked tensors of ``block_size``-long windows, both
        moved to the global ``device``.
    """
    chunk = get_random_chunk(split)
    starts = torch.randint(len(chunk) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(chunk[start:start + block_size])
        targets.append(chunk[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, eval_iters):
    """Estimate the mean loss on the train and validation splits.

    The model is put in eval mode for the measurement and its previous
    train/eval mode is restored afterwards, so calling this in the middle of
    training does not leave dropout switched off.

    Args:
        model: Module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict mapping ``'train'`` and ``'val'`` to a 0-d tensor holding the
        mean loss over ``eval_iters`` batches.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)
    return out

# Load vocabulary
# The vocabulary is the sorted set of distinct characters found in vocab.txt.
chars = ""
with open('vocab.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")

# Character <-> integer id lookup tables; an id is the character's rank in chars.
string_to_int = {}
int_to_string = {}
for i, ch in enumerate(chars):
    string_to_int[ch] = i
    int_to_string[i] = ch


def encode(s):
    """Return the list of integer ids for the characters of ``s``."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in ``l``."""
    return ''.join([int_to_string[i] for i in l])

# Initialize results
best_loss = float('inf')
best_params = {}

# Grid search over hyperparameters
for params in itertools.product(*hyperparameter_grid.values()):
    # Unpacking rebinds the module-level batch_size / block_size that
    # create_model, get_random_chunk and get_batch read as globals.
    batch_size, block_size, max_iters, learning_rate, n_embd, n_head, n_layer = params

    print(f"\nTesting combination: {params}")

    # Create model with current hyperparameters
    model = create_model(vocab_size, n_embd, n_head, n_layer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Evaluation curves for this combination only (reset on every combination).
    train_losses = []
    val_losses = []

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iters):
        if step % eval_iters == 0:
            losses = estimate_loss(model, eval_iters)
            train_losses.append(losses['train'])
            val_losses.append(losses['val'])
            print(f"step: {step}, train loss: {losses['train']}, val loss: {losses['val']}")
            # estimate_loss puts the model in eval mode; switch back so
            # dropout is active for the training steps that follow.
            model.train()

        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    final_loss = loss.item()
    print(f"Final loss: {final_loss}")

    # Check if the current model is the best one
    # NOTE(review): the ranking uses the loss of the last training mini-batch,
    # which is noisy; the averaged validation loss may be a better criterion.
    if final_loss < best_loss:
        best_loss = final_loss
        # The grid's keys are in the same order as the values in `params`.
        best_params = dict(zip(hyperparameter_grid, params))

# Save best parameters
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)

print(f"Best parameters: {best_params} with loss: {best_loss}")

# Optionally plot losses
# train_losses / val_losses are re-created for every combination, so only the
# curves of the last combination tried are available here.
# One point was recorded every eval_iters steps; plot against the real step
# number so the 'Iteration' axis is not just the evaluation index.
eval_steps = [i * eval_iters for i in range(len(train_losses))]
plt.plot(eval_steps, [float(v) for v in train_losses], label='Train Loss')
plt.plot(eval_steps, [float(v) for v in val_losses], label='Validation Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Losses')
plt.show()

cuda
Vocabulary size: 32171

Testing combination: (16, 64, 200, 0.001, 256, 4, 6)
step: 0, train loss: 10.4306001663208, val loss: 10.430209159851074
step: 50, train loss: 3.2726123332977295, val loss: 3.2111847400665283
step: 100, train loss: 2.97087025642395, val loss: 3.057042121887207
step: 150, train loss: 2.839160203933716, val loss: 2.829014539718628
Final loss: 2.6228508949279785

Testing combination: (16, 64, 200, 0.001, 256, 4, 10)
step: 0, train loss: 10.437643051147461, val loss: 10.440773963928223
step: 50, train loss: 2.910841464996338, val loss: 2.8277034759521484
step: 100, train loss: 2.800921678543091, val loss: 2.7220914363861084
step: 150, train loss: 2.691829204559326, val loss: 2.7112538814544678
Final loss: 2.7108919620513916

Testing combination: (16, 64, 200, 0.001, 256, 4, 12)
step: 0, train loss: 10.391509056091309, val loss: 10.391407012939453
step: 50, train loss: 2.9284470081329346, val loss: 2.826960802078247
step: 100, train loss: 2.6976218223571777, val

KeyboardInterrupt: 

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import matplotlib.pyplot as plt
import torch.optim as optim
import itertools

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

# Hyperparameter grid
# Every combination of the lists below is tried. Key order matters: the
# grid-search loop unpacks each combination positionally in this order.
hyperparameter_grid = dict(
    batch_size=[16, 32, 64],
    block_size=[64, 128, 256],
    max_iters=[200],
    learning_rate=[1e-3, 1e-4, 1e-5],
    n_embd=[256, 384, 512],
    n_head=[4, 8, 12],
    n_layer=[6, 10, 12],
)

# Settings shared by every combination.
dropout = 0.2      # probability handed to the nn.Dropout layers
eval_iters = 50    # batches per loss estimate; also the evaluation interval

# Function to create model with given hyperparameters
def create_model(vocab_size, n_embd, n_head, n_layer, *,
                 context_size=None, dropout_p=None, target_device=None):
    """Build a decoder-only transformer language model and move it to a device.

    The keyword-only arguments default to the module-level ``block_size``,
    ``dropout`` and ``device``, so existing four-argument calls behave as
    before; passing them explicitly makes the factory independent of those
    globals.

    Args:
        vocab_size: Number of token ids; sizes the token embedding and the
            output logits.
        n_embd: Width of the embeddings and of every block's output.
        n_head: Number of attention heads in each block.
        n_layer: Number of stacked blocks.
        context_size: Longest sequence the model accepts (rows of the
            position embedding and side of the causal mask).
        dropout_p: Probability used by every ``nn.Dropout`` layer.
        target_device: Device the finished model is moved to.

    Returns:
        The constructed ``GptLanguageModel``.
    """
    # Resolve the configuration once. A global is only read when the matching
    # keyword was not supplied.
    ctx_len = block_size if context_size is None else context_size
    p_drop = dropout if dropout_p is None else dropout_p
    dev = device if target_device is None else target_device

    class Head(nn.Module):
        """A single causal self-attention head."""

        def __init__(self, head_size):
            super().__init__()
            self.key = nn.Linear(n_embd, head_size, bias=False)
            self.query = nn.Linear(n_embd, head_size, bias=False)
            self.value = nn.Linear(n_embd, head_size, bias=False)
            # Lower-triangular mask: position t may only attend to positions <= t.
            self.register_buffer('tril', torch.tril(torch.ones(ctx_len, ctx_len)))
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            _, T, _ = x.shape
            k = self.key(x)
            q = self.query(x)
            # Dot-product scores scaled by 1/sqrt(head_size).
            wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
            # Hide future positions before normalising.
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
            wei = F.softmax(wei, dim=-1)
            wei = self.dropout(wei)
            v = self.value(x)
            return wei @ v

    class MultiHeadAttention(nn.Module):
        """Several heads run side by side, concatenated and projected to n_embd."""

        def __init__(self, num_heads, head_size):
            super().__init__()
            self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
            # The input width is head_size * num_heads, which can be smaller
            # than n_embd when n_embd is not a multiple of num_heads.
            self.proj = nn.Linear(head_size * num_heads, n_embd)
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            out = torch.cat([h(x) for h in self.heads], dim=-1)
            return self.dropout(self.proj(out))

    class FeedForward(nn.Module):
        """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

        def __init__(self, n_embd):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(n_embd, 4 * n_embd),
                nn.ReLU(),
                nn.Linear(4 * n_embd, n_embd),
                nn.Dropout(p_drop),
            )

        def forward(self, x):
            return self.net(x)

    class Block(nn.Module):
        """Attention then feed-forward, each followed by residual add + LayerNorm."""

        def __init__(self, n_embd, n_head):
            super().__init__()
            head_size = n_embd // n_head
            self.sa = MultiHeadAttention(n_head, head_size)
            self.ffws = FeedForward(n_embd)
            self.ln1 = nn.LayerNorm(n_embd)
            self.ln2 = nn.LayerNorm(n_embd)

        def forward(self, x):
            # The layer norm is applied after each residual addition.
            x = self.ln1(x + self.sa(x))
            x = self.ln2(x + self.ffws(x))
            return x

    class GptLanguageModel(nn.Module):
        """Token + position embeddings, a stack of blocks, and a logit head."""

        def __init__(self, vocab_size):
            super().__init__()
            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
            self.position_embedding_table = nn.Embedding(ctx_len, n_embd)
            self.block = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
            self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
            self.lm_head = nn.Linear(n_embd, vocab_size)
            self.apply(self._init_weights)

        def _init_weights(self, module):
            """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

        def forward(self, index, targets=None):
            """Return ``(logits, loss)`` for a 2-D tensor of token ids.

            Without ``targets`` the logits keep shape (B, T, vocab) and the
            loss is ``None``. With ``targets`` the logits are flattened to
            (B * T, vocab) and the cross-entropy loss is returned with them.
            """
            B, T = index.shape
            tok_emb = self.token_embedding_table(index)
            # Build the position ids on the input's own device rather than a
            # module-level global.
            pos_emb = self.position_embedding_table(
                torch.arange(T, device=index.device))
            x = tok_emb + pos_emb
            x = self.block(x)
            x = self.ln_f(x)
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B * T, C)
                targets = targets.view(B * T)
                loss = F.cross_entropy(logits, targets)
            return logits, loss

        def generate(self, index, max_new_tokens):
            """Append ``max_new_tokens`` sampled tokens to ``index`` and return it."""
            for _ in range(max_new_tokens):
                # The position table and causal mask only cover ctx_len
                # positions, so condition on the most recent ctx_len tokens;
                # feeding the whole sequence fails once it grows past that.
                index_cond = index[:, -ctx_len:]
                logits, _ = self(index_cond)
                logits = logits[:, -1, :]  # distribution for the next token only
                probs = F.softmax(logits, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)
                index = torch.cat((index, index_next), dim=1)
            return index

    return GptLanguageModel(vocab_size).to(dev)

# Define data functions
def get_random_chunk(split):
    """Read a random window of a split's text file and return it as token ids.

    ``'train'`` selects ``train_split.txt``; any other value selects
    ``val_split.txt``. The window is ``block_size * batch_size - 1`` bytes
    long; it is decoded as UTF-8 with undecodable bytes dropped, carriage
    returns are removed, and the text is mapped through the global ``encode``.

    Returns:
        A 1-D ``torch.long`` tensor of token ids.
    """
    if split == 'train':
        filename = "train_split.txt"
    else:
        filename = "val_split.txt"

    span = block_size * batch_size
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Pick a start offset that leaves room for a whole window.
            start_pos = random.randint(0, len(mm) - span)
            mm.seek(start_pos)
            raw = mm.read(span - 1)

    text = raw.decode('utf-8', errors='ignore').replace('\r', '')
    return torch.tensor(encode(text), dtype=torch.long)

def get_batch(split):
    """Draw ``batch_size`` (input, target) windows from a fresh random chunk.

    Each target window is its input window shifted one token to the right.

    Returns:
        ``(x, y)``: two stacked tensors of ``block_size``-long windows, both
        moved to the global ``device``.
    """
    chunk = get_random_chunk(split)
    starts = torch.randint(len(chunk) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(chunk[start:start + block_size])
        targets.append(chunk[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, eval_iters):
    """Estimate the mean loss on the train and validation splits.

    The model is put in eval mode for the measurement and its previous
    train/eval mode is restored afterwards, so calling this in the middle of
    training does not leave dropout switched off.

    Args:
        model: Module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict mapping ``'train'`` and ``'val'`` to a 0-d tensor holding the
        mean loss over ``eval_iters`` batches.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)
    return out

# Load vocabulary
# The vocabulary is the sorted set of distinct characters found in vocab.txt.
chars = ""
with open('vocab.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")

# Character <-> integer id lookup tables; an id is the character's rank in chars.
string_to_int = {}
int_to_string = {}
for i, ch in enumerate(chars):
    string_to_int[ch] = i
    int_to_string[i] = ch


def encode(s):
    """Return the list of integer ids for the characters of ``s``."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in ``l``."""
    return ''.join([int_to_string[i] for i in l])

# Initialize results
best_loss = float('inf')
best_params = {}

# Grid search over hyperparameters
for params in itertools.product(*hyperparameter_grid.values()):
    # Unpacking rebinds the module-level batch_size / block_size that
    # create_model, get_random_chunk and get_batch read as globals.
    batch_size, block_size, max_iters, learning_rate, n_embd, n_head, n_layer = params
    print(f"\nTesting combination: {params}")

    model = create_model(vocab_size, n_embd, n_head, n_layer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # Evaluation curves for this combination only (reset on every combination).
    train_losses = []
    val_losses = []
    skip_combination = False

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iters):
        if step % eval_iters == 0:
            losses = estimate_loss(model, eval_iters)
            train_losses.append(losses['train'])
            val_losses.append(losses['val'])
            print(f"step: {step}, train loss: {losses['train']}, val loss: {losses['val']}")

            # Early exit: give up on a combination whose estimated train loss
            # is still above 3 at the second evaluation (step == eval_iters).
            if step == eval_iters and losses['train'] > 3:
                print("Skipping to next combination due to high train loss.")
                skip_combination = True
                break

            # estimate_loss puts the model in eval mode; switch back so
            # dropout is active for the training steps that follow.
            model.train()

        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    if skip_combination:
        continue

    final_loss = loss.item()
    print(f"Final loss: {final_loss}")

    # NOTE(review): the ranking uses the loss of the last training mini-batch,
    # which is noisy; the averaged validation loss may be a better criterion.
    if final_loss < best_loss:
        best_loss = final_loss
        # The grid's keys are in the same order as the values in `params`.
        best_params = dict(zip(hyperparameter_grid, params))

# Save best parameters
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)

print(f"Best parameters: {best_params} with loss: {best_loss}")

# Plotting losses
# train_losses / val_losses are re-created for every combination, so only the
# curves of the last combination tried are available here.
# One point was recorded every eval_iters steps; plot against the real step
# number so the 'Iteration' axis is not just the evaluation index.
eval_steps = [i * eval_iters for i in range(len(train_losses))]
plt.plot(eval_steps, [float(v) for v in train_losses], label='Train Loss')
plt.plot(eval_steps, [float(v) for v in val_losses], label='Validation Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Losses')
plt.show()


cuda
Vocabulary size: 32171

Testing combination: (16, 64, 200, 0.001, 256, 4, 6)
step: 0, train loss: 10.446089744567871, val loss: 10.447775840759277
step: 50, train loss: 3.255851984024048, val loss: 3.2606325149536133
Skipping to next combination due to high train loss.

Testing combination: (16, 64, 200, 0.001, 256, 4, 10)
step: 0, train loss: 10.464710235595703, val loss: 10.464530944824219
step: 50, train loss: 2.9358229637145996, val loss: 2.8663995265960693
step: 100, train loss: 2.7313382625579834, val loss: 2.751823663711548
step: 150, train loss: 2.7066102027893066, val loss: 2.6303164958953857
Final loss: 2.55635666847229

Testing combination: (16, 64, 200, 0.001, 256, 4, 12)
step: 0, train loss: 10.466771125793457, val loss: 10.461652755737305
step: 50, train loss: 2.9678618907928467, val loss: 3.0173540115356445
step: 100, train loss: 2.8077785968780518, val loss: 2.823378086090088
step: 150, train loss: 2.812192678451538, val loss: 2.723118305206299
Final loss: 2.984623

KeyboardInterrupt: 

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import matplotlib.pyplot as plt
import torch.optim as optim
import itertools

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

# Hyperparameter grid
# Every combination of the lists below is tried. Key order matters: the
# grid-search loop unpacks each combination positionally in this order.
hyperparameter_grid = dict(
    batch_size=[64],
    block_size=[64, 128, 256],
    max_iters=[200],
    learning_rate=[1e-3],
    n_embd=[256, 384, 512],
    n_head=[4, 8, 12],
    n_layer=[6, 10, 12],
)

# Settings shared by every combination.
dropout = 0.2      # probability handed to the nn.Dropout layers
eval_iters = 50    # batches per loss estimate; also the evaluation interval

# Function to create model with given hyperparameters
def create_model(vocab_size, n_embd, n_head, n_layer, *,
                 context_size=None, dropout_p=None, target_device=None):
    """Build a decoder-only transformer language model and move it to a device.

    The keyword-only arguments default to the module-level ``block_size``,
    ``dropout`` and ``device``, so existing four-argument calls behave as
    before; passing them explicitly makes the factory independent of those
    globals.

    Args:
        vocab_size: Number of token ids; sizes the token embedding and the
            output logits.
        n_embd: Width of the embeddings and of every block's output.
        n_head: Number of attention heads in each block.
        n_layer: Number of stacked blocks.
        context_size: Longest sequence the model accepts (rows of the
            position embedding and side of the causal mask).
        dropout_p: Probability used by every ``nn.Dropout`` layer.
        target_device: Device the finished model is moved to.

    Returns:
        The constructed ``GptLanguageModel``.
    """
    # Resolve the configuration once. A global is only read when the matching
    # keyword was not supplied.
    ctx_len = block_size if context_size is None else context_size
    p_drop = dropout if dropout_p is None else dropout_p
    dev = device if target_device is None else target_device

    class Head(nn.Module):
        """A single causal self-attention head."""

        def __init__(self, head_size):
            super().__init__()
            self.key = nn.Linear(n_embd, head_size, bias=False)
            self.query = nn.Linear(n_embd, head_size, bias=False)
            self.value = nn.Linear(n_embd, head_size, bias=False)
            # Lower-triangular mask: position t may only attend to positions <= t.
            self.register_buffer('tril', torch.tril(torch.ones(ctx_len, ctx_len)))
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            _, T, _ = x.shape
            k = self.key(x)
            q = self.query(x)
            # Dot-product scores scaled by 1/sqrt(head_size).
            wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
            # Hide future positions before normalising.
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
            wei = F.softmax(wei, dim=-1)
            wei = self.dropout(wei)
            v = self.value(x)
            return wei @ v

    class MultiHeadAttention(nn.Module):
        """Several heads run side by side, concatenated and projected to n_embd."""

        def __init__(self, num_heads, head_size):
            super().__init__()
            self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
            # The input width is head_size * num_heads, which can be smaller
            # than n_embd when n_embd is not a multiple of num_heads.
            self.proj = nn.Linear(head_size * num_heads, n_embd)
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            out = torch.cat([h(x) for h in self.heads], dim=-1)
            return self.dropout(self.proj(out))

    class FeedForward(nn.Module):
        """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

        def __init__(self, n_embd):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(n_embd, 4 * n_embd),
                nn.ReLU(),
                nn.Linear(4 * n_embd, n_embd),
                nn.Dropout(p_drop),
            )

        def forward(self, x):
            return self.net(x)

    class Block(nn.Module):
        """Attention then feed-forward, each followed by residual add + LayerNorm."""

        def __init__(self, n_embd, n_head):
            super().__init__()
            head_size = n_embd // n_head
            self.sa = MultiHeadAttention(n_head, head_size)
            self.ffws = FeedForward(n_embd)
            self.ln1 = nn.LayerNorm(n_embd)
            self.ln2 = nn.LayerNorm(n_embd)

        def forward(self, x):
            # The layer norm is applied after each residual addition.
            x = self.ln1(x + self.sa(x))
            x = self.ln2(x + self.ffws(x))
            return x

    class GptLanguageModel(nn.Module):
        """Token + position embeddings, a stack of blocks, and a logit head."""

        def __init__(self, vocab_size):
            super().__init__()
            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
            self.position_embedding_table = nn.Embedding(ctx_len, n_embd)
            self.block = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
            self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
            self.lm_head = nn.Linear(n_embd, vocab_size)
            self.apply(self._init_weights)

        def _init_weights(self, module):
            """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

        def forward(self, index, targets=None):
            """Return ``(logits, loss)`` for a 2-D tensor of token ids.

            Without ``targets`` the logits keep shape (B, T, vocab) and the
            loss is ``None``. With ``targets`` the logits are flattened to
            (B * T, vocab) and the cross-entropy loss is returned with them.
            """
            B, T = index.shape
            tok_emb = self.token_embedding_table(index)
            # Build the position ids on the input's own device rather than a
            # module-level global.
            pos_emb = self.position_embedding_table(
                torch.arange(T, device=index.device))
            x = tok_emb + pos_emb
            x = self.block(x)
            x = self.ln_f(x)
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B * T, C)
                targets = targets.view(B * T)
                loss = F.cross_entropy(logits, targets)
            return logits, loss

        def generate(self, index, max_new_tokens):
            """Append ``max_new_tokens`` sampled tokens to ``index`` and return it."""
            for _ in range(max_new_tokens):
                # The position table and causal mask only cover ctx_len
                # positions, so condition on the most recent ctx_len tokens;
                # feeding the whole sequence fails once it grows past that.
                index_cond = index[:, -ctx_len:]
                logits, _ = self(index_cond)
                logits = logits[:, -1, :]  # distribution for the next token only
                probs = F.softmax(logits, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)
                index = torch.cat((index, index_next), dim=1)
            return index

    return GptLanguageModel(vocab_size).to(dev)

# Define data functions
def get_random_chunk(split):
    """Read a random window of a split's text file and return it as token ids.

    ``'train'`` selects ``train_split.txt``; any other value selects
    ``val_split.txt``. The window is ``block_size * batch_size - 1`` bytes
    long; it is decoded as UTF-8 with undecodable bytes dropped, carriage
    returns are removed, and the text is mapped through the global ``encode``.

    Returns:
        A 1-D ``torch.long`` tensor of token ids.
    """
    if split == 'train':
        filename = "train_split.txt"
    else:
        filename = "val_split.txt"

    span = block_size * batch_size
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Pick a start offset that leaves room for a whole window.
            start_pos = random.randint(0, len(mm) - span)
            mm.seek(start_pos)
            raw = mm.read(span - 1)

    text = raw.decode('utf-8', errors='ignore').replace('\r', '')
    return torch.tensor(encode(text), dtype=torch.long)

def get_batch(split):
    """Draw ``batch_size`` (input, target) windows from a fresh random chunk.

    Each target window is its input window shifted one token to the right.

    Returns:
        ``(x, y)``: two stacked tensors of ``block_size``-long windows, both
        moved to the global ``device``.
    """
    chunk = get_random_chunk(split)
    starts = torch.randint(len(chunk) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(chunk[start:start + block_size])
        targets.append(chunk[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, eval_iters):
    """Estimate the mean loss on the train and validation splits.

    The model is put in eval mode for the measurement and its previous
    train/eval mode is restored afterwards, so calling this in the middle of
    training does not leave dropout switched off.

    Args:
        model: Module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict mapping ``'train'`` and ``'val'`` to a 0-d tensor holding the
        mean loss over ``eval_iters`` batches.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)
    return out

# Load vocabulary
# The vocabulary is the sorted set of distinct characters found in vocab.txt.
chars = ""
with open('vocab.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")

# Character <-> integer id lookup tables; an id is the character's rank in chars.
string_to_int = {}
int_to_string = {}
for i, ch in enumerate(chars):
    string_to_int[ch] = i
    int_to_string[i] = ch


def encode(s):
    """Return the list of integer ids for the characters of ``s``."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in ``l``."""
    return ''.join([int_to_string[i] for i in l])

# Initialize results
best_loss = float('inf')
best_params = {}

# Grid search over hyperparameters
for params in itertools.product(*hyperparameter_grid.values()):
    # Unpacking rebinds the module-level batch_size / block_size that
    # create_model, get_random_chunk and get_batch read as globals.
    batch_size, block_size, max_iters, learning_rate, n_embd, n_head, n_layer = params
    print(f"\nTesting combination: {params}")

    model = create_model(vocab_size, n_embd, n_head, n_layer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # Evaluation curves for this combination only (reset on every combination).
    train_losses = []
    val_losses = []
    skip_combination = False

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(max_iters):
        if step % eval_iters == 0:
            losses = estimate_loss(model, eval_iters)
            train_losses.append(losses['train'])
            val_losses.append(losses['val'])
            print(f"step: {step}, train loss: {losses['train']}, val loss: {losses['val']}")

            # Early exit: give up on a combination whose estimated train loss
            # is still above 3 at the second evaluation (step == eval_iters).
            if step == eval_iters and losses['train'] > 3:
                print("Skipping to next combination due to high train loss.")
                skip_combination = True
                break

            # estimate_loss puts the model in eval mode; switch back so
            # dropout is active for the training steps that follow.
            model.train()

        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    if skip_combination:
        continue

    final_loss = loss.item()
    print(f"Final loss: {final_loss}")

    # NOTE(review): the ranking uses the loss of the last training mini-batch,
    # which is noisy; the averaged validation loss may be a better criterion.
    if final_loss < best_loss:
        best_loss = final_loss
        # The grid's keys are in the same order as the values in `params`.
        best_params = dict(zip(hyperparameter_grid, params))

# Save best parameters
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)

print(f"Best parameters: {best_params} with loss: {best_loss}")

# Plotting losses
# train_losses / val_losses are re-created for every combination, so only the
# curves of the last combination tried are available here.
# One point was recorded every eval_iters steps; plot against the real step
# number so the 'Iteration' axis is not just the evaluation index.
eval_steps = [i * eval_iters for i in range(len(train_losses))]
plt.plot(eval_steps, [float(v) for v in train_losses], label='Train Loss')
plt.plot(eval_steps, [float(v) for v in val_losses], label='Validation Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Losses')
plt.show()


cuda
Vocabulary size: 32171

Testing combination: (64, 64, 200, 0.001, 256, 4, 6)
step: 0, train loss: 10.437539100646973, val loss: 10.436705589294434
step: 50, train loss: 2.767430067062378, val loss: 2.7613794803619385
step: 100, train loss: 2.755141258239746, val loss: 2.7092654705047607
step: 150, train loss: 2.510037899017334, val loss: 2.526036024093628
Final loss: 2.4095523357391357

Testing combination: (64, 64, 200, 0.001, 256, 4, 10)
step: 0, train loss: 10.437516212463379, val loss: 10.439190864562988
step: 50, train loss: 3.1073923110961914, val loss: 3.1381707191467285
Skipping to next combination due to high train loss.

Testing combination: (64, 64, 200, 0.001, 256, 4, 12)
step: 0, train loss: 10.43768310546875, val loss: 10.437600135803223
step: 50, train loss: 3.0319368839263916, val loss: 3.0540802478790283
Skipping to next combination due to high train loss.

Testing combination: (64, 64, 200, 0.001, 256, 8, 6)
step: 0, train loss: 10.434077262878418, val loss: 10.4

KeyboardInterrupt: 

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import matplotlib.pyplot as plt
import torch.optim as optim
import itertools

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

# Hyperparameter grid
# Every combination of the lists below is tried. Key order matters: the
# grid-search loop unpacks each combination positionally in this order.
hyperparameter_grid = dict(
    batch_size=[64],
    block_size=[256],
    max_iters=[200],
    learning_rate=[1e-3],
    n_embd=[256, 384, 512],
    n_head=[4, 8, 12],
    n_layer=[6, 10, 12],
)

# Settings shared by every combination.
dropout = 0.2      # probability handed to the nn.Dropout layers
eval_iters = 50    # batches per loss estimate; also the evaluation interval

# Function to create model with given hyperparameters
def create_model(vocab_size, n_embd, n_head, n_layer, *,
                 context_size=None, dropout_p=None, target_device=None):
    """Build a decoder-only transformer language model and move it to a device.

    The keyword-only arguments default to the module-level ``block_size``,
    ``dropout`` and ``device``, so existing four-argument calls behave as
    before; passing them explicitly makes the factory independent of those
    globals.

    Args:
        vocab_size: Number of token ids; sizes the token embedding and the
            output logits.
        n_embd: Width of the embeddings and of every block's output.
        n_head: Number of attention heads in each block.
        n_layer: Number of stacked blocks.
        context_size: Longest sequence the model accepts (rows of the
            position embedding and side of the causal mask).
        dropout_p: Probability used by every ``nn.Dropout`` layer.
        target_device: Device the finished model is moved to.

    Returns:
        The constructed ``GptLanguageModel``.
    """
    # Resolve the configuration once. A global is only read when the matching
    # keyword was not supplied.
    ctx_len = block_size if context_size is None else context_size
    p_drop = dropout if dropout_p is None else dropout_p
    dev = device if target_device is None else target_device

    class Head(nn.Module):
        """A single causal self-attention head."""

        def __init__(self, head_size):
            super().__init__()
            self.key = nn.Linear(n_embd, head_size, bias=False)
            self.query = nn.Linear(n_embd, head_size, bias=False)
            self.value = nn.Linear(n_embd, head_size, bias=False)
            # Lower-triangular mask: position t may only attend to positions <= t.
            self.register_buffer('tril', torch.tril(torch.ones(ctx_len, ctx_len)))
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            _, T, _ = x.shape
            k = self.key(x)
            q = self.query(x)
            # Dot-product scores scaled by 1/sqrt(head_size).
            wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
            # Hide future positions before normalising.
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
            wei = F.softmax(wei, dim=-1)
            wei = self.dropout(wei)
            v = self.value(x)
            return wei @ v

    class MultiHeadAttention(nn.Module):
        """Several heads run side by side, concatenated and projected to n_embd."""

        def __init__(self, num_heads, head_size):
            super().__init__()
            self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
            # The input width is head_size * num_heads, which can be smaller
            # than n_embd when n_embd is not a multiple of num_heads.
            self.proj = nn.Linear(head_size * num_heads, n_embd)
            self.dropout = nn.Dropout(p_drop)

        def forward(self, x):
            out = torch.cat([h(x) for h in self.heads], dim=-1)
            return self.dropout(self.proj(out))

    class FeedForward(nn.Module):
        """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

        def __init__(self, n_embd):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(n_embd, 4 * n_embd),
                nn.ReLU(),
                nn.Linear(4 * n_embd, n_embd),
                nn.Dropout(p_drop),
            )

        def forward(self, x):
            return self.net(x)

    class Block(nn.Module):
        """Attention then feed-forward, each followed by residual add + LayerNorm."""

        def __init__(self, n_embd, n_head):
            super().__init__()
            head_size = n_embd // n_head
            self.sa = MultiHeadAttention(n_head, head_size)
            self.ffws = FeedForward(n_embd)
            self.ln1 = nn.LayerNorm(n_embd)
            self.ln2 = nn.LayerNorm(n_embd)

        def forward(self, x):
            # The layer norm is applied after each residual addition.
            x = self.ln1(x + self.sa(x))
            x = self.ln2(x + self.ffws(x))
            return x

    class GptLanguageModel(nn.Module):
        """Token + position embeddings, a stack of blocks, and a logit head."""

        def __init__(self, vocab_size):
            super().__init__()
            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
            self.position_embedding_table = nn.Embedding(ctx_len, n_embd)
            self.block = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
            self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
            self.lm_head = nn.Linear(n_embd, vocab_size)
            self.apply(self._init_weights)

        def _init_weights(self, module):
            """Initialise Linear/Embedding weights from N(0, 0.02); zero biases."""
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

        def forward(self, index, targets=None):
            """Return ``(logits, loss)`` for a 2-D tensor of token ids.

            Without ``targets`` the logits keep shape (B, T, vocab) and the
            loss is ``None``. With ``targets`` the logits are flattened to
            (B * T, vocab) and the cross-entropy loss is returned with them.
            """
            B, T = index.shape
            tok_emb = self.token_embedding_table(index)
            # Build the position ids on the input's own device rather than a
            # module-level global.
            pos_emb = self.position_embedding_table(
                torch.arange(T, device=index.device))
            x = tok_emb + pos_emb
            x = self.block(x)
            x = self.ln_f(x)
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B * T, C)
                targets = targets.view(B * T)
                loss = F.cross_entropy(logits, targets)
            return logits, loss

        def generate(self, index, max_new_tokens):
            """Append ``max_new_tokens`` sampled tokens to ``index`` and return it."""
            for _ in range(max_new_tokens):
                # The position table and causal mask only cover ctx_len
                # positions, so condition on the most recent ctx_len tokens;
                # feeding the whole sequence fails once it grows past that.
                index_cond = index[:, -ctx_len:]
                logits, _ = self(index_cond)
                logits = logits[:, -1, :]  # distribution for the next token only
                probs = F.softmax(logits, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)
                index = torch.cat((index, index_next), dim=1)
            return index

    return GptLanguageModel(vocab_size).to(dev)

# Define data functions
def get_random_chunk(split):
    """Return a randomly positioned window of a split file as encoded token ids.

    ``split == 'train'`` selects ``train_split.txt``; any other value selects
    ``val_split.txt``. A window of ``block_size * batch_size - 1`` bytes is
    taken at a random byte offset, decoded as UTF-8 (undecodable bytes are
    dropped, carriage returns removed) and mapped through ``encode`` into a
    1-D ``torch.long`` tensor.
    """
    path = "val_split.txt" if split != 'train' else "train_split.txt"
    window = block_size * batch_size
    with open(path, 'rb') as handle, \
            mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
        offset = random.randint(0, len(mapped) - window)
        # The offset is a raw byte position, so the slice may begin or end in
        # the middle of a multi-byte character; errors='ignore' drops those.
        raw = mapped[offset:offset + window - 1]
    cleaned = raw.decode('utf-8', errors='ignore').replace('\r', '')
    return torch.tensor(encode(cleaned), dtype=torch.long)

def get_batch(split):
    """Draw ``batch_size`` (input, target) windows from a random chunk of ``split``.

    Each input is ``block_size`` consecutive token ids; its target is the
    same window shifted one position to the right. Both stacked tensors are
    moved to ``device`` before being returned as ``(x, y)``.
    """
    chunk = get_random_chunk(split)
    starts = torch.randint(len(chunk) - block_size, (batch_size,)).tolist()
    inputs = []
    labels = []
    for start in starts:
        stop = start + block_size
        inputs.append(chunk[start:stop])
        labels.append(chunk[start + 1:stop + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(labels).to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, eval_iters):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    Args:
        model: module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        eval_iters: number of random batches averaged per split.

    Returns:
        Dict with keys ``'train'`` and ``'val'``, each a 0-d tensor holding
        the mean loss over ``eval_iters`` batches from ``get_batch``.
    """
    # Remember the caller's mode: evaluation must not leave dropout disabled
    # for the training steps that follow.
    was_training = model.training
    model.eval()
    try:
        out = {}
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the previous mode even if a batch raised.
        model.train(was_training)
    return out

# Load vocabulary: every distinct character of vocab.txt, in sorted order,
# becomes one token.
with open('vocab.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")

# Two-way lookup tables between characters and their integer ids.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: i for i, ch in int_to_string.items()}


def encode(s):
    """Return the list of integer ids for the characters of ``s``."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in ``l``."""
    return ''.join(int_to_string[i] for i in l)

# Initialize results
best_loss = float('inf')
best_params = {}
# (steps, train losses, val losses) of the best combination, kept for the plot.
best_curves = None

# A combination is abandoned when, at the first evaluation after step 0,
# its estimated train loss is still above this value.
SKIP_TRAIN_LOSS_ABOVE = 3

# Grid search over hyperparameters
for params in itertools.product(*hyperparameter_grid.values()):
    # These are deliberately module-level names: the model classes and
    # get_batch / get_random_chunk read batch_size and block_size as globals.
    batch_size, block_size, max_iters, learning_rate, n_embd, n_head, n_layer = params
    print(f"\nTesting combination: {params}")

    model = create_model(vocab_size, n_embd, n_head, n_layer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    eval_steps = []
    train_losses = []
    val_losses = []
    skip_combination = False

    for step in range(max_iters):
        if step % eval_iters == 0:
            losses = estimate_loss(model, eval_iters)
            # Evaluation runs in eval mode; make sure dropout is active again
            # before the next optimisation step.
            model.train()
            eval_steps.append(step)
            train_losses.append(float(losses['train']))
            val_losses.append(float(losses['val']))
            print(f"step: {step}, train loss: {losses['train']}, val loss: {losses['val']}")

            if step == eval_iters and losses['train'] > SKIP_TRAIN_LOSS_ABOVE:
                print("Skipping to next combination due to high train loss.")
                skip_combination = True
                break

        xb, yb = get_batch('train')
        logits, loss = model.forward(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    if skip_combination:
        continue

    # Rank combinations by an averaged validation estimate taken after the
    # last training step. A single training mini-batch loss is noisy, is not
    # comparable across batch sizes, and does not measure generalisation.
    final_losses = estimate_loss(model, eval_iters)
    final_train = float(final_losses['train'])
    final_val = float(final_losses['val'])
    eval_steps.append(max_iters)
    train_losses.append(final_train)
    val_losses.append(final_val)
    print(f"Final loss: train {final_train}, val {final_val}")

    if final_val < best_loss:
        best_loss = final_val
        # Keys follow the grid's own order, matching the unpacking above.
        best_params = dict(zip(hyperparameter_grid, params))
        best_curves = (eval_steps, train_losses, val_losses)

# Save best parameters
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)

print(f"Best parameters: {best_params} with loss: {best_loss}")

# Plotting losses of the best combination against the real training step.
# Nothing is drawn when every combination was skipped.
if best_curves is not None:
    plot_steps, plot_train, plot_val = best_curves
    plt.plot(plot_steps, plot_train, label='Train Loss')
    plt.plot(plot_steps, plot_val, label='Validation Loss')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Losses')
    plt.show()


cuda
Vocabulary size: 32171

Testing combination: (64, 256, 200, 0.001, 256, 4, 6)
step: 0, train loss: 10.452496528625488, val loss: 10.453695297241211
step: 50, train loss: 2.702472448348999, val loss: 2.698342800140381


KeyboardInterrupt: 