In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import math
import random
from torch.optim import lr_scheduler



In [None]:
# Mount Google Drive at /content/drive so that paths under that directory
# can be read and written from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# File on the mounted Drive that receives the model state_dict saved by the
# training loop.
checkpoint_path = '/content/drive/MyDrive/gpt_checkpoint.pth'


In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
# Download the dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

# Read the dataset
with open('input.txt', 'r') as f:
    data = f.read()

print(f"Dataset length: {len(data)} characters")


--2024-12-11 17:29:54--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2024-12-11 17:29:54 (18.1 MB/s) - ‘input.txt.3’ saved [1115394/1115394]

Dataset length: 1115394 characters


In [None]:
class CharDataset(Dataset):
    """
    Character-level dataset that serves (input, target) windows of a text.

    Each sample is a pair of LongTensors of length `block_size`; the target is
    the input shifted one character to the right.

    Args:
        data: the text to serve.
        block_size: number of characters in each input window.
        chars: optional iterable of characters defining the vocabulary, in
            index order. Pass the same value to several datasets (for example
            train and validation splits) so that they all map characters to
            the same indices. When omitted, the vocabulary is the sorted set
            of characters found in `data`.

    Raises:
        ValueError: if `chars` is given and `data` contains a character that
            is not in it.
    """

    def __init__(self, data, block_size, chars=None):
        if chars is None:
            # Get all unique characters in the data
            chars = sorted(set(data))
        else:
            # Keep the caller's order, dropping duplicates so stoi/itos stay inverse maps
            chars = list(dict.fromkeys(chars))
            missing = set(data) - set(chars)
            if missing:
                raise ValueError(
                    f"data contains characters missing from the vocabulary: {sorted(missing)!r}"
                )
        self.stoi = {ch: i for i, ch in enumerate(chars)}  # Mapping from character to index
        self.itos = {i: ch for i, ch in enumerate(chars)}  # Mapping from index to character
        self.vocab_size = len(chars)
        self.block_size = block_size
        self.data = data
        self.tokenized_data = [self.stoi[c] for c in data]  # Convert all data to indices

    def __len__(self):
        # Number of full (block_size + 1) windows; never negative, because
        # len() raises on negative values when the text is shorter than a block.
        return max(0, len(self.tokenized_data) - self.block_size)

    def __getitem__(self, idx):
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        # Raising IndexError lets plain iteration over the dataset terminate
        # and prevents silently returning truncated windows.
        if not 0 <= idx < n_samples:
            raise IndexError("CharDataset index out of range")
        # Get a chunk of (block_size + 1) characters
        chunk = self.tokenized_data[idx:idx + self.block_size + 1]
        # Input is first n characters, target is next n characters
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

    def get_vocab_size(self):
        """Return the number of distinct characters in the vocabulary."""
        return self.vocab_size

    def decode(self, idx_list):
        """Convert an iterable of indices (ints or 0-d tensors) back to a string."""
        # int() makes tensor elements usable as dictionary keys
        return ''.join(self.itos[int(i)] for i in idx_list)


In [None]:

# Hyperparameters
block_size = 256  # Length of each input sequence
batch_size = 128  # Number of sequences per batch
augment_seed = 1337  # Seed for the character-noise augmentation below

# Create dataset instance over the full corpus; its vocabulary is the reference
# mapping shared by the train and validation splits.
dataset = CharDataset(data, block_size)
vocab_size = dataset.get_vocab_size()
print(f"Vocab size: {vocab_size}")

chunk_size = 1000
buffer_size = 50

# Cut the text into chunks of `chunk_size` characters, skipping `buffer_size`
# characters after each one; a trailing chunk shorter than `chunk_size` is dropped.
chunks = []
for i in range(0, len(data), chunk_size + buffer_size):
    chunk = data[i:i+chunk_size]
    if len(chunk) == chunk_size:
        chunks.append(chunk)


# First 90% of the chunks train the model, the rest validate it.
split_idx = int(0.9 * len(chunks))
train_chunks = chunks[:split_idx]
val_chunks = chunks[split_idx:]


train_data = ''.join(train_chunks)
val_data = ''.join(val_chunks)

def augment_data(data, prob=0.04, rng=None):
    """
    Return a copy of `data` in which each character is replaced, with
    probability `prob`, by a character drawn uniformly from the characters
    occurring in `data`.

    Args:
        data: text to corrupt.
        prob: per-character replacement probability.
        rng: optional `random.Random` instance; the module-level `random`
            generator is used when omitted.
    """
    if rng is None:
        rng = random
    # Build the alphabet once instead of once per replaced character; sorting
    # makes the draw independent of set iteration order.
    alphabet = sorted(set(data))
    return ''.join(
        rng.choice(alphabet) if rng.random() < prob else char
        for char in data
    )


def _use_shared_vocab(split, reference):
    """Re-encode `split` with the character/index mapping of `reference`."""
    split.stoi = reference.stoi
    split.itos = reference.itos
    split.vocab_size = reference.vocab_size
    split.tokenized_data = [reference.stoi[ch] for ch in split.data]


train_data = augment_data(train_data, rng=random.Random(augment_seed))


train_dataset = CharDataset(train_data, block_size)
val_dataset = CharDataset(val_data, block_size)

# Each CharDataset derives its vocabulary from its own text, so a split that
# lacks some character would otherwise number the remaining characters
# differently from the other split and from `dataset`.
_use_shared_vocab(train_dataset, dataset)
_use_shared_vocab(val_dataset, dataset)


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

Vocab size: 65


In [None]:
class Config:
    """Hyperparameters read by the model classes when they are constructed."""

    # Sizes taken from the data-preparation cell
    vocab_size = vocab_size
    block_size = block_size

    # Architecture
    n_embed = 768  # embedding dimension
    n_head = 8     # attention heads per block
    n_layer = 12   # number of transformer blocks

    # Regularisation
    dropout = 0.3  # dropout rate


In [None]:
class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention in which every position attends only to itself
    and to earlier positions.

    Reads n_embed, n_head, block_size and dropout from `config`; n_embed must
    be divisible by n_head.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embed % config.n_head == 0

        width = config.n_embed
        self.n_head = config.n_head
        self.head_dim = width // config.n_head

        self.key = nn.Linear(width, width)
        self.query = nn.Linear(width, width)
        self.value = nn.Linear(width, width)
        self.attn_drop = nn.Dropout(config.dropout)
        self.proj = nn.Linear(width, width)
        self.proj_drop = nn.Dropout(config.dropout)

        # Lower-triangular ones with two leading singleton dimensions so the
        # mask broadcasts over batch and heads.
        lower_triangle = torch.ones(config.block_size, config.block_size).tril()
        self.register_buffer("mask", lower_triangle[None, None, :, :])

    def _split_heads(self, projected):
        """Reshape (B, T, C) into (B, n_head, T, head_dim)."""
        batch, steps, _ = projected.size()
        return projected.view(batch, steps, self.n_head, self.head_dim).transpose(1, 2)

    def forward(self, x):
        B, T, C = x.size()

        k = self._split_heads(self.key(x))
        q = self._split_heads(self.query(x))
        v = self._split_heads(self.value(x))

        # Similarity of every query with every key, scaled by sqrt(head_dim)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Positions where the mask is zero (later positions) get -inf, so
        # softmax assigns them zero weight
        hidden_positions = self.mask[:, :, :T, :T] == 0
        scores = scores.masked_fill(hidden_positions, float('-inf'))

        weights = self.attn_drop(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back into C channels
        merged = torch.matmul(weights, v).transpose(1, 2).contiguous().view(B, T, C)

        return self.proj_drop(self.proj(merged))


In [None]:
class TransformerBlock(nn.Module):
    """
    One transformer layer: causal self-attention followed by a feed-forward
    network, each applied to a LayerNorm of its input and added back to it.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embed
        hidden = 4 * width

        self.ln1 = nn.LayerNorm(width)
        self.ln2 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)

        feed_forward = [
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.dropout),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        # Residual connection around attention
        attended = self.attn(self.ln1(x))
        x = x + attended
        # Residual connection around the feed-forward network
        transformed = self.mlp(self.ln2(x))
        return x + transformed


In [None]:
class GPT(nn.Module):
    """
    Decoder-only transformer language model over token indices.

    Token embeddings and learned position embeddings are summed, passed
    through `config.n_layer` TransformerBlock modules and a final LayerNorm,
    then projected to `config.vocab_size` logits per position.

    Args:
        config: object exposing vocab_size, n_embed, block_size, dropout and
            n_layer (plus whatever TransformerBlock reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embed)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embed)
        self.dropout = nn.Dropout(config.dropout)

        self.blocks = nn.Sequential(*[TransformerBlock(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embed)
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size)

        self.apply(self._init_weights)

        print(f"Number of parameters: {sum(p.numel() for p in self.parameters())}")

    def _init_weights(self, module):
        """Draw Linear and Embedding weights from N(0, 0.02^2); biases are left untouched."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: LongTensor of token indices, shape (B, T) with
                T <= config.block_size.
            targets: optional LongTensor with the same shape as `idx`.

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is the
            mean cross-entropy over all positions, or None without targets.

        Raises:
            ValueError: if T exceeds config.block_size.
        """
        B, T = idx.size()
        # Fail early with a readable message instead of an out-of-range lookup
        # in the position embedding table.
        if T > self.config.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.config.block_size}"
            )

        # Token and position embeddings
        token_embeddings = self.token_embedding_table(idx)
        position_ids = torch.arange(T, device=idx.device).unsqueeze(0)
        position_embeddings = self.position_embedding_table(position_ids)

        x = self.dropout(token_embeddings + position_embeddings)
        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            # Compute cross-entropy loss
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Autoregressively sample `max_new_tokens` tokens after `idx`.

        Sampling runs without gradient tracking and with the model temporarily
        in eval mode (dropout off); the previous train/eval mode is restored
        afterwards.

        Args:
            idx: LongTensor of shape (B, T) holding the prompt token indices.
            max_new_tokens: number of tokens to append.
            temperature: positive divisor applied to the logits before
                softmax; 1.0 leaves the distribution unchanged.
            top_k: when set, sample only among the `top_k` highest-scoring
                tokens at each step.

        Returns:
            LongTensor of shape (B, T + max_new_tokens): prompt plus samples.

        Raises:
            ValueError: if `temperature` is not positive or `top_k` is < 1.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")
        if top_k is not None and top_k < 1:
            raise ValueError("top_k must be at least 1")

        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -self.config.block_size:]  # Ensure input is within block size
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] / temperature  # Focus on the last time step
                if top_k is not None:
                    k = min(top_k, logits.size(-1))
                    # Smallest logit that is still among the k largest, per row
                    kth_largest = torch.topk(logits, k, dim=-1).values[:, -1:]
                    logits = logits.masked_fill(logits < kth_largest, float('-inf'))
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, next_token), dim=1)
        finally:
            self.train(was_training)
        return idx


In [None]:
# Show the docstring attached to GPT.generate (prints None when there is none).
print(getattr(GPT.generate, "__doc__"))


None


In [None]:
# Size the model's vocabulary from the training split's character table, then
# build the network and move it to the selected device.
config = Config()
setattr(config, "vocab_size", len(train_dataset.stoi))
model = GPT(config)
model = model.to(device)

Number of parameters: 85352513


In [None]:
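# Optional: resume from a saved checkpoint. Left disabled. To enable it,
# uncomment the lines below and add `import os` to the imports cell
# (this notebook does not import `os` anywhere).
#def load_model(path, model):
#    if os.path.exists(path):
#        model.load_state_dict(torch.load(path))
#        print(f'Model loaded from {path}')
#    else:
#        print(f'No checkpoint found at {path}')
#    return model

#model = load_model(checkpoint_path, model)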

In [None]:
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-5,
    weight_decay=1e-4,
)


In [None]:

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Scheduler driven by the value passed to scheduler.step(); configured to
# multiply the learning rate by 0.5 when that value stops decreasing.
scheduler = ReduceLROnPlateau(
    optimizer=optimizer,
    patience=2,
    factor=0.5,
    mode='min',
)


In [None]:
from tqdm import tqdm
from torch.amp import autocast, GradScaler

# Mixed precision and loss scaling are only enabled on CUDA, so the cell also
# runs when `device` fell back to the CPU.
use_amp = device.type == 'cuda'
scaler = GradScaler(device.type, init_scale=2.0, enabled=use_amp)
num_epochs = 10
best_val_loss = float('inf')
patience = 2          # Epochs without validation improvement before stopping
trigger_times = 0     # Consecutive epochs without validation improvement
max_grad_norm = 1.0   # Upper bound on the global gradient norm per step
# NOTE(review): loss_fn is never used below; the loss comes from model(x, y),
# which calls F.cross_entropy without label smoothing. Confirm whether label
# smoothing was meant to be applied to the training loss.
loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for x, y in tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        with autocast(device_type=device.type, enabled=use_amp):
            logits, loss = model(x, y)

        scaler.scale(loss).backward()
        # Gradients must be unscaled before clipping so that max_grad_norm is
        # compared against their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Checkpoint the latest weights after every epoch
    torch.save(model.state_dict(), checkpoint_path)
    print(f'The model has been saved to： {checkpoint_path}')

    # Evaluate on validation set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)

            logits, loss = model(x, y)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    scheduler.step(avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0

        # Keep a separate copy of the best weights seen so far
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break

# Training ends on the weights of the last epoch, which early stopping has just
# judged worse than the best ones; continue with the best weights instead.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load('best_model.pth', map_location=device, weights_only=True))


100%|██████████| 7458/7458 [29:59<00:00,  4.14it/s]


模型已保存到 /content/drive/MyDrive/gpt_checkpoint.pth
Epoch 1/10, Training Loss: 2.9658, Validation Loss: 4.3851


100%|██████████| 7458/7458 [30:00<00:00,  4.14it/s]


模型已保存到 /content/drive/MyDrive/gpt_checkpoint.pth
Epoch 2/10, Training Loss: 3.1321, Validation Loss: 4.2487


100%|██████████| 7458/7458 [30:01<00:00,  4.14it/s]


模型已保存到 /content/drive/MyDrive/gpt_checkpoint.pth
Epoch 3/10, Training Loss: 2.9470, Validation Loss: 4.5903


100%|██████████| 7458/7458 [29:59<00:00,  4.14it/s]


模型已保存到 /content/drive/MyDrive/gpt_checkpoint.pth
Epoch 4/10, Training Loss: 3.2772, Validation Loss: 4.1892


100%|██████████| 7458/7458 [30:01<00:00,  4.14it/s]


模型已保存到 /content/drive/MyDrive/gpt_checkpoint.pth
Epoch 5/10, Training Loss: 2.9157, Validation Loss: 4.6900


100%|██████████| 7458/7458 [30:02<00:00,  4.14it/s]


模型已保存到 /content/drive/MyDrive/gpt_checkpoint.pth
Epoch 6/10, Training Loss: 2.4636, Validation Loss: 4.9849
Early stopping!


In [None]:
model.eval()
with torch.no_grad():
    context = "O God, O God!"
    # Encode and decode with train_dataset: its character table is the one
    # used to size the model's vocabulary, so indices stay consistent.
    unknown_chars = sorted(set(context) - set(train_dataset.stoi))
    if unknown_chars:
        raise ValueError(f"context contains characters outside the vocabulary: {unknown_chars!r}")
    context_idx = torch.tensor([train_dataset.stoi[c] for c in context], dtype=torch.long).unsqueeze(0).to(device)
    generated_idx = model.generate(context_idx, max_new_tokens=500)[0].tolist()
    completion = train_dataset.decode(generated_idx)
    print(completion)


O God, O God!
LORK:
Prat that come thou upee? O, who see yin him; all there.

FRIV:
'Thas speast we boer's ancuse Yizes.

FLORD:You, bearqut wath delus .lke feefe! tor beatt Bloord;
Wntagh I on spilege? ther side grom;
This for your fothe both Frim.

HENRY MIO:
Shan well if thow detustutor?

CAMILLLLLO:
Gos to seeees broveds? earje the stes hose:
What your the nim Cuppt and thou ford tate;
Cuchan sir, al let befor is sSrliestifed prowng,
That mist soomet dith.'

CORIOLANUS:
Nless? lik, shisch loom, andgean
O


In [None]:
# Rebind `scheduler` to a cosine-annealing schedule whose T_max is the number
# of batches in num_epochs passes over train_loader.
scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=len(train_loader) * num_epochs,
)


In [None]:
# Clip the gradients currently stored on the model's parameters to a global
# norm of at most 1.0; the value returned by the call is the cell's output.
# NOTE(review): executed outside the training loop, this touches only the
# gradients left over from the last backward pass.
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)


tensor(0.3698, device='cuda:0')

In [None]:
def compute_perplexity(model, data_loader, device=None):
    """
    Compute the perplexity of `model` over all batches of `data_loader`.

    The per-batch mean loss returned by `model(x, y)` is weighted by the
    number of target tokens in the batch, averaged over the whole loader and
    exponentiated. The model is switched to eval mode and left there.

    Args:
        model: module whose call `model(x, y)` returns `(logits, loss)` with
            `loss` being a scalar mean loss tensor.
        data_loader: iterable of `(x, y)` tensor pairs.
        device: device the batches are moved to; defaults to the device of
            the model's first parameter (CPU if it has no parameters).

    Returns:
        float: exp(mean per-token loss).

    Raises:
        ValueError: if `data_loader` yields no tokens.
    """
    if device is None:
        # Follow the model rather than relying on a notebook-level global
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for x, y in data_loader:
            x = x.to(device)
            y = y.to(device)
            _, loss = model(x, y)
            # The loss is a mean over target tokens, so weight it by their count
            n_tokens = y.numel()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError("data_loader yielded no tokens; cannot compute perplexity")

    avg_loss = total_loss / total_tokens
    return math.exp(avg_loss)

# Perplexity of the current model on the validation loader
val_perplexity = compute_perplexity(model=model, data_loader=val_loader)
print(f"Validation Perplexity: {val_perplexity:.2f}")


Validation Perplexity: 146.19


In [None]:
# Write the model's state_dict to a file in the current working directory
torch.save(obj=model.state_dict(), f='gpt_shakespeare.pth')

In [None]:
# Load the model.
# map_location places the tensors on the notebook's device even if the file
# was written from a different one. weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state_dict needs.
model.load_state_dict(torch.load('gpt_shakespeare.pth', map_location=device, weights_only=True))


  model.load_state_dict(torch.load('gpt_shakespeare.pth'))


<All keys matched successfully>