<a href="https://colab.research.google.com/github/saipragna25/Special_topics-Transformers-and-finetuning-with-LLMs-Assignment/blob/main/ST_Transformers_and_finetuning_with_LLMs_A_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import requests


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Downloading the "Alice's Adventures in Wonderland" dataset
url = "https://www.gutenberg.org/files/11/11-0.txt"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents silently training on an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# The file declares itself as UTF-8 and starts with a byte-order mark.
# `response.text` can fall back to ISO-8859-1 when the server sends no charset,
# which turns the BOM and the curly apostrophes into mojibake and pollutes the
# vocabulary. Decode the raw bytes explicitly; "utf-8-sig" also strips the BOM.
text = response.content.decode("utf-8-sig")
# NOTE: `text` still contains the Project Gutenberg header and licence
# boilerplate around the novel itself.

# Displaying basic details about the text
print("length of dataset in characters: ", len(text))
print("\nFirst 1000 characters:\n", text[:1000])

# Unique characters in the text (sorted so the id assignment is deterministic)
chars = sorted(set(text))
vocab_size = len(chars)

# Mapping from characters to integers and vice versa
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


# Encoding and Decoding functions
def encode(s):
    """Return the list of integer ids for the characters of string ``s``."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``."""
    return ''.join(itos[i] for i in l)


# Encoding the entire dataset
data = torch.tensor(encode(text), dtype=torch.long)

# Splitting the data into train and validation sets (first 90% / last 10%)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print("\nUnique characters:", ''.join(chars))
print("\nVocab size:", vocab_size)

length of dataset in characters:  174313

First 1000 characters:
 ï»¿The Project Gutenberg eBook of Aliceâs Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you are located before
using this eBook.

Title: Aliceâs Adventures in Wonderland

Author: Lewis Carroll

Release Date: January, 1991 [eBook #11]
[Most recently updated: October 12, 2020]

Language: English

Character set encoding: UTF-8

Produced by: Arthur DiBianca and David Widger

*** START OF THE PROJECT GUTENBERG EBOOK ALICEâS ADVENTURES IN WONDERLAND ***

[Illustration]




Aliceâs Adventures in Wo

In [None]:
# ---------------------- MODEL DEFINITION ----------------------

class GPTBlock(nn.Module):
    """One post-norm transformer block: masked self-attention, then an MLP.

    Args:
        embed_size: Width of the token representations.
        num_heads: Number of attention heads (``nn.MultiheadAttention``
            requires it to divide ``embed_size``).
    """

    def __init__(self, embed_size, num_heads):
        super().__init__()
        # batch_first=True is essential: the data loader yields batches laid
        # out as (batch, seq), so activations are (batch, seq, embed). With the
        # default sequence-first layout the module would attend across the
        # *samples of a batch* instead of across time steps.
        self.attention = nn.MultiheadAttention(embed_size, num_heads, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.GELU(),
            nn.Linear(4 * embed_size, embed_size)
        )
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x, mask):
        """Apply the block.

        Args:
            x: Activations of shape (batch, seq, embed_size).
            mask: Boolean (seq, seq) attention mask; ``True`` marks positions
                that must NOT be attended to.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The attention weights are discarded, so skip computing them.
        attn_out, _ = self.attention(x, x, x, attn_mask=mask, need_weights=False)
        x = self.ln1(x + attn_out)
        ff_out = self.feed_forward(x)
        return self.ln2(x + ff_out)


class NanoGPT(nn.Module):
    """Small decoder-only transformer language model.

    Args:
        vocab_size: Number of distinct token ids.
        embed_size: Width of the token embeddings / hidden states.
        num_heads: Attention heads per block.
        num_blocks: Number of stacked ``GPTBlock`` layers.

    NOTE(review): no positional embedding is added to the token embeddings;
    consider adding one if the model should use absolute position.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_blocks):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.blocks = nn.ModuleList([GPTBlock(embed_size, num_heads) for _ in range(num_blocks)])
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits (batch, seq, vocab_size)."""
        # The causal mask must be sized by the sequence length (dim 1), not by
        # len(x), which is the batch size.
        seq_len = x.size(1)
        x = self.embedding(x)
        # True above the diagonal: position t may only attend to positions <= t.
        mask = torch.ones(seq_len, seq_len, device=x.device).triu(diagonal=1).bool()
        for block in self.blocks:
            x = block(x, mask)
        return self.fc(x)

In [None]:

# ---------------------- DATA LOADER ----------------------

class TextDataset(Dataset):
    """Sliding-window next-token dataset over a 1-D sequence of token ids.

    Item ``i`` is the pair ``(data[i:i+block_size], data[i+1:i+block_size+1])``,
    i.e. an input window and the same window shifted one step to the right.

    Args:
        data: Indexable 1-D sequence of token ids (e.g. a tensor).
        block_size: Length of every window; must be positive.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """

    def __init__(self, data, block_size):
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # A window needs block_size + 1 tokens (inputs plus the shifted
        # target). Clamp to zero so data that is too short gives an empty
        # dataset instead of a negative length.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        size = len(self)
        if idx < 0:
            idx += size
        # Without this check an out-of-range index silently returns truncated
        # or empty windows, and plain iteration over the dataset never stops.
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of length {size}")
        stop = idx + self.block_size
        return (self.data[idx:stop], self.data[idx + 1:stop + 1])

# Window length (in characters) and number of windows per batch.
BLOCK_SIZE = 128
BATCH_SIZE = 32

# One sliding-window dataset per split.
train_dataset = TextDataset(data=train_data, block_size=BLOCK_SIZE)
val_dataset = TextDataset(data=val_data, block_size=BLOCK_SIZE)

# Only the training windows are shuffled; validation keeps its natural order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Hyperparameters for the LSTM-based model defined in the next cell.
EMBED_SIZE = 256   # width of the character embeddings
HIDDEN_SIZE = 256  # LSTM hidden width; kept equal to EMBED_SIZE for simplicity
NUM_LAYERS = 2     # number of stacked LSTM layers; adjust if desired
# stoi and itos are built from the same character list, so either gives the size.
vocab_size = len(stoi)


In [None]:
# Recurrent (LSTM) character model with dropout and layer normalization.
# Despite the "NanoGPT" prefix in its name it contains no attention layers.
class NanoGPTWithRegularization(nn.Module):
    """Embedding -> dropout -> multi-layer LSTM -> LayerNorm -> linear head.

    Args:
        vocab_size: Number of distinct token ids.
        embed_size: Width of the token embeddings.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout_prob: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout_prob=0.5):
        super().__init__()
        # Parameterised layers are created in the order embedding -> lstm -> fc
        # so that a seeded initialisation stays reproducible.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

        # Regularisation: dropout on the embeddings, LayerNorm on the LSTM output.
        self.dropout = nn.Dropout(p=dropout_prob)
        self.layer_norm = nn.LayerNorm(normalized_shape=hidden_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits (batch, seq, vocab_size)."""
        embedded = self.dropout(self.embedding(x))
        # The LSTM also returns its final (h, c) state, which is not needed here.
        hidden_states = self.lstm(embedded)[0]
        normalized = self.layer_norm(hidden_states)
        return self.fc(normalized)


In [None]:
# ---------------------- TRAINING LOOP ----------------------

# Transformer hyperparameters and optimisation settings.
EMBED_SIZE = 256
NUM_HEADS = 4
NUM_BLOCKS = 2
EPOCHS = 10
LR = 1e-3

# Build the transformer and move it to the selected device in one step.
model = NanoGPT(
    vocab_size=vocab_size,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    num_blocks=NUM_BLOCKS,
).to(device)
# Next-character prediction is treated as classification over the vocabulary.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

def train(model, dataloader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module mapping an input batch to logits whose last dimension
            is the number of classes.
        dataloader: Iterable of ``(inputs, targets)`` tensor pairs.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits[N, C], targets[N])``.

    Returns:
        float: Average of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    # Send batches to wherever the model lives rather than relying on a
    # module-level `device` global.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    total_loss = 0.0
    num_batches = 0
    for inputs, targets in dataloader:
        if model_device is not None:
            inputs, targets = inputs.to(model_device), targets.to(model_device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten to (N, C) / (N,). The class count comes from the logits
        # themselves, and reshape also handles non-contiguous outputs.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / num_batches

def validate(model, dataloader, criterion):
    """Evaluate ``model`` without gradient tracking and return the mean per-batch loss.

    Args:
        model: Module mapping an input batch to logits whose last dimension
            is the number of classes.
        dataloader: Iterable of ``(inputs, targets)`` tensor pairs.
        criterion: Loss taking ``(logits[N, C], targets[N])``.

    Returns:
        float: Average of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    # Send batches to wherever the model lives rather than relying on a
    # module-level `device` global.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            if model_device is not None:
                inputs, targets = inputs.to(model_device), targets.to(model_device)
            outputs = model(inputs)
            # Flatten to (N, C) / (N,). The class count comes from the
            # logits themselves, and reshape handles non-contiguous outputs.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / num_batches

# Per-epoch loss history for both splits.
train_losses, val_losses = [], []

for epoch in range(EPOCHS):
    # One optimisation pass over the training windows ...
    train_loss = train(model, train_loader, optimizer, criterion)
    train_losses.append(train_loss)
    # ... followed by a gradient-free pass over the validation windows.
    val_loss = validate(model, val_loader, criterion)
    val_losses.append(val_loss)
    print(f"Epoch {epoch+1}/{EPOCHS} - Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")


Epoch 1/10 - Train loss: 2.3025, Val loss: 2.9730
Epoch 2/10 - Train loss: 2.2948, Val loss: 3.0051
Epoch 3/10 - Train loss: 2.2933, Val loss: 3.0330
Epoch 4/10 - Train loss: 2.2926, Val loss: 3.0569
Epoch 5/10 - Train loss: 2.2918, Val loss: 3.0573
Epoch 6/10 - Train loss: 2.2914, Val loss: 3.0404
Epoch 7/10 - Train loss: 2.2911, Val loss: 3.0550
Epoch 8/10 - Train loss: 2.2908, Val loss: 3.0433
Epoch 9/10 - Train loss: 2.2904, Val loss: 3.0585
Epoch 10/10 - Train loss: 2.2904, Val loss: 3.0590


Prepare the fine-tuning dataset:

In [None]:
# The fine-tuning windows are drawn from the validation split (val_data).
fine_tune_dataset = TextDataset(data=val_data, block_size=BLOCK_SIZE)
fine_tune_loader = DataLoader(
    dataset=fine_tune_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)


Adjust Training Parameters:

In [None]:
# Fewer epochs and a 10x smaller learning rate than the initial training run.
FINE_TUNE_EPOCHS = 5
FINE_TUNE_LR = 1e-4
# A fresh Adam instance, so optimiser state from the first run is not reused.
optimizer = torch.optim.Adam(params=model.parameters(), lr=FINE_TUNE_LR)


Fine-tuning Training Loop:

In [None]:
# Continue training the transformer on the fine-tuning windows.
for epoch in range(FINE_TUNE_EPOCHS):
    train_loss = train(
        model=model,
        dataloader=fine_tune_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    print(f"Fine-tuning Epoch {epoch+1}/{FINE_TUNE_EPOCHS} - Train loss: {train_loss:.4f}")

Fine-tuning Epoch 1/5 - Train loss: 2.6301
Fine-tuning Epoch 2/5 - Train loss: 2.4223
Fine-tuning Epoch 3/5 - Train loss: 2.3842
Fine-tuning Epoch 4/5 - Train loss: 2.3692
Fine-tuning Epoch 5/5 - Train loss: 2.3608


In [None]:
# Build the LSTM-based model. It starts from freshly initialised weights,
# so the loop below trains it from scratch on the fine-tuning windows.
model_regularized = NanoGPTWithRegularization(
    vocab_size=vocab_size,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
).to(device)

# New optimiser bound to this model's parameters; `criterion` is reused as is.
optimizer = torch.optim.Adam(params=model_regularized.parameters(), lr=1e-3)

# Train for a fixed number of passes over fine_tune_loader.
additional_epochs = 10
for epoch in range(additional_epochs):
    train_loss = train(
        model=model_regularized,
        dataloader=fine_tune_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    print(f"Fine-tuning (Regularized) Epoch {epoch+1}/{additional_epochs} - Train loss: {train_loss:.4f}")


Fine-tuning (Regularized) Epoch 1/10 - Train loss: 0.8341
Fine-tuning (Regularized) Epoch 2/10 - Train loss: 0.1255
Fine-tuning (Regularized) Epoch 3/10 - Train loss: 0.0911
Fine-tuning (Regularized) Epoch 4/10 - Train loss: 0.0816
Fine-tuning (Regularized) Epoch 5/10 - Train loss: 0.0768
Fine-tuning (Regularized) Epoch 6/10 - Train loss: 0.0743
Fine-tuning (Regularized) Epoch 7/10 - Train loss: 0.0722
Fine-tuning (Regularized) Epoch 8/10 - Train loss: 0.0706
Fine-tuning (Regularized) Epoch 9/10 - Train loss: 0.0692
Fine-tuning (Regularized) Epoch 10/10 - Train loss: 0.0682


In [None]:
def generate_text(model, seed_text, gen_length=100, temperature=1.0):
    """Sample a continuation of ``seed_text`` one character at a time.

    Args:
        model: Module mapping token ids of shape (1, seq) to logits of shape
            (1, seq, vocab). It is switched to eval mode and left there.
        seed_text: Non-empty prompt; every character must be a key of ``stoi``.
        gen_length: Number of characters to append to the prompt.
        temperature: Positive softmax temperature; values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        str: ``seed_text`` followed by the ``gen_length`` sampled characters.

    Raises:
        ValueError: If ``seed_text`` is empty or ``temperature`` is not positive.
        KeyError: If ``seed_text`` contains a character missing from ``stoi``.
    """
    if not seed_text:
        raise ValueError("seed_text must contain at least one character")
    if temperature <= 0:
        # Dividing the logits by zero or a negative value gives invalid probabilities.
        raise ValueError(f"temperature must be positive, got {temperature}")
    unknown = sorted(set(seed_text) - set(stoi))
    if unknown:
        raise KeyError(f"seed_text contains characters outside the vocabulary: {unknown!r}")

    model.eval()
    # Put the prompt on the model's device rather than relying on a
    # module-level `device` global.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    with torch.no_grad():
        # Encode seed text as a batch of one sequence
        input_ids = torch.tensor([stoi[c] for c in seed_text], dtype=torch.long).unsqueeze(0)
        if model_device is not None:
            input_ids = input_ids.to(model_device)

        # Generate text: feed the whole sequence so far, sample from the
        # distribution at the last position, append, repeat.
        for _ in range(gen_length):
            outputs = model(input_ids)
            logits = outputs[:, -1, :]
            probs = nn.functional.softmax(logits / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=1)

        # Decode the generated text
        generated_text = ''.join(itos[idx] for idx in input_ids[0].tolist())
    return generated_text

# Sample 1000 characters from the LSTM-based model, starting from the prompt "Alice".
seed = "Alice"
generated_sequence = generate_text(model_regularized, seed, gen_length=1000)
print(generated_sequence)

Alicensed works that can be
freely distributed in machine-readable form accessible by the widest
array of equipment including outdated equipment. Many small donations
($1 to $5,000) are particularly important to maintaining tax exempt
status with the IRS.

The Foundation is committed to complying with the laws regulating
charities and charitable donations in all 50 states of the United
States. Compliance requirements are not uniform and it takes a
considerable effort, much paperwork and many fees to meet and keep up
with these requirements. We do nations to the Project Gutenberg
  Literary Archive Foundation."

* You provide a full refund of any money paid by a user who notifies
  you in writing (or by e-mail) within 30 days of receipt that s/he
  does not agree to the user, processing or hypertext form. However, if you provide access
to or distribute copies of a Project Gutenberg-tm work in a format
other than "Plain Vanilla ASCII" or other format used in the official