In [None]:
# Import Modules

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch.optim as optim

In [None]:
# Define a LoRA class

class LoRAAdapter(nn.Module):
    """Low-rank adapter producing an additive update ``scaling * x @ A^T @ B^T``.

    Args:
        hidden_size: Size of the last dimension of the input and of the output.
        r: Rank of the bottleneck (number of rows of ``A`` / columns of ``B``).
        alpha: Numerator of the scaling factor; the update is multiplied by
            ``alpha / r``.

    Attributes:
        A: Down-projection of shape ``(r, hidden_size)``, small random init.
        B: Up-projection of shape ``(hidden_size, r)``, zero init.
    """

    def __init__(self, hidden_size, r=4, alpha=16):
        super().__init__()
        self.r = r              # Rank
        self.alpha = alpha      # To get the scaling factor
        self.scaling = self.alpha / self.r
        # Down-projection: small Gaussian init so gradients can flow into B.
        self.A = nn.Parameter(torch.randn(r, hidden_size) * 0.01)
        # Up-projection: zero init so the adapter's output is exactly zero
        # before training, i.e. the wrapped model starts out unchanged.
        self.B = nn.Parameter(torch.zeros(hidden_size, r))

    def forward(self, x):
        """Return the scaled low-rank update for ``x`` (last dim ``hidden_size``)."""
        down = x @ self.A.T          # (..., hidden_size) -> (..., r)
        up = down @ self.B.T         # (..., r) -> (..., hidden_size)
        return up * self.scaling

In [None]:
# Adding LoRA to model at each layer

def add_lora_to_gpt2(model, r=4, alpha=16):
    """Attach a LoRA adapter to the fused QKV projection of every block.

    For each block in ``model.transformer.h`` a ``LoRAAdapter`` is created,
    a forward hook is registered on ``block.attn.c_attn`` that adds the
    adapter's output to the Q and V slices of the projection output, and the
    adapter is stored as ``block.attn.lora``.

    Args:
        model: Model exposing ``transformer.h`` blocks, each with
            ``attn.embed_dim`` and ``attn.c_attn``.
        r: Adapter rank, forwarded to ``LoRAAdapter``.
        alpha: Adapter scaling numerator, forwarded to ``LoRAAdapter``.

    Note:
        Calling this twice on the same model registers a second set of hooks;
        the function does not remove earlier ones.
    """
    for block in model.transformer.h:                       # Accessing each decoder block
        hidden_size = block.attn.embed_dim
        lora = LoRAAdapter(hidden_size, r, alpha)

        # `lora` and `hidden_size` are bound as default arguments so each hook
        # keeps the values of ITS block; a plain closure would look the names
        # up at call time and see whatever the last loop iteration left behind.
        def hook(module, input, output, lora=lora, hidden_size=hidden_size):
            hidden = input[0]                               # Input that was fed to c_attn
            # c_attn's output is split into three chunks of hidden_size along
            # the last axis, taken here as the Q, K and V projections.
            q, k, v = output.split(hidden_size, dim=-1)
            # NOTE(review): one adapter is shared by Q and V, so both receive
            # the same update. Kept as-is so saved adapter checkpoints keep
            # their layout; the update is computed once instead of twice.
            delta = lora(hidden)
            return torch.cat([q + delta, k, v + delta], dim=-1)

        block.attn.c_attn.register_forward_hook(hook)       # Returned tensor replaces c_attn's output
        block.attn.lora = lora                              # Registers the adapter as a submodule

In [None]:
# Define a Dataset and DataLoader for batching of inputs

class JokeDataset(Dataset):
    """Dataset of tokenised jokes; each item is an ``(input, target)`` pair.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., return_tensors='pt')``
            whose result exposes ``input_ids``.
        jokes: Iterable of strings to tokenise.
        block_size: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, jokes, block_size = 64):
        def encode(text):
            # The tokenizer output carries a leading batch axis; drop it so
            # each stored sample is a single sequence.
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=block_size,
                return_tensors='pt',
            )
            return encoded.input_ids.squeeze(0)

        self.inputs = [encode(text) for text in jokes]

    def __len__(self):
        """Number of tokenised jokes."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the same token tensor twice, as input and as target."""
        sample = self.inputs[idx]
        return sample, sample

def collate_fn(batch, pad_token_id=None):
    """Pad a batch of variable-length sequences and build loss labels.

    Args:
        batch: Sequence of ``(input, target)`` pairs of 1-D token tensors;
            only the first element of each pair is used.
        pad_token_id: Value used to right-pad the inputs. When ``None`` the
            module-level ``tokenizer.pad_token_id`` is used.

    Returns:
        ``(inputs, labels)``: ``inputs`` is the right-padded batch; ``labels``
        is a copy of it with every padded position set to ``-100`` so the
        padding does not contribute to the language-modelling loss.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    sequences = [pair[0] for pair in batch]
    inputs = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_token_id)

    # Mask by sequence length rather than by token value: the pad token may
    # be the same id as a real token (e.g. EOS), which must stay in the loss.
    lengths = torch.tensor([seq.size(0) for seq in sequences], device=inputs.device).unsqueeze(1)
    positions = torch.arange(inputs.size(1), device=inputs.device).unsqueeze(0)
    labels = inputs.masked_fill(positions >= lengths, -100)
    return inputs, labels

In [None]:
# Load the tokenizer and reuse its EOS token as the padding token
# for variable-length sequences.

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the base model and attach a LoRA adapter to every block.
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
add_lora_to_gpt2(model, r=4, alpha=16)

# Freeze every parameter first (the adapters included) ...
for param in model.parameters():
    param.requires_grad_(False)

# ... then re-enable gradients only for the adapter stored on each attention layer.
for block in model.transformer.h:
    block.attn.lora.requires_grad_(True)

In [None]:
# A small dataset of sentences with jokes

# Four-sentence toy corpus used for fine-tuning.
jokes = [
    "Why did the chicken join a band? Because it had the drumsticks!",
    "I told my computer I needed a break, and it said 'No problem, I'll go to sleep.'",
    "Why don't scientists trust atoms? Because they make up everything!",
    "Parallel lines have so much in common. It’s a shame they’ll never meet.",
]

# Tokenise the corpus, then batch it two jokes at a time in shuffled order;
# collate_fn pads each batch to a common length.
dataset = JokeDataset(tokenizer, jokes)
loader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.train()

# Optimise only the parameters that still require gradients
# (everything else was frozen earlier).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(trainable_params, lr=1e-4)

# Fine-tune for 5 epochs, logging the loss of every batch.
for epoch in range(5):
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs, labels=targets)
        loss = outputs.loss            # Loss returned by the model for these labels
        loss.backward()                # Backpropagation
        optimizer.step()
        optimizer.zero_grad()          # Clear gradients before the next batch
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Collect one adapter state dict per block, keyed by block index, and save
# them to disk for later reuse.
lora_state = {}
for i, block in enumerate(model.transformer.h):
    lora_state[f"block_{i}_lora"] = block.attn.lora.state_dict()
torch.save(lora_state, "lora_adapters.pt")


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 0, Loss: 5.4883
Epoch 0, Loss: 5.9421
Epoch 1, Loss: 5.4231
Epoch 1, Loss: 6.0338
Epoch 2, Loss: 6.0228
Epoch 2, Loss: 5.5075
Epoch 3, Loss: 5.8277
Epoch 3, Loss: 5.5118
Epoch 4, Loss: 5.3165
Epoch 4, Loss: 3.6309


In [None]:
# Model evaluation

model.eval()

# Tokenise once and keep the attention mask: because the pad token is the
# EOS token, generate() cannot infer the mask from the ids and warns.
encoded = tokenizer("Why did", return_tensors="pt").to(device)
input_ids = encoded.input_ids
output = model.generate(
    input_ids,
    attention_mask=encoded.attention_mask,
    pad_token_id=tokenizer.eos_token_id,   # set explicitly instead of relying on the fallback
    max_length=40,
    do_sample=True,
)
print(tokenizer.decode(output[0]))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Why did Trump’s candidacy work? Did it work for the voters who cared about him? Did he lose voters or did he just lose people?





I have an


In [None]:
# Get the output using the base model

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
prompt = "Why did the chicken cross the road?"

tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Unmodified pretrained model, used as the baseline for comparison.
original_model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)
original_model.eval()

# Keep the attention mask from the tokenizer: pad and EOS share an id, so
# generate() cannot infer the mask from the ids alone.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = encoded.input_ids
original_output = original_model.generate(
    input_ids,
    attention_mask=encoded.attention_mask,
    pad_token_id=tokenizer.eos_token_id,   # set explicitly instead of relying on the fallback
    max_length=50,
    do_sample=True,
)
original_text = tokenizer.decode(original_output[0], skip_special_tokens=True)

print("\n Original GPT-2 Output:")
print(original_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Original GPT-2 Output:
Why did the chicken cross the road?‬This is an interesting question: Why is there a way that a chicken can cross a road even if the chicken didn‬then, so could an animal not cross a road? Does the dog walk


In [None]:
# Use the LoRA weights and use the same input question on this fine-tuned model

# Rebuild the base model with fresh adapters, then overwrite the adapters
# with the weights saved after fine-tuning.
lora_model = GPT2LMHeadModel.from_pretrained("distilgpt2")
add_lora_to_gpt2(lora_model, r=4, alpha=16)

# Inference only: no parameter needs gradients.
for param in lora_model.parameters():
    param.requires_grad = False

# weights_only=True restricts unpickling to tensors and plain containers,
# which is all the saved adapter dictionary holds.
lora_state = torch.load("lora_adapters.pt", map_location=device, weights_only=True)
for i, block in enumerate(lora_model.transformer.h):
    block.attn.lora.load_state_dict(lora_state[f"block_{i}_lora"])

lora_model.to(device)
lora_model.eval()

# `input_ids` is the single, unpadded prompt tokenised in the previous cell,
# so every position is a real token and the attention mask is all ones.
lora_output = lora_model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    pad_token_id=tokenizer.eos_token_id,   # set explicitly instead of relying on the fallback
    max_length=50,
    do_sample=True,
)
lora_text = tokenizer.decode(lora_output[0], skip_special_tokens=True)

print("\n LoRA Fine-Tuned Output:")
print(lora_text)


  lora_state = torch.load("lora_adapters.pt", map_location = device)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 LoRA Fine-Tuned Output:
Why did the chicken cross the road? And did the chickens even cross the road? And did the chicken fly? Did the chicken cross the road? Did the chickens even cross the road? And did the chicken fly? Did the chicken fly?

