In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import os
if not os.path.exists("/scratch/sanika"):
    os.makedirs("/scratch/sanika")

os.environ["HF_HOME"] = "/scratch/sanika/"

In [3]:
# Load model directly
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-large")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-large")
model

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [4]:
# load the model (gpt2large)
model = GPT2LMHeadModel.from_pretrained('gpt2-large')

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [None]:
class GPT2LayerSkipModel(nn.Module):
    

In [None]:
import torch
from torch import nn
from transformers import T5ForConditionalGeneration
import inspect

class T5LayerSkipModel(nn.Module):
    def __init__(self, config, decoder_dropout_rates):
        super().__init__()
        self.config = config

        self.model = T5ForConditionalGeneration(config)
        self.encoder = self.model.encoder
        self.decoder = self.model.decoder

        self.decoder_dropout_rates = decoder_dropout_rates

        self.lm_head = torch.nn.Linear(config.d_model, config.vocab_size, bias=False)
    
    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        # Encoder forward pass
        encoder_hidden_states = self.encoder(input_ids, attention_mask=attention_mask)
        encoder_outputs = encoder_hidden_states.last_hidden_state

        decoder_outputs = encoder_outputs

        # Decoder initial hidden states (embeddings of decoder input IDs)        
        all_layer_outputs = []

        # Process each decoder layer
        for i, layer in enumerate(self.decoder.block):
            if torch.rand(1).item() > self.decoder_dropout_rates[i]:

                cache_position = torch.zeros(decoder_input_ids.shape[0], dtype=torch.long).to(input_ids.device)


                layer_outputs = layer(
                    hidden_states = decoder_outputs,
                    attention_mask = decoder_attention_mask,
                    encoder_hidden_states = encoder_outputs,
                    encoder_attention_mask = attention_mask,
                    use_cache=False,
                    cache_position = cache_position
                )
                decoder_outputs = layer_outputs[0]  # Updated hidden states from this layer

            logits = self.lm_head(decoder_outputs)  # Compute logits
            all_layer_outputs.append(logits)

        return all_layer_outputs


In [89]:
config = T5Config.from_pretrained(model_name)
decoder_dropout_rates = [0.1 * (i + 1) for i in range(config.num_layers)]

model = T5LayerSkipModel(config, decoder_dropout_rates)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

weights = torch.linspace(1, len(decoder_dropout_rates), len(decoder_dropout_rates)) / len(decoder_dropout_rates)


In [6]:
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
input_text = "translate English to French: The book is on the table."
decoder_text = "Le livre est sur la table."

inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
targets = tokenizer(decoder_text, return_tensors="pt", padding=True, truncation=True)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
decoder_input_ids = targets["input_ids"]
decoder_attention_mask = targets["attention_mask"]

In [85]:
encoder = model.encoder
decoder = model.decoder

In [11]:
def compute_layer_skip_loss(layer_outputs, targets, criterion, weights):
    total_loss = 0

    for i, logits in enumerate(layer_outputs):
        print(logits.shape)
        loss = criterion(logits.view(-1, logits.size(-1)), targets.view(-1))
        total_loss += weights[i] * loss

    return total_loss

In [90]:
for epoch in range(3):
    model.train()
    optimizer.zero_grad()

    layer_outputs = model(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
    print(targets["input_ids"].shape)
    print(weights.shape)
    loss = compute_layer_skip_loss(layer_outputs, targets["input_ids"], criterion, weights)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

IndexError: too many indices for tensor of dimension 2