In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

# BERT MLM

In [None]:
# The masked-LM model and its tokenizer are both loaded from the same checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [None]:
# Build 10 random batches of token ids, each of shape (16, 256).
# Ids are sampled in [0, len(tokenizer)) so every id is a valid vocabulary index.
data = []
for _ in range(10):
    # The batch dimension is the integer 16 (the previous `16f` was a syntax error).
    input_ids = torch.randint(0, len(tokenizer), (16, 256))
    data.append(input_ids)


In [None]:
# Move the model to the GPU; rebinding the name keeps the cell output empty.
bert = bert.cuda()

In [None]:
import time

In [None]:
# Time a plain training loop (one optimizer step per batch) and report the
# average wall-clock seconds per step.
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-3)

# CUDA work is queued asynchronously: flush anything still pending so that it
# is not billed to the timed region.
torch.cuda.synchronize()
start = time.time()

for input_ids in data:
    # Copy the batch to the GPU once and use it as both input and labels.
    batch = input_ids.cuda()

    # backward() adds into .grad, so clear the previous step's gradients first.
    optimizer.zero_grad()
    out = bert(input_ids=batch, labels=batch)
    out.loss.backward()
    optimizer.step()

# Wait for the queued kernels to finish before reading the clock.
torch.cuda.synchronize()
end = time.time()
print((end - start) / len(data))

In [None]:
# Extrapolate the measured loop time (10 steps) to one million steps,
# then convert seconds -> hours -> days and truncate to whole days.
total_seconds = 1e6 * (end - start) / 10
total_days = total_seconds / 3600 / 24
print("Time for 1M steps (in days):", int(total_days))

# Gradient accumulation

In [None]:
def f(x):
    """Return the sum over all elements of ``0.5 * x**2``."""
    half_squares = 0.5 * x**2
    return half_squares.sum()

Here we have:
\begin{equation}
\dfrac{\partial}{\partial x_i} f(x) = \dfrac{\partial}{\partial x_i}  \sum_{j=1}^n\dfrac{1}{2} x_j^2 = x_i.
\end{equation}

In [None]:
# Leaf tensor [0., 1., ..., 9.] in double precision that tracks gradients.
x = torch.arange(10, dtype=torch.float64, requires_grad=True)
x

In [None]:
# Start from a fresh leaf tensor so that no earlier gradient is attached to it.
x = torch.arange(10, dtype=torch.float64, requires_grad=True)

# One forward/backward pass: backward() stores df/dx in x.grad.
loss = f(x)
loss.backward()
print("Gradient attached to x:", x.grad)

In [None]:
# Call backward() twice on the same tensor WITHOUT resetting x.grad in between,
# printing x.grad after each call so the two values can be compared.
x = torch.arange(10, dtype=float, requires_grad=True)
print("Initial gradient:", x.grad)
for i in range(2):
    loss = f(x)
    # Deliberately no zeroing of x.grad here: this cell exists to show what
    # backward() does to an already-populated gradient.
    loss.backward()
    # No input() pause: a stdin prompt stalls "Run All" and fails when the
    # notebook is executed non-interactively.
    print(f"Gradient attached to x at step {i+1}:", x.grad)

Gradient accumulation in a training loop can then be implemented as follows:

In [None]:
# NOTE(review): the model reload below is commented out, so the following cells
# use the `bert` object created earlier in the notebook (whose weights have
# already been changed by the optimizer steps above). Uncomment it to start
# again from the pretrained weights.
# bert = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer  = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
batch_size = 2
# Ten random micro-batches of token ids, each of shape (batch_size, 8),
# with ids sampled in [0, len(tokenizer)).
data = [
    torch.randint(0, len(tokenizer), (batch_size, 8))
    for _ in range(10)
]

In [None]:
# Make sure the model lives on the GPU; rebinding the name keeps the cell output empty.
bert = bert.cuda()

In [None]:
# Training loop with gradient accumulation: gradients from
# `gradient_accumulation` consecutive micro-batches are summed in .grad before
# a single optimizer step is taken.
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-3)

# Start from clean gradients: the parameters may still carry .grad values left
# over from earlier backward() calls, which would leak into the first step.
optimizer.zero_grad()

iteration_steps = 0      # number of micro-batches processed
optimization_steps = 0   # number of optimizer.step() calls
gradient_accumulation = 2

for input_ids in data:
    # Copy the micro-batch to the GPU once and use it as both input and labels.
    batch = input_ids.cuda()
    out = bert(input_ids=batch, labels=batch)

    # Scale the loss so the accumulated gradient is the average over the
    # micro-batches; without the division backward() would produce their sum.
    loss = out.loss / gradient_accumulation
    loss.backward()

    iteration_steps += 1

    # Update the weights only once every `gradient_accumulation` micro-batches,
    # then reset the gradients for the next group.
    if iteration_steps % gradient_accumulation == 0:
        optimizer.step()
        optimizer.zero_grad()
        optimization_steps += 1

print("Total number of data iterations:", iteration_steps)
print("Total number of optimization steps:", optimization_steps)

# GPT2-Large memory requirements

Make sure to free the CUDA memory before running this section (for instance, by restarting the notebook kernel).

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the causal language model from the "openai-community/gpt2-large" checkpoint.
gpt = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-large")


In [None]:
# Move the model to the GPU, then count its parameters one tensor at a time.
gpt = gpt.cuda()

num_params = 0
for param in gpt.parameters():
    num_params += param.numel()

print(f"The number of parameters of GPT2-Large is: {num_params}")

In [None]:
# Shell out to nvidia-smi to inspect the GPU state now that the model has been moved to it.
!nvidia-smi

# Attention quadratic
Relaunch the notebook before running this section as well, so that the CUDA memory is freed.

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel
import torch

In [None]:
# Configuration of a small GPT-2 style model, passed as plain keyword arguments.
gpt_config = GPT2Config(
    # Model identity
    model_type="gpt2",
    architectures=["GPT2LMHeadModel"],
    # Size of the network
    n_layer=6,
    n_head=4,
    n_embd=256,
    # Context / position settings
    n_ctx=1024,
    n_positions=8000,
    # Vocabulary and special tokens
    vocab_size=10000,
    bos_token_id=1,
    eos_token_id=10000,
    # Activation, normalisation and initialisation
    activation_function="gelu_new",
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,
    # Dropout probabilities
    attn_pdrop=0.1,
    embd_pdrop=0.1,
    resid_pdrop=0.1,
)

In [None]:
# Instantiate the model directly from gpt_config (no pretrained checkpoint is loaded here).
gpt = GPT2LMHeadModel(gpt_config)

In [None]:
list_lengths = [512, 1024, 2048, 4096]
# One random single-sequence batch of shape (1, length) per entry of
# list_lengths, with token ids sampled in [0, 10000).
data = [torch.randint(0, 10000, (1, seq_len)) for seq_len in list_lengths]

In [None]:
# Move the model to the GPU; rebinding the name keeps the cell output empty.
gpt = gpt.cuda()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Measure the GPU memory held after a forward pass for each sequence length,
# then plot it against the length.

# torch.cuda.memory_allocated() reports bytes; the plot is labelled in MB
# (1 MB taken as 1024**2 bytes here), so convert before storing.
BYTES_PER_MB = 1024 ** 2

memory_cost = []
for input_ids in data:
    torch.cuda.empty_cache()
    # Copy the sequence to the GPU once and use it as both input and labels.
    batch = input_ids.cuda()
    # `out` is kept alive on purpose: the measurement is taken while the
    # result of the forward pass is still referenced.
    out = gpt(input_ids=batch, labels=batch)
    memory_cost.append(torch.cuda.memory_allocated() / BYTES_PER_MB)

plt.figure(figsize=(8, 6))
# Markers make the four measured points visible on the line.
plt.plot(list_lengths, memory_cost, linestyle='-', marker='o')

# Adding labels and title
plt.xlabel('Input Sequence Length')
plt.ylabel('Memory Consumption (MB)')
plt.title('GPT2 Memory Consumption vs Input Sequence Length')

# Adding grid for better readability
plt.grid(True)
plt.show()

In [None]:
%%capture
pip install evaluate;

In [None]:
import nltk
import matplotlib.pyplot as plt
import evaluate


In [None]:
# Load the "bleu" metric through the evaluate library; compute_bleu below relies on this object.
bleu = evaluate.load("bleu")

In [None]:
def compute_bleu(reference, candidate):
    """Score ``candidate`` against a single ``reference`` with the loaded BLEU metric.

    Both texts are handed to the metric as given, as a batch containing one
    prediction with one reference. Only the ``"bleu"`` entry of the metric's
    result is returned.
    """
    result = bleu.compute(predictions=[candidate], references=[[reference]])
    return result["bleu"]


In [None]:

# Example usage: score a candidate sentence that differs from the reference by two words.
reference_text, candidate_text = (
    "The quick brown fox jumps over the lazy dog.",
    "The quick pink fox jumps over the sleeping dog.",
)
bleu_score = compute_bleu(reference=reference_text, candidate=candidate_text)
print(f"BLEU Score: {bleu_score}")