In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, AdamW, AutoTokenizer
from datasets import load_dataset

In [5]:
# Load Multi30k dataset
# dataset = load_dataset("bentrevett/multi30k", split="train[:10000]")  # Load only a subset for demonstration
dataset = load_dataset("bentrevett/multi30k")  # Load only a subset for demonstration

Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 4.60M/4.60M [00:01<00:00, 2.46MB/s]
Downloading data: 100%|██████████| 164k/164k [00:00<00:00, 419kB/s]
Downloading data: 100%|██████████| 156k/156k [00:00<00:00, 177kB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [15]:
!pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                   Version
------------------------- ------------------
absl-py                   2.1.0
addict                    2.4.0
aiohttp                   3.9.3
aiosignal                 1.3.1
arrow                     1.3.0
asttokens                 2.4.1
async-timeout             4.0.3
attrs                     23.2.0
backcall                  0.2.0
black                     24.2.0
blinker                   1.7.0
certifi                   2024.2.2
charset-normalizer        2.0.4
chumpy                    0.70
click                     8.1.7
clip                      1.0
colorama                  0.4.6
comm                      0.2.1
ConfigArgParse            1.7
contourpy                 1.2.0
cycler                    0.12.1
dash                      2.15.0
dash-core-components      2.0.0
dash-html-components      2.0.0
dash-table                5.0.0
datasets                  2.18.0
DateTime                  5.4
debugpy                   1.6.7
decorator               

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [8]:
dataset["train"]

Dataset({
    features: ['en', 'de'],
    num_rows: 29000
})

In [3]:
# Initialize T5 tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [7]:
# def tokenize_data(example):
#     source_text = example["en"]
#     target_text = example["de"]
#     tokenized_inputs = tokenizer(
#         source_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     tokenized_targets = tokenizer(
#         target_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     return {
#         "input_ids": tokenized_inputs.input_ids.flatten(),
#         "attention_mask": tokenized_inputs.attention_mask.flatten(),
#         "labels": tokenized_targets.input_ids.flatten(),
#         "labels_attention_mask": tokenized_targets.attention_mask.flatten(),
#     }


# def tokenize_data(example):
#     source_text = example["en"]
#     target_text = example["de"]
#     tokenized_inputs = tokenizer(
#         source_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     tokenized_targets = tokenizer(
#         target_text,
#         padding="max_length",
#         truncation=True,
#         max_length=128,
#         return_tensors="pt"
#     )
#     return {
#         "input_ids": tokenized_inputs.input_ids[0],
#         "attention_mask": tokenized_inputs.attention_mask[0],
#         "labels": tokenized_targets.input_ids[0],
#         "labels_attention_mask": tokenized_targets.attention_mask[0],
#     }


# Tokenize and preprocess data
def tokenize_data(batch):
    src_texts = batch["en"]
    tgt_texts = batch["de"]
    tokenized_batch = tokenizer.prepare_seq2seq_batch(src_texts, tgt_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    return {
        "input_ids": tokenized_batch.input_ids,
        "attention_mask": tokenized_batch.attention_mask,
        "labels": tokenized_batch.labels,
    }

In [11]:
dataset = dataset.map(tokenize_data, batched=True)
train_dataloader = DataLoader(dataset['train'], batch_size=1, shuffle=True)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-4)

# for epoch in range(3):  # Adjust number of epochs as needed
#     for batch in train_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)

#         optimizer.zero_grad()
#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels,
#         )
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

for epoch in range(3):  # Adjust number of epochs as needed
    for batch in train_dataloader:
        # Convert each sequence in the batch to tensors and move them to device
        input_ids = torch.tensor([item for sublist in batch["input_ids"] for item in sublist]).to(device)
        attention_mask = torch.tensor([item for sublist in batch["attention_mask"] for item in sublist]).to(device)
        labels = torch.tensor([item for sublist in batch["labels"] for item in sublist]).to(device)
        
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids.unsqueeze(0),  
            attention_mask=attention_mask.unsqueeze(0),  
            labels=labels.unsqueeze(0),  
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 15.840818405151367


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 3.81 GiB total capacity; 2.92 GiB already allocated; 20.06 MiB free; 2.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save trained model
model.save_pretrained("t5_multi30k_translation_model")

In [4]:
torch.save(model.state_dict(), 'model_weights.pth')