In [12]:
# !pip install transformers[sentencepiece] datasets
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoTokenizer
import numpy as np
import warnings
import os
import logging
from transformer_block import Transformer
# Keep cell output short: silence Python warnings and every log record at
# WARNING level or below.
# NOTE(review): this also hides useful diagnostics (deprecations, tokenizer
# complaints about unknown arguments) — consider relaxing while debugging.
warnings.filterwarnings('ignore')
logging.disable(logging.WARNING)
# Debugging aid: report CUDA errors at the call that caused them.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Local cache directories for downloaded datasets and pretrained files.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
save_path = r'D:\00mydataset\huggingface dataset'
model_path = r'D:\00mydataset\huggingface model'

# Maximum number of tokens kept per sentence when tokenizing.
max_length = 512
# Custom model from transformer_block. The two 59514 arguments presumably are the
# source/target vocabulary sizes of `model_checkpoint` — TODO confirm against
# Transformer's signature.
model = Transformer(59514, 59514, 784, 7, 2)
raw_datasets = load_dataset('news_commentary', 'en-fr', cache_dir=save_path)
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# Load the tokenizer that belongs to `model_checkpoint` (previously an unrelated
# English-only 'bert-base-uncased' tokenizer was loaded and `model_checkpoint`
# was never used). `return_tensors` is an option of the tokenizer *call*, not of
# `from_pretrained`, so it is dropped here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir=model_path)
# 90% train / 10% test split with a fixed seed for reproducibility.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)


Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of English→French translation pairs.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts,
            each with an ``"en"`` (source) and an ``"fr"`` (target) string.

    Returns:
        The tokenizer output for the English sentences, with an extra
        ``"labels"`` entry holding the token ids of the French sentences.
        Sequences are truncated to ``max_length`` tokens and left unpadded.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]

    # No padding here: `pad=` is not a tokenizer argument (the option is
    # `padding`), and the DataCollatorForSeq2Seq used by the data loaders pads
    # each batch itself, filling labels with -100 as the training loop expects.
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits batch-wise; every original column is dropped so only the
# fields returned by preprocess_function remain.
tokenized_datasets = split_datasets.map(
    function=preprocess_function,
    remove_columns=split_datasets["train"].column_names,
    batched=True,
)

# Expose the held-out split under the key "validation" instead of "test".
tokenized_datasets["validation"] = tokenized_datasets.pop("test")

# Collator that assembles the tokenized examples into padded batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

# Make the tokenized splits return torch tensors.
tokenized_datasets.set_format(type="torch")

# Training batches are reshuffled every epoch. `data_collator` is the
# DataCollatorForSeq2Seq(tokenizer, model=model) instance created earlier.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
# Validation batches keep the dataset order.
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Token-level cross-entropy loss and the optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
# The scheduler is stepped once per training batch, so the total number of
# scheduler steps is batches-per-epoch times the number of epochs.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# Linear learning-rate schedule without any warmup phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
import torch  # needed for torch.no_grad(); `import torch.nn as nn` alone does not bind `torch`
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):

    print(f'-----------{epoch} epoch---------')

    # Training
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        # `y` keeps the -100 padding markers so the loss skips padded positions
        # (-100 is CrossEntropyLoss's default ignore_index); the copy fed to the
        # model replaces them with token id 0 so every id is a valid index.
        # NOTE(review): the decoder input and the loss target are the same,
        # unshifted sequence — confirm that Transformer shifts/masks internally.
        y = batch['labels']
        x = batch['input_ids']
        labels = y.masked_fill(y == -100, 0)

        outputs = model(x, labels)
        loss = criterion(outputs.view(-1, outputs.shape[-1]), y.view(-1))
        # Accumulate a plain float: adding the tensor itself would keep every
        # step's autograd graph alive for the whole epoch.
        train_loss += loss.item()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    print(f'{epoch} train loss {train_loss/len(train_dataloader)}')

    # Evaluation
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader):
            y = batch['labels']
            x = batch['input_ids']
            labels = y.masked_fill(y == -100, 0)
            outputs = model(x, labels)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), y.view(-1))
            valid_loss += loss.item()

    # Report the validation average (the training sum was printed here before).
    print(f'{epoch} valid loss {valid_loss/len(eval_dataloader)}')
    