<a href="https://colab.research.google.com/github/skyfuryonline/semeval/blob/master/latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
! pip install datasets



In [17]:
from datasets import Dataset
from transformers import AutoTokenizer

# 示例数据
# data = {
#         "source_locale": ["en","en"],
#         "target_locale": ["fr","fr"],
#         "source": ["How many Star Wars movies has Chewbacca appeared in?","'Good!' said Norbert, 'there's one of my father's spies gone; that leaves only the little cripple Napier.'"],
#         "target": ["Combien de films Star Wars Chewbacca est-il apparu?",'– Bon! dit Norbert, voilà un des espions de mon père parti ; il ne reste plus que le petit boiteux Napier.'],
# }
from datasets import load_dataset

data = load_dataset("opus_books", "en-fr")
data = data['train'].train_test_split(test_size=0.2)

train_data = {
    "source_locale": [],
    "target_locale": [],
    "source":[],
    "target":[],
}
test_data = {
    "source_locale": [],
    "target_locale": [],
    "source":[],
    "target":[],
}

cnt = 1
for item in data['train']:
  for lang,sentence in item['translation'].items():
    if cnt %2 != 0:
      train_data['source_locale'].append(lang)
      train_data['source'].append(sentence)
    else:
      train_data['target_locale'].append(lang)
      train_data['target'].append(sentence)
    cnt += 1

cnt = 1
for item in data['test']:
  for lang,sentence in item['translation'].items():
    if cnt %2 != 0:
      test_data['source_locale'].append(lang)
      test_data['source'].append(sentence)
    else:
      test_data['target_locale'].append(lang)
      test_data['target'].append(sentence)
    cnt += 1



In [18]:
# 创建Dataset对象
train_data = Dataset.from_dict(train_data)
test_data = Dataset.from_dict(test_data)
print(train_data[0])
print(test_data[0])

{'source_locale': 'en', 'target_locale': 'fr', 'source': 'What a stroke was this for poor Jane! who would willingly have gone through the world without believing that so much wickedness existed in the whole race of mankind, as was here collected in one individual.', 'target': 'Quel coup pour la pauvre Jane qui aurait parcouru le monde entier sans s’imaginer qu’il existât dans toute l’humanité autant de noirceur qu’elle en découvrait en ce moment dans un seul homme !'}
{'source_locale': 'en', 'target_locale': 'fr', 'source': '"Not if it were my own brother!" cried d’Artagnan, as if carried away by his enthusiasm.', 'target': "«Non, fût-ce mon frère!» s'écria d'Artagnan comme emporté par l'enthousiasme."}


In [19]:
column_name = train_data.column_names
print(column_name)
# 加载 T5 模型的分词器
model_id = "google-t5/t5-small"  # 使用 T5-small 模型
tokenizer = AutoTokenizer.from_pretrained(model_id)

['source_locale', 'target_locale', 'source', 'target']


In [20]:
# 任务说明和输入格式
prefix = "translate English to French: "

# 预处理函数
def preprocess_function(examples):
    # 为每个句子添加任务描述前缀
    inputs = [prefix + example for example in examples['source']]
    targets = [example for example in examples['target']]
    # 使用 T5 分词器对输入和目标进行编码
    model_inputs = tokenizer(inputs, text_target=targets, max_length=10, truncation=True, padding="max_length")
    return model_inputs

# 预处理数据
tokenized_train = train_data.map(preprocess_function, batched=True,remove_columns=column_name)
tokenized_test = test_data.map(preprocess_function,batched=True,remove_columns=column_name)
# 查看预处理后的数据
print(tokenized_train[0])
print(tokenized_test[0])

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

{'input_ids': [13959, 1566, 12, 2379, 10, 363, 3, 9, 9529, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [12281, 4897, 171, 50, 24380, 8158, 285, 3, 12463, 1]}
{'input_ids': [13959, 1566, 12, 2379, 10, 96, 10358, 3, 99, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [673, 567, 106, 6, 3, 89, 10443, 17, 18, 1]}


In [21]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_id)

In [22]:
! pip install evaluate
! pip install sacrebleu
import evaluate
metric = evaluate.load("sacrebleu")



In [28]:
import numpy as np


def postprocess_text(preds, labels):
   # 确保 preds 和 labels 是列表，并去除空白字符
    preds = [pred.strip() for pred in preds]
    # BLEU 需要 label 是嵌套列表
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [24]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [29]:

tokenized_train_slice =Dataset.from_dict(tokenized_train[0:500])
tokenized_test_slice = Dataset.from_dict(tokenized_test[0:500])

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_opus_books_model",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    # save_total_limit=3,
    num_train_epochs=3,
    # predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
    report_to ="none",
    gradient_checkpointing=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # train_dataset=tokenized_train,
    # eval_dataset=tokenized_test,
    train_dataset=tokenized_train_slice,
    eval_dataset=tokenized_test_slice,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss


(array([[[-10.3125    ,  -7.09375   , -10.046875  , ..., -40.15625   ,
         -40.3125    , -40.3125    ],
        [-21.796875  ,  -7.4492188 ,  -7.0820312 , ..., -44.40625   ,
         -44.59375   , -44.59375   ],
        [-21.109375  , -14.2265625 ,  -8.890625  , ..., -50.90625   ,
         -51.03125   , -51.125     ],
        ...,
        [-31.0625    ,  -8.796875  , -13.1796875 , ..., -48.125     ,
         -48.34375   , -48.34375   ],
        [-29.109375  ,  -4.4960938 ,  -7.3789062 , ..., -42.125     ,
         -42.40625   , -42.34375   ],
        [-30.21875   ,  -4.0664062 ,  -9.3515625 , ..., -45.6875    ,
         -45.78125   , -45.75      ]],

       [[ -8.4453125 ,  -6.5664062 , -10.6640625 , ..., -39.75      ,
         -39.875     , -39.8125    ],
        [-19.390625  ,  -5.515625  , -12.1796875 , ..., -42.5       ,
         -42.6875    , -42.625     ],
        [-19.890625  ,  -4.6640625 ,  -9.8515625 , ..., -40.90625   ,
         -40.96875   , -41.03125   ],
        ...,

KeyboardInterrupt: Interrupted by user

In [None]:
text = "translate English to French: hello"

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer(text, return_tensors="pt").input_ids
inputs = inputs.to("cuda")

In [None]:
from transformers import AutoModelForSeq2SeqLM
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
tokenizer.decode(outputs[0], skip_special_tokens=True)