In [1]:
from datasets import load_dataset

# Load the CodeXGLUE code-to-code translation dataset. Later cells read its
# 'train' / 'validation' splits and its "java" and "cs" columns.
# NOTE(review): the variable shares its name with the `datasets` package this cell
# imports from; a more specific name would avoid confusion, but later cells
# reference `datasets`, so it is kept as-is here.
datasets = load_dataset('code_x_glue_cc_code_to_code_trans')

In [2]:
# Display the loaded object to inspect its splits, columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'java', 'cs'],
        num_rows: 10300
    })
    validation: Dataset({
        features: ['id', 'java', 'cs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'java', 'cs'],
        num_rows: 1000
    })
})

In [3]:
# Identifier passed to AutoTokenizer.from_pretrained when the tokenizer is created.
load = 'Salesforce/codet5p-220m'

# Fixed token lengths used by preprocess_function: Java sources are padded/truncated
# to max_source_len tokens and C# targets to max_target_len tokens.
max_source_len = 100
max_target_len = 100

In [4]:
from transformers import AutoTokenizer

# Tokenizer for the identifier stored in `load`; preprocess_function uses it for
# both the Java inputs and the C# targets.
tokenizer = AutoTokenizer.from_pretrained(load)

In [5]:
def preprocess_function(examples):
    """Turn a batch of Java/C# pairs into fixed-length seq2seq training features.

    Args:
        examples: A batch mapping that provides the "java" (source) and "cs"
            (target) columns.

    Returns:
        The tokenizer output for the Java sources, padded/truncated to
        ``max_source_len``, with an added "labels" entry holding the C# token
        ids padded/truncated to ``max_target_len``. Every padding id in the
        labels is replaced by -100 (the default ``ignore_index`` of PyTorch's
        cross-entropy loss) so padded positions are presumably excluded from
        the training loss.
    """
    features = tokenizer(
        examples["java"],
        max_length=max_source_len,
        padding="max_length",
        truncation=True,
    )
    target_encoding = tokenizer(
        examples["cs"],
        max_length=max_target_len,
        padding="max_length",
        truncation=True,
    )

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for token_ids in target_encoding["input_ids"]:
        # Swap padding ids for -100 and leave real tokens untouched.
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in token_ids]
        )

    features["labels"] = masked_labels
    return features

# Tokenize every split in batches using 4 worker processes. The original columns
# (taken from the train split) are removed, so only the features returned by
# preprocess_function remain.
tokenized_datasets = datasets.map(
    preprocess_function,
    num_proc=4,
    remove_columns=datasets["train"].column_names,
    batched=True,
)

Map (num_proc=4):   0%|          | 0/10300 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
# Inspect the tokenized train split to confirm which feature columns it now has.
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10300
})

In [7]:
from transformers import AutoConfig, AutoModelForSeq2SeqLM

# Build a T5-base-shaped model from its configuration only: `from_config` creates
# the architecture without loading pretrained weights.
#
# The data is tokenized with `tokenizer` (created from `load`), not with the T5
# tokenizer this config was written for. The special-token ids are therefore
# copied from the tokenizer, so padding and end-of-sequence handling at generation
# time agree with the ids actually present in the training data. Weight shapes are
# untouched, so checkpoints saved by earlier runs can still be loaded.
config = AutoConfig.from_pretrained(
    'google-t5/t5-base',
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # T5 starts decoding from the pad token; keep that convention with this tokenizer.
    decoder_start_token_id=tokenizer.pad_token_id,
)

# Every id the tokenizer can emit must have a row in the embedding table; fail early
# with a clear message instead of an index error in the middle of training.
if len(tokenizer) > config.vocab_size:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens but the model config only "
        f"supports vocab_size={config.vocab_size}."
    )

model = AutoModelForSeq2SeqLM.from_config(config)



In [13]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters. Checkpoints are written under "our_code_trans", at most
# three are kept on disk, and evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="our_code_trans",
    num_train_epochs=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_total_limit=3,
)

# Wire the model, the hyper-parameters and the tokenized splits together: the train
# split drives optimisation and the validation split is used for evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)



In [15]:
# Resume from the saved checkpoint when it exists on disk. On an environment where
# it is absent (fresh clone, or the checkpoint was rotated out by save_total_limit),
# fall back to a fresh run so the notebook still executes top to bottom.
import os

resume_checkpoint = "our_code_trans/checkpoint-10000"
if not os.path.isdir(resume_checkpoint):
    print(f"Checkpoint {resume_checkpoint!r} not found; training from scratch.")
    resume_checkpoint = None

trainer.train(resume_from_checkpoint=resume_checkpoint)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss
8,1.7562,1.663992
9,1.6123,1.574033
10,1.4753,1.484235
11,1.3676,1.425126
12,1.2443,1.375041
13,1.1426,1.325438
14,1.0671,1.281399
15,1.0134,1.256428
16,0.9439,1.234827
17,0.8703,1.212541


KeyboardInterrupt: 