### Install the necessary libraries

In [1]:
!pip install datasets transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Load the necessary libraries

In [27]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, SummarizationPipeline
from transformers import AutoTokenizer, AutoModelWithLMHead, Text2TextGenerationPipeline
import numpy as np
import json

### Load the model and its tokenizer

In [3]:
# Identifier of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = "SEBIS/code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune"
# The tokenizer and the seq2seq model are both loaded from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

### Load the evaluation metric

In [4]:
# BLEU metric object used by compute_metrics during evaluation.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version
# still provides it.
bleu_metric = load_metric("bleu")

### Load dataset

In [61]:
# Read the raw samples. The encoding is given explicitly so the file is decoded
# the same way on every platform instead of using the locale default.
with open("data.json", encoding="utf-8") as f:
  data = json.load(f)

# Parallel lists: codes[i] is the method source text and comments[i] the
# comment text of the same sample.
codes = [sample["method_text"] for sample in data]
comments = [sample["comment_text"] for sample in data]

### Tokenize the dataset

In [62]:
# Maximum token counts passed to the tokenizer in preprocess_function;
# longer sequences are truncated there.
max_input_length = 512
max_target_length = 512
# Keys under which preprocess_function looks up the source and target text
# of an example dict.
source_input = "code"
target_output = "comment"

def preprocess_function(examples):
    """Tokenize the source and target text of `examples`.

    The source text is read from ``examples[source_input]`` and the target
    text from ``examples[target_output]``. Both are truncated to their
    respective maximum lengths.

    Returns:
        The tokenizer output for the source text, with the target token ids
        stored under the ``"labels"`` key.
    """
    source_text = examples[source_input]
    target_text = examples[target_output]

    encoded = tokenizer(
        source_text,
        max_length=max_input_length,
        truncation=True,
    )

    # Targets are tokenized inside the tokenizer's target-mode context.
    with tokenizer.as_target_tokenizer():
        encoded_target = tokenizer(
            target_text,
            max_length=max_target_length,
            truncation=True,
        )

    encoded["labels"] = encoded_target["input_ids"]
    return encoded

In [63]:
def get_tokenized_datasets(codes, comments, train, val, test):
  """Tokenize (code, comment) pairs and split them sequentially.

  No shuffling is performed: the first `train` fraction of the samples forms
  the training split, the next `val` fraction the validation split and the
  following `test` fraction the test split.

  Args:
    codes: sequence of source-code strings.
    comments: sequence of comment strings, aligned index-by-index with `codes`.
    train: fraction of the samples for the training split.
    val: fraction of the samples for the validation split.
    test: fraction of the samples for the test split.

  Returns:
    A dict with the keys "train", "validation" and "test"; each value is a
    list holding one preprocess_function result per sample.
  """
  no_data = len(codes)

  def boundary(fraction):
    # Summed float fractions can land a hair below the intended value
    # (0.7 + 0.2 + 0.1 == 0.9999999999999999), which a plain int() would
    # truncate to one sample too few. The small epsilon absorbs that
    # round-off, and the clamp keeps a sum slightly above 1 from indexing
    # past the end of the data.
    return min(no_data, int(no_data * fraction + 1e-9))

  def tokenize_range(start, stop):
    # Tokenize each sample in [start, stop) individually.
    return [
      preprocess_function({"code": codes[i], "comment": comments[i]})
      for i in range(start, stop)
    ]

  train_end = boundary(train)
  val_end = boundary(train + val)
  test_end = boundary(train + val + test)

  return {
    "train": tokenize_range(0, train_end),
    "validation": tokenize_range(train_end, val_end),
    "test": tokenize_range(val_end, test_end),
  }

In [64]:
# Sequential 80% / 10% / 10% split into train, validation and test.
tokenized_datasets = get_tokenized_datasets(
    codes,
    comments,
    train=0.8,
    val=0.1,
    test=0.1,
)

### Define the training arguments

In [83]:
# Per-device batch size; combined with gradient_accumulation_steps below it
# gives the effective train batch size.
batch_size = 8
# Last path component of the checkpoint identifier, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_input}-to-{target_output}",
    # Evaluate and save once per epoch so the best checkpoint can be restored.
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    warmup_ratio=0.1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    # Generate sequences during evaluation so compute_metrics can score BLEU.
    predict_with_generate=True,
    fp16=False,
    # Apex optimisation levels are spelled with the letter "O" ("O0".."O3"),
    # not the digit zero.
    fp16_opt_level="O2",
    push_to_hub=False,
    gradient_accumulation_steps=32,
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="eval_bleu",
    greater_is_better=True,
    save_strategy="epoch"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Create the data collator for the inputs/outputs batching

In [84]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

### Create the evaluation metric function

In [85]:
def postprocess_text(preds, labels):
    """Split decoded predictions and references into whitespace tokens.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        A tuple ``(pred_tokens, ref_tokens)``. ``pred_tokens[i]`` is the token
        list of prediction ``i``; ``ref_tokens[i]`` is a one-element list
        containing the token list of reference ``i``.
    """
    pred_tokens = []
    for prediction in preds:
        pred_tokens.append(prediction.strip().split())

    ref_tokens = []
    for reference in labels:
        # Each prediction has exactly one reference, wrapped in its own list.
        ref_tokens.append([reference.strip().split()])

    return pred_tokens, ref_tokens

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (scaled to 0-100) and ``"gen_len"`` (mean number
        of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded, so it is replaced by the pad id before decoding.
    # Labels always need this; predictions get the same treatment because
    # some transformers versions pad gathered generations with -100 as well
    # (a no-op when no -100 is present).
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]*100}

    # Length of each prediction, not counting padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Create the trainer based on the above declarations and functions

In [86]:
# Wire the model, arguments, data splits, collator and metric into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Start training

In [87]:
# Run fine-tuning; with the arguments above, evaluation and checkpointing
# happen once per epoch.
trainer.train()

***** Running training *****
  Num examples = 4292
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 160


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,No log,4.65736,1.2767,16.3302
1,No log,3.504279,5.762,17.9496
2,No log,3.038161,6.8499,17.7593
3,No log,2.816769,7.9873,17.9254
4,No log,2.716589,7.4454,16.8769
5,No log,2.639648,7.5503,16.5896
6,No log,2.605978,7.7567,16.8041
7,No log,2.577969,7.8084,16.709
8,No log,2.556531,7.9767,16.9366
9,No log,2.550891,7.8763,16.7108


***** Running Evaluation *****
  Num examples = 536
  Batch size = 8
Saving model checkpoint to code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune-finetuned-code-to-comment/checkpoint-16
Configuration saved in code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune-finetuned-code-to-comment/checkpoint-16/config.json
Model weights saved in code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune-finetuned-code-to-comment/checkpoint-16/pytorch_model.bin
tokenizer config file saved in code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune-finetuned-code-to-comment/checkpoint-16/tokenizer_config.json
Special tokens file saved in code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune-finetuned-code-to-comment/checkpoint-16/special_tokens_map.json
Copy vocab file to code_trans_t5_small_code_comment_generation_java_transfer_learning_finetune-finetuned-code-to-comment/checkpoint-16/spiece.

TrainOutput(global_step=160, training_loss=4.013628387451172, metrics={'train_runtime': 2823.3089, 'train_samples_per_second': 15.202, 'train_steps_per_second': 0.057, 'total_flos': 3886639061532672.0, 'train_loss': 4.013628387451172, 'epoch': 9.95})

### Create inference pipeline

In [88]:
# Baseline pipeline for comparison. The `model` object was handed to the
# trainer and fine-tuned, so it no longer holds the original weights; a fresh
# copy of the checkpoint is loaded here instead.
original_pipeline = SummarizationPipeline(
    model=AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
    tokenizer=tokenizer,
    device=0
)

In [89]:
# Pipeline wrapping the model held by the trainer after training, placed on device 0.
pipeline = SummarizationPipeline(
    model=trainer.model,
    tokenizer=tokenizer,
    device=0
)

### Make a prediction for a single example from the test dataset

In [90]:
!pip install javalang
import javalang

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [91]:
def tokenize_java_code(code):
    """Return `code` as a single string of space-separated Java tokens.

    The source is tokenized with ``javalang.tokenizer.tokenize`` and the
    ``value`` of every token is joined with a single space.
    """
    token_values = [tok.value for tok in javalang.tokenizer.tokenize(code)]
    return ' '.join(token_values)

In [102]:
# Pick a single sample by its position in `codes`.
# NOTE(review): index 5010 presumably falls in the final (test) portion of the
# sequential split — confirm against the dataset size.
code = codes[5010]

In [104]:
# Convert the raw Java source into the space-separated token form and show it.
tokenized_code = tokenize_java_code(code)
print("Output after tokenization: " + tokenized_code)

Output after tokenization: void debugPrintln ( String msg ) { if ( DEBUG ) { System . err . println ( "XERCES: " + msg ) ; } }


In [103]:
# Reference comment stored for the same sample index, for comparison with the predictions.
print(comments[5010])

* Prints a message to standard error if debugging is enabled. 


In [105]:
# Summarize the tokenized snippet with the pipeline stored in `original_pipeline`.
print(original_pipeline([tokenized_code])) # original model

[{'summary_text': 'Prints a message to System.err .'}]


In [106]:
# Summarize the same snippet with the pipeline built from `trainer.model`.
print(pipeline([tokenized_code])) # fine tuned model

[{'summary_text': 'Prints a message to System.err .'}]
