In [None]:
# Hugging Face - Fine-Tuning CodeT5 for Predicting If Statements

""" This notebook finetunes the pre-trained CodeT5 model to predict if statements """

# ------------------------
# 1. Install Required Libraries
# ------------------------
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install transformers datasets evaluate -q

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.5.1
  Downloading https://download.pytorch.org/whl/cu124/torch-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl (908.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.20.1
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.20.1%2Bcu124-cp311-cp311-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.5.1
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.5.1%2Bcu124-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu1

In [None]:
# ------------------------------------------------------------------------
# 2. Load Datasets and Create DatasetDict
# ------------------------------------------------------------------------
import pandas as pd
from datasets import Dataset
from datasets import DatasetDict

def _read_split(csv_path):
    """Read one split's CSV and discard its "Unnamed: 0" column.

    Returns a new DataFrame; the column is dropped without mutating in place.
    """
    return pd.read_csv(csv_path).drop(columns=["Unnamed: 0"])


train_df = _read_split("ft_train_masked.csv")
val_df = _read_split("ft_valid_masked.csv")
test_df = _read_split("ft_test_masked.csv")

# Wrap each DataFrame as a Hugging Face Dataset and group the three splits
# in a DatasetDict (so that it can be passed to the trainer later).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [None]:
# ------------------------------------------------------------------------
# 3. Load Pre-trained Model & Tokenizer
# ------------------------------------------------------------------------
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

model_checkpoint = "Salesforce/codet5-small"

# The pre-trained weights and the tokenizer are both loaded from the same checkpoint.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Register the custom markers <MASK> and <TAB> in the tokenizer vocabulary
# (<TAB> is only needed when working with inputs that contain <TAB>).
tokenizer.add_tokens(["<MASK>", "<TAB>"])

# Resize the embedding matrix so it covers the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(32102, 512)

In [None]:
# ------------------------------------------------------------------------------------------------
# 4. Prepare the fine-tuning dataset using the tokenizer loaded above
# ------------------------------------------------------------------------------------------------

def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and training labels.

    Args:
        examples: a batch (mapping of column name -> list of values) that
            provides the "masked_with_tab" and "target_block" columns.

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        holding the tokenized targets. Both sides are truncated/padded to
        256 tokens. Padding positions in the labels are set to -100 so the
        loss ignores them instead of training the model to predict padding.
    """
    # Alternative input column without <TAB> markers:
    # inputs = examples["masked_no_tab"]
    inputs = examples["masked_with_tab"]
    targets = examples["target_block"]

    # tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

    # tokenize the targets
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    # Mask out padding in the labels: -100 is the index the loss skips, so
    # only real target tokens contribute to training and validation loss.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split; batched=True passes preprocess_function a batch of rows per call.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# ------------------------------------------------------------------------
# 5. Define Training Arguments and Trainer
# ------------------------------------------------------------------------

training_args = TrainingArguments(
    output_dir="./codet5-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",  # creates a checkpoint every epoch (same cadence as evaluation)
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,  # reload the checkpoint with the best eval_loss after training
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    # Stop training once eval_loss has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

  trainer = Trainer(


In [None]:
# ------------------------
# 6. Train the Model
# ------------------------
# Run fine-tuning with the trainer configured above.
trainer.train()

# Save the model and its tokenizer together in a folder called final_model.
final_model_dir = "./final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrzheng02[0m ([33mrzheng02-william-mary[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0321,0.024743
2,0.0228,0.022624
3,0.0208,0.022346
4,0.0172,0.02269


Epoch,Training Loss,Validation Loss
1,0.0321,0.024743
2,0.0228,0.022624
3,0.0208,0.022346
4,0.0172,0.02269
5,0.0119,0.022752


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.json',
 './final_model/merges.txt',
 './final_model/added_tokens.json')

In [None]:
# compress the final model and download locally
!zip -r final_model.zip ./final_model
from google.colab import files
files.download("final_model.zip")

  adding: final_model/ (stored 0%)
  adding: final_model/special_tokens_map.json (deflated 97%)
  adding: final_model/vocab.json (deflated 67%)
  adding: final_model/merges.txt (deflated 54%)
  adding: final_model/training_args.bin (deflated 51%)
  adding: final_model/generation_config.json (deflated 33%)
  adding: final_model/added_tokens.json (deflated 20%)
  adding: final_model/tokenizer_config.json (deflated 94%)
  adding: final_model/config.json (deflated 62%)
  adding: final_model/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>