### Part 1: Setup

In [66]:
from transformers import AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from datasets import load_dataset
from datasets import DatasetDict
import torch
import evaluate
from tqdm import tqdm

In [None]:
# 1. Install libraries

#%pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
#%pip install transformers datasets evaluate -q
#%pip install pyarrow tree-sitter-python

In [67]:
# 2. Load dataset 

# Split the dataset into train, test, and validation sections
test_dataset = load_dataset("linyalan/python-bugs-name-noise-1", split="train[0:100]")
train_dataset = load_dataset("linyalan/python-bugs-name-noise-1", split="train[100:800]")
validation_dataset = load_dataset("linyalan/python-bugs-name-noise-1", split="train[800:1000]")

dataset = DatasetDict({
    'test': test_dataset,
    'train': train_dataset,
    'validation': validation_dataset
})
#print(dataset)
#print(dataset["train"])

### Part 2: Existing Model

In [68]:
# 1. Load existing model & tokenizer

model_checkpoint = "Salesforce/codet5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

model.resize_token_embeddings(len(tokenizer))

Embedding(32100, 512)

In [69]:
# 2. Tokenize the dataset

def preprocess_function(dataset):
    inputs = dataset["prompt_code"]
    targets = dataset["correct_code"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train = dataset["train"].map(preprocess_function, batched=True)
valid = dataset["validation"].map(preprocess_function, batched = True)
test = dataset["test"].map(preprocess_function, batched = True)
#print(valid)
#print(train)
#print(test)


In [70]:
# 3. Generate output from existing model

all_inputs = test["prompt_code"]
batch_size = 8
decoded_outputs = []

model2 = model.to('cuda')

for i in tqdm(range(0, len(all_inputs), batch_size)):
    batch = all_inputs[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model2.generate(**inputs, max_length=256)

    # Decode each output
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_outputs.extend(decoded_batch)

100%|██████████| 13/13 [00:15<00:00,  1.22s/it]


In [71]:
# 4. calculate BLEU score for existing model

predictions = decoded_outputs
references = test["correct_code"]

bleu = evaluate.load("bleu")
bleu_score = bleu.compute(references=references, predictions=predictions)
print(bleu_score)

# score: 0.00348

{'bleu': 0.0034824889169004147, 'precisions': [0.5877890173410405, 0.344698388909704, 0.2625289128758674, 0.21284185493460167], 'brevity_penalty': 0.010676183234260015, 'length_ratio': 0.18051389070040433, 'translation_length': 2768, 'reference_length': 15334}


In [72]:
# 5. Example prediction from existing model

print("buggy code:",test["prompt_code"][8])
print("prediction:",decoded_outputs[8])
print("target:",test["correct_code"][8])

buggy code: def child_relationships(self, pid, **kwargs):
    """https://familysearch.org/developers/docs/api/tree/Relationships_to_Children_resource"""
    return self._add_query_params(
        self.person_base + pid / '/child-relationships', kwargs)
prediction: self.person_base
target: def child_relationships(self, pid, **kwargs):
    """https://familysearch.org/developers/docs/api/tree/Relationships_to_Children_resource"""
    return self._add_query_params(
        self.person_base + pid + '/child-relationships', kwargs)


### Part 3: Fine-tuning

In [None]:
# 1. Define fine-tuning training arguments and create Trainer

training_args = TrainingArguments(
    output_dir="./bugfixer-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [83]:
# 2. Train the model

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0103,0.032018
2,0.0063,0.032591
3,0.006,0.029678
4,0.0037,0.029162
5,0.0037,0.028645


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1750, training_loss=0.006436010794980186, metrics={'train_runtime': 282.1898, 'train_samples_per_second': 12.403, 'train_steps_per_second': 6.202, 'total_flos': 236848152576000.0, 'train_loss': 0.006436010794980186, 'epoch': 5.0})

In [84]:
# 3. Save the model and tokenizer so you don't have to retrain

save_path = "./bugfixer-finetuned"

trainer.save_model(save_path)

tokenizer.save_pretrained(save_path)

('./bugfixer-finetuned\\tokenizer_config.json',
 './bugfixer-finetuned\\special_tokens_map.json',
 './bugfixer-finetuned\\vocab.json',
 './bugfixer-finetuned\\merges.txt',
 './bugfixer-finetuned\\added_tokens.json')

In [85]:
# 4. Load saved model and tokenizer if applicable

save_path = "./bugfixer-finetuned"
model = AutoModelForSeq2SeqLM.from_pretrained(save_path)
tokenizer = RobertaTokenizer.from_pretrained(save_path)

In [86]:
# 5. Small test with fine-tuned model

model2 = model.to('cuda')
input_code = test["prompt_code"][0]
print(test["prompt_code"][0])
inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)
outputs = model2.generate(**inputs.to('cuda'), max_length=256)
print(tokenizer.decode(outputs[0]))
model2.eval()

def place_types(self):
    """https://familysearch.org/developers/docs/api/places/Place_Types_resource"""
    return self.places_base / "types"
<pad><s>def place_types(self):
    """https://familysearch.org/developers/docs/api/places/Place_Types_resource"""
    return self.places_base + "types"</s>


T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [87]:
# 6. Full test with fine-tuned model

all_inputs = test["prompt_code"]
batch_size = 8
decoded_outputs = []

for i in tqdm(range(0, len(all_inputs), batch_size)):
    batch = all_inputs[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model2.generate(**inputs, max_length=256)

    # Decode each output
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_outputs.extend(decoded_batch)

100%|██████████| 13/13 [00:47<00:00,  3.66s/it]


In [88]:
# 7. 3 examples of model output

for i in range(3):
    print("Input:",test["prompt_code"][i])
    print("Target:",test["correct_code"][i])
    print(f"Prediction: {decoded_outputs[i]}")

Input: def place_types(self):
    """https://familysearch.org/developers/docs/api/places/Place_Types_resource"""
    return self.places_base / "types"
Target: def place_types(self):
    """https://familysearch.org/developers/docs/api/places/Place_Types_resource"""
    return self.places_base + "types"
Prediction: def place_types(self):
    """https://familysearch.org/developers/docs/api/places/Place_Types_resource"""
    return self.places_base + "types"
Input: def zset_score_pairs(response, **options):
    """
    If ``withscores`` is specified in the options, return the response as
    a list of (value, score) pairs
    """
    if not response and not options['withscores']:
        return response
    score_cast_func = options.get('score_cast_func', float)
    it = iter(response)
    return list(izip(it, imap(score_cast_func, it)))
Target: def zset_score_pairs(response, **options):
    """
    If ``withscores`` is specified in the options, return the response as
    a list of (value,

In [None]:
# 8. Calculate BLEU score for fine-tuned model

predictions = decoded_outputs
references = test["correct_code"]

bleu = evaluate.load("bleu")
bleu_score = bleu.compute(references=references, predictions=predictions)
print(bleu_score)

# bleu score: 0.7142

{'bleu': 0.7141648707151963, 'precisions': [0.9963645806284082, 0.9924037370121366, 0.9888135294635779, 0.9851595130187506], 'brevity_penalty': 0.7208859761257439, 'length_ratio': 0.7534237641841659, 'translation_length': 11553, 'reference_length': 15334}
