# Colab Initialization

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!git clone https://github.com/radistoubalidis/JSRepair.git

!python -m pip install lightning
!pip install datasets
!pip install python-dotenv
!pip install rouge-score
!pip install diff-match-patch

Cloning into 'JSRepair'...
remote: Enumerating objects: 570, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 570 (delta 58), reused 45 (delta 23), pack-reused 484 (from 1)[K
Receiving objects: 100% (570/570), 2.16 MiB | 11.32 MiB/s, done.
Resolving deltas: 100% (390/390), done.
Collecting lightning
  Downloading lightning-2.5.0.post0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Downloading lightning-2.5.0.post0-py3-none-any.whl (

In [3]:
%cd ./JSRepair

/content/JSRepair


# Dependencies

In [4]:
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
from modules.TrainConfig import init_checkpoint, init_logger, Trainer
from modules.models import CodeBertJS
from modules.datasets import CodeBertDataset
from modules.filters import add_labels
from torch.utils.data import DataLoader
from lightning import Trainer as plTrainer
from modules.filters import get_changed_token_indices
from diff_match_patch import diff_match_patch
from typing import List
import matplotlib.pyplot as plt
import os
import sqlite3
import pandas as pd
import numpy as np
import os
import torch
import random

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

In [5]:
HF_DIR = 'microsoft/codebert-base-mlm'
TOKENIZER_MAX_LENGTH = 512
DB_PATH = 'commitpack-datasets.db' if os.path.exists('commitpack-datasets.db') else '/content/drive/MyDrive/Thesis/commitpack-datasets.db'
DB_TABLE = 'commitpackft_classified_train'
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')
VAL_SIZE = 0.2
LOG_PATH = 'logs' if os.path.exists('logs') else '/content/drive/MyDrive/Thesis/logs'
VERSION = int(input('Training version: '))
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
DEBUG = True if int(input('Debug Run (1,0): ')) == 1 else False
BATCH_SIZE = 8 if DEBUG else 64
CPKT_PATH = 'checkpoints' if os.path.exists('checkpoints') else '/content/drive/MyDrive/Thesis/checkpoints'
DROPOUT_RATE = float(input('Type dropout rate for classifier: '))
NUM_EPOCHS = int(input('Type number of train epochs: '))
WITH_LAYER_NORM = True
WITH_ACTIVATION = True
try:
    LEARNING_RATE = float(input("Type initial lr,(default = 1e-3): "))
except:
    LEARNING_RATE = 1e-3
BIMODAL_TRAIN = True if int(input('Combine commit messages with codes (1,0): ')) == 1 else False
NEW_CPKT = True if int(input('Save to new checkpoint path (1,0): ')) == 1 else False

classLabels = {
    "functionality" : 0.,
    "ui-ux" : 0.,
    "compatibility-performance" : 0.,
    "network-security" : 0.,
    "general": 0.
}

num_classes = len(classLabels.keys())

modelSize = HF_DIR.split('/')[-1]
MODEL_DIR = f"CodeBert_{modelSize}_JS_{num_classes}_classes_{TOKENIZER_MAX_LENGTH}MaxL"
con = sqlite3.connect(DB_PATH)

Training version: 602
Load from existing model (type cpkt path if true): 
Debug Run (1,0): 0
Type dropout rate for classifier: 0.5
Type number of train epochs: 10
Type initial lr,(default = 1e-3): 
Combine commit messages with codes (1,0): 0
Save to new checkpoint path (1,0): 1


# Training

## Create Classification Labels
```json
{
    "mobile" : 0,
    "functionality" : 0,
    "ui-ux" : 0,
    "compatibility-performance" : 0,
    "network-security" : 0,
    "general": 0
}

Ένα δείγμα που κατηγοριοποιήθηκε ως σφάλμα λειτουργικότητας(functionality) και ui-ux θα έχει διάνυσμα ταξινόμησης ->
[0,1,1,0,0,0]
```

In [6]:
ds_df = pd.read_sql_query("select * from commitpackft_classified_train", con)
ds_df['class_labels'] = ds_df['bug_type'].apply(lambda bT: add_labels(bT.split(','), classLabels))

ds_df = ds_df[ds_df['bug_type'] != 'mobile']
ds_df = ds_df[ds_df['old_contents'].str.len() > 0]

if DEBUG:
    ds_df = ds_df.sample(20)

## Data Preprocess

### Filter out outlier

In [7]:
def count_comment_lines(sample: str) -> int:
    comment_blocks = []
    start_index = -1
    for i, line in enumerate(sample.splitlines()):
        if line.strip().startswith('/*'):
            start_index = i
        elif line.strip().endswith('*/'):
            comment_blocks.append([start_index, i])
            start_index = -1

    comment_lines_count = sum([c[1]-c[0] for c in comment_blocks])

    for i, line in enumerate(sample.splitlines()):
        if line.strip().startswith('//'):
            comment_lines_count += 1
    return comment_lines_count

ds_df['old_contents_comment_lines_count'] = ds_df['old_contents'].apply(lambda sample: count_comment_lines(sample))
ds_df['new_contents_comment_lines_count'] = ds_df['new_contents'].apply(lambda sample: count_comment_lines(sample))

# Filter out samples where the sum of comment lines increased more than 3 lines
# to prevent excessive masking
ds_df = ds_df[abs(ds_df['old_contents_comment_lines_count'] - ds_df['new_contents_comment_lines_count']) <= 3]
# Filter out samples with more than 10 comment lines
ds_df = ds_df[(ds_df['old_contents_comment_lines_count'] < 10) & (ds_df['new_contents_comment_lines_count'] < 10)]

dmp = diff_match_patch()

def compute_diffs(sample: dict):
    # Compute the differences
    diffs = dmp.diff_main(sample['old_contents'], sample['new_contents'])
    dmp.diff_cleanupSemantic(diffs)
    # Count the changes
    return sum(1 for diff in diffs if diff[0] == 1)  # 0 means no change

ds_df['num_changes'] = ds_df.apply(lambda sample: compute_diffs(sample), axis=1)

# Filter out samples with more than 3 changes in the code
ds_df = ds_df[ds_df['num_changes'] <= 3]

# filter out samples with more than 50 lines of code
ds_df = ds_df[(ds_df['old_contents'].str.count('\n') <= 50) & (ds_df['new_contents'].str.count('\n') <= 50)]

# filter out samples with more than 2 bug types
ds_df = ds_df[ds_df['bug_type'].str.count(',') < 2]

### Mask Input Sequences

In [8]:
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)

def mask(buggy_code: str, correct_code: str, tokenizer: RobertaTokenizer) -> str:
    """Η συναρτηση χρησιμοποιεί τον tokenizer του μοντελου,
    για να μετατρέψει τον κωδικα σε μια λιστα απο word tokens
    πανω στις οποιες εφαρμοζεται συγκριση χαρακτηρα προς χαρακτηρα
    για να βρεθουν τα σημεία στην λιστα που βρισκονταιοι διαφορες τους,
    ώστε να χρησιμοποιηθουν τα συγκεκριμενα στοιχεια της λιστας στον μηχανισμο
    αποκρυψης.
    Σεναριο 1: Αν το συνολο των word tokens που αλλαξαν δεν ειναι
    μεγαλύτερο απο το 1/4 του συνολου των word tokens τοτε εφαρμοζεται
    το στοιχειο μασκα του tokenizer στα word token που αλλαξαν.
    Σεναριο 2: Αν το συνολο των word tokens που αλλαξαν ειναι
    μεγαλύτερο απο το 1/4 του συνολου των word tokens τοτε εφαρμοζεται η μασκα στα word tokens με τυχαιο τροπο.
    Επιστρέφεται ο κώδικας σε μορφη string αφοτου εφαρμόστηκε η τεχνικη
    αποκρυψης στα επιλεγμενα στοιχεια.

    Αν σε ολα τα δειγματα εφαρμοζοταν η τεχνικη αποκρυψης
    με βαση τις διαφορες στον κωδικα θα δημιουργοταν θορυβος
    στο dataset με πολλα outlier δειγματα (π.χ. σε ενα δειγμα που
    υπήρχαν πολλές γραμμές με σχόλια και αφαιρεθηκαν, θα δημιουργοταν
    ενα δειγμα που το συνολο των word token του θα ηταν το στοιχείο αποκρυψης)

    Args:
        buggy_code (str): code before commit
        correct_code (str): code after commit
        tokenizer (RobertaTokenizer): codebert's tokenizer

    Returns:
        _type_: str
    """
    buggy_tokens = tokenizer.tokenize(buggy_code)
    correct_tokens = tokenizer.tokenize(correct_code)
    indices = get_changed_token_indices(buggy_tokens, correct_tokens)
    masked_buggy_tokens = buggy_tokens
    if len(indices) <= len(buggy_tokens) / 4:
        for i1, i2 in indices:
            if abs(i2-i1) == 0:
                masked_buggy_tokens[i1-1] = tokenizer.mask_token
            if abs(i2-i1) == 1:
                masked_buggy_tokens[i1] = tokenizer.mask_token
            else:
                for idx in range(i1,i2):
                    masked_buggy_tokens[idx] = tokenizer.mask_token
    else:
        num_random_masks = random.randint(1, int(len(buggy_tokens) / 4))
        random_indices = random.sample(range(1,len(buggy_tokens)), num_random_masks)
        for ri in random_indices:
            masked_buggy_tokens[ri] = tokenizer.mask_token
    return tokenizer.convert_tokens_to_string(masked_buggy_tokens)

ds_df['masked_old_contents'] = ds_df.apply(lambda row: mask(row['old_contents'], row['new_contents'], tokenizer), axis=1)

### Concatenate Commit Message with the old contents
- This way, the commit message is directly provided as additional context, and the models (T5, Bert) can process both the buggy code and the commit message in a unified manner.
- This approach will allow the model to learn the relationship between the commit message and the changes made to the code.

In [9]:
old_codes = ds_df[['message', 'masked_old_contents', 'class_labels']]
new_codes = ds_df[['message', 'new_contents', 'class_labels']]
if BIMODAL_TRAIN:
    old_codes['input_seq'] = '/* ' + old_codes['message'] + '*/\n' + tokenizer.sep_token + '\n' + old_codes['masked_old_contents']
else:
    old_codes['input_seq'] = old_codes['masked_old_contents'].copy()

new_codes['output_seq'] = new_codes['new_contents'].copy()
TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(old_codes, new_codes, test_size=VAL_SIZE, random_state=42)

print(f"Total training samples: {len(TRAIN_old)}")
print(f"Total validation samples: {len(VAL_old)}")

Total training samples: 5310
Total validation samples: 1328


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  old_codes['input_seq'] = old_codes['masked_old_contents'].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_codes['output_seq'] = new_codes['new_contents'].copy()


## Prepare Model Input

In [10]:
TOKENIZER_MAX_LENGTH = 512

TRAIN_encodings = tokenizer(
    TRAIN_old['input_seq'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_encodings = tokenizer(
    VAL_old['input_seq'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

TRAIN_gt_ids = tokenizer(
    TRAIN_new['output_seq'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
).input_ids

VAL_gt_ids = tokenizer(
    VAL_new['output_seq'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
).input_ids


TRAIN_classes = torch.tensor(TRAIN_old['class_labels'].tolist())
VAL_classes = torch.tensor(VAL_old['class_labels'].tolist())

TRAIN_dataset = CodeBertDataset(encodings=TRAIN_encodings, class_labels=TRAIN_classes, gt_input_ids=TRAIN_gt_ids)
VAL_dataset = CodeBertDataset(encodings=VAL_encodings, class_labels=VAL_classes, gt_input_ids=VAL_gt_ids)


# Class weights
# pos_weight[i] = (Number of negative samples for class i) / (Number of positive samples for class i)
num_samples = TRAIN_classes.size(0)
num_classes = TRAIN_classes.size(1)

pos_counts = torch.sum(TRAIN_classes, dim=0)
neg_counts = num_samples - pos_counts
class_weights = neg_counts / (pos_counts + 1e-6)
class_weights = class_weights.numpy()

## Initialize Training Settings

In [11]:
logger = init_logger(log_path=LOG_PATH, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, model_dir=MODEL_DIR, version=VERSION, targetMetric='val_auxilary_loss')
if DEBUG:
    trainer = Trainer(checkpoint=checkpoint,logger=logger,debug=DEBUG, num_epochs=NUM_EPOCHS)
else:
    trainer = Trainer(checkpoint=checkpoint,logger=logger,debug=DEBUG, num_epochs=NUM_EPOCHS, precision='32-true')

if len(LOAD_FROM_CPKT) > 0 and  os.path.exists(LOAD_FROM_CPKT):
    model = CodeBertJS.load_from_checkpoint(
        LOAD_FROM_CPKT,
        class_weights=class_weights,
        num_classes=num_classes,
        dropout_rate=DROPOUT_RATE,
        with_activation=WITH_ACTIVATION,
        with_layer_norm=WITH_LAYER_NORM,
        tokenizer=tokenizer,
        lr=LEARNING_RATE
    )
else:
    model = CodeBertJS(
        class_weights=class_weights,
        num_classes=num_classes,
        dropout_rate=DROPOUT_RATE,
        with_activation=WITH_ACTIVATION,
        with_layer_norm=WITH_LAYER_NORM,
        tokenizer=tokenizer
    )
model.codebert.train()
model.classifier.train()

if DEBUG:
    model.to('cpu')

dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE,num_workers=14, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=BATCH_SIZE, num_workers=14)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Run Training

In [12]:
if len(LOAD_FROM_CPKT) > 0 and os.path.exists(LOAD_FROM_CPKT) and not NEW_CPKT:
    trainer.fit(
        model,
        train_dataloaders=dataloader,
        val_dataloaders=val_dataloader,
        ckpt_path=LOAD_FROM_CPKT
    )
else:
    trainer.fit(
        model,
        train_dataloaders=dataloader,
        val_dataloaders=val_dataloader
    )

INFO: You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:lightning.pytorch.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /content/drive/MyDrive/Thesis/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0

Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 0, global step 83: 'val_auxilary_loss' reached 2.94652 (best 2.94652), saving model to '/content/drive/MyDrive/Thesis/checkpoints/CodeBert_codebert-base-mlm_JS_5_classes_512MaxL_v602.ckpt' as top 2
INFO:lightning.pytorch.utilities.rank_zero:Epoch 0, global step 83: 'val_auxilary_loss' reached 2.94652 (best 2.94652), saving model to '/content/drive/MyDrive/Thesis/checkpoints/CodeBert_codebert-base-mlm_JS_5_classes_512MaxL_v602.ckpt' as top 2


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 1, global step 166: 'val_auxilary_loss' reached 2.93673 (best 2.93673), saving model to '/content/drive/MyDrive/Thesis/checkpoints/CodeBert_codebert-base-mlm_JS_5_classes_512MaxL_v602-v1.ckpt' as top 2
INFO:lightning.pytorch.utilities.rank_zero:Epoch 1, global step 166: 'val_auxilary_loss' reached 2.93673 (best 2.93673), saving model to '/content/drive/MyDrive/Thesis/checkpoints/CodeBert_codebert-base-mlm_JS_5_classes_512MaxL_v602-v1.ckpt' as top 2


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 2, global step 249: 'val_auxilary_loss' reached 2.93835 (best 2.93673), saving model to '/content/drive/MyDrive/Thesis/checkpoints/CodeBert_codebert-base-mlm_JS_5_classes_512MaxL_v602.ckpt' as top 2
INFO:lightning.pytorch.utilities.rank_zero:Epoch 2, global step 249: 'val_auxilary_loss' reached 2.93835 (best 2.93673), saving model to '/content/drive/MyDrive/Thesis/checkpoints/CodeBert_codebert-base-mlm_JS_5_classes_512MaxL_v602.ckpt' as top 2


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Epoch 3, global step 332: 'val_auxilary_loss' was not in top 2
INFO:lightning.pytorch.utilities.rank_zero:Epoch 3, global step 332: 'val_auxilary_loss' was not in top 2


### Save Model Config to CSV

In [13]:
if not DEBUG:
    import gspread
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    from google.auth import default

    auth.authenticate_user()
    creds, _ = default()
    gc = gspread.authorize(creds)

    spreadsheet = gc.open("model-configs").sheet1
    modelConfig = {
            'name': MODEL_DIR,
            'tokenizer_max_length': TOKENIZER_MAX_LENGTH,
            'num_classes': num_classes,
            'dropout_rate': DROPOUT_RATE,
            'with_activation': WITH_ACTIVATION,
            'with_layer_norm': WITH_LAYER_NORM,
            'learning_rate': LEARNING_RATE,
            'bimodal_train': BIMODAL_TRAIN,
            'version': VERSION
    }
    spreadsheet.append_row(list(modelConfig.values()))