In [3]:
import os
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
from modules.TrainConfig import init_checkpoint, init_logger, Trainer, masker
from modules.models import CodeBertJS
from modules.datasets import CodeBertDataset
from modules.filters import mask_code_diff
from torch.utils.data import DataLoader
import sqlite3
import pandas as pd
import json
import os

In [6]:
# ---- Fixed settings ---------------------------------------------------------
HF_DIR = 'microsoft/codebert-base-mlm'        # pretrained id handed to RobertaTokenizer.from_pretrained
DB_TABLE = 'commitpackft_classified_train'    # table that get_query() selects from
VAL_SIZE = 0.3                                # share of rows held out by train_test_split
MODEL_DIR = 'CodeBertJS'                      # model_dir passed to init_logger / init_checkpoint

# ---- Interactive settings ---------------------------------------------------
# The prompts below are asked in this exact order when the cell runs.
TOKENIZER_MAX_LENGTH = int(input('Tokenizer Max length: '))

DB_PATH = input('Paste sqlite3 path: ')
# Fail right away on a wrong path instead of connecting to a non-existent file.
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')

LOG_PATH = input('Paste Log path:')
VERSION = int(input('Training version: '))
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
BATCH_SIZE = int(input('Batch size: '))
# Only the answer "1" (as an integer) turns the debug run on.
DEBUG = int(input('Debug Run (1,0): ')) == 1
CPKT_PATH = input('Paste checkpoints dir: ')
MASK_PROB = float(input("Paste MLM probability: "))

In [12]:
# Open a connection to the SQLite database at DB_PATH; `con` is the handle
# given to pandas when the training rows are read.
# NOTE(review): no con.close() is visible in this notebook - confirm whether a
# later cell still needs the connection, otherwise close it after loading.
con = sqlite3.connect(DB_PATH)

def get_query(bug_type: str) -> str:
    """Build the SELECT statement for rows whose ``bug_type`` column contains a value.

    The value is embedded in the statement as a SQL string literal, so every
    single quote in it is doubled (the SQL escape for ``'`` inside a literal).
    Without that, a quote typed at the prompt would terminate the literal and
    let the rest of the input be parsed as SQL.

    Args:
        bug_type: Text to search for inside the ``bug_type`` column. ``%`` and
            ``_`` are not escaped and keep their LIKE wildcard meaning.

    Returns:
        SQL text selecting every column of ``DB_TABLE`` for the matching rows.
    """
    escaped = bug_type.replace("'", "''")
    return f"SELECT * FROM {DB_TABLE} WHERE bug_type like '%{escaped}%'"

# Load the catalogue of known bug types; the code below reads
# json["data"] as a list of objects that each carry a "type" entry.
with open('bug-types.json', 'r', encoding='utf-8') as f:
    bugTypesDir = json.load(f)['data']
bugTypes = [row['type'] for row in bugTypesDir]
bugTypesStr = ", ".join(bugTypes)

# Strip stray whitespace from the typed/pasted answer before validating it.
selectedBugType = input(f"Select a bug type to train on ({bugTypesStr})").strip()
# An empty answer is a substring of every type, so the substring test alone
# would accept it and the LIKE query would then match every row.
# NOTE(review): partial names are still accepted (substring match) - confirm
# whether an exact match against bugTypes is what is actually wanted.
if not selectedBugType or not any(selectedBugType in bT for bT in bugTypes):
    raise ValueError('Invalid Bug Type')

ds_df = pd.read_sql_query(get_query(selectedBugType), con)
# Stop here with a clear message rather than failing later on an empty split.
if ds_df.empty:
    raise ValueError(f"No rows in {DB_TABLE} match bug type {selectedBugType!r}")
# Derive the model input from each (old_contents, new_contents) pair.
ds_df['masked_old_contents'] = ds_df.apply(
    lambda row: mask_code_diff(row['old_contents'], row['new_contents']),
    axis=1,
)

# Tokenizer loaded from the pretrained id in HF_DIR.
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)

# Split masked inputs and their ground-truth counterparts together so the
# pairs stay aligned; VAL_SIZE of the rows go to validation and the fixed
# random_state makes the split repeatable.
TRAIN_masked, VAL_masked, TRAIN_gt, VAL_gt = train_test_split(
    ds_df['masked_old_contents'].tolist(),
    ds_df['new_contents'].tolist(),
    test_size=VAL_SIZE,
    random_state=42,
)

def _encode_fixed_length(texts):
    """Tokenize ``texts`` into PyTorch tensors of exactly TOKENIZER_MAX_LENGTH tokens.

    Every sequence is truncated and padded to ``TOKENIZER_MAX_LENGTH`` so the
    masked inputs and the ground-truth targets are encoded with identical
    settings. The deprecated ``pad_to_max_length`` flag is intentionally not
    passed: ``padding='max_length'`` already requests the same padding.

    Args:
        texts: List of strings to encode.

    Returns:
        The tokenizer's batch output (exposes ``input_ids``).
    """
    return tokenizer(
        texts,
        max_length=TOKENIZER_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Model inputs: the masked "old" code for each split.
TRAIN_encodings = _encode_fixed_length(TRAIN_masked)
VAL_encodings = _encode_fixed_length(VAL_masked)

# Targets: only the token ids of the ground-truth "new" code are kept.
TRAIN_GT_input_ids = _encode_fixed_length(TRAIN_gt).input_ids
# Name kept as-is (``_ds``) because the dataset construction cell refers to it.
VAL_GT_input_ds = _encode_fixed_length(VAL_gt).input_ids

In [None]:
# Logger and checkpoint helper are both keyed by bug type, model dir and version.
logger = init_logger(log_path=LOG_PATH, bugType=selectedBugType, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, bugType=selectedBugType, model_dir=MODEL_DIR, version=VERSION)
trainer = Trainer(checkpoint=checkpoint, logger=logger, debug=DEBUG, num_epochs=3)

# Resume from a checkpoint when the given path exists; otherwise start from a
# fresh model. Pasted paths often carry stray whitespace, so strip it first.
resume_path = LOAD_FROM_CPKT.strip()
if resume_path and os.path.exists(resume_path):
    model = CodeBertJS.load_from_checkpoint(resume_path)
else:
    if resume_path:
        # Make the fallback visible: a mistyped path would otherwise silently
        # discard the intended checkpoint and train from scratch.
        print(f"WARNING: checkpoint '{resume_path}' not found; training a new CodeBertJS model instead.")
    model = CodeBertJS()
# Switch the encoder to training mode (presumably torch's Module.train() - confirm).
model.encoder.train()


# Pair each encoded input with the token ids of its ground-truth target.
TRAIN_dataset = CodeBertDataset(TRAIN_encodings, TRAIN_GT_input_ids)
VAL_dataset = CodeBertDataset(VAL_encodings, VAL_GT_input_ds)
# Only the training loader shuffles; validation runs one sample per batch.
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE, num_workers=7, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=7)


trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader
)