In [1]:
import os
import pandas as pd
import sqlite3
from transformers import (
    RobertaTokenizer,
)
from modules.models import CodeT5
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_logger, init_checkpoint, Trainer
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import json

In [2]:
# Run configuration: fixed constants plus values collected interactively.
# The prompts are asked in the same order as before.

# Hugging Face model id; the tokenizer is loaded from it in a later cell.
HF_DIR = 'Salesforce/codet5-small'
# Every tokenized sequence is padded/truncated to this many tokens.
TOKENIZER_MAX_LENGTH = int(input('Tokenizer Max length: '))
# Pasted values frequently carry stray whitespace, so strip it before use.
DB_PATH = input('Paste sqlite3 path: ').strip()
DB_TABLE = input('Paste sqlite3 dataset table name: ').strip()

# Fail fast: sqlite3.connect() creates a new empty database when the path is
# missing, and a directory cannot be opened as a database at all.
if not os.path.isfile(DB_PATH):
    raise RuntimeError(f'sqlite3 path doesnt exist or is not a file: {DB_PATH!r}')
# Fraction of the samples passed to train_test_split as `test_size`.
VAL_SIZE = 0.3
LOG_PATH = input('Paste Log path:').strip()
# Passed as `model_dir` to init_logger / init_checkpoint.
MODEL_DIR = 'CodeT5JS'
VERSION = int(input('Training version: '))
# Optional checkpoint to resume from; the training cell only uses it when
# the path exists.
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ").strip()
BATCH_SIZE = int(input('Batch size: '))
# Only the answer 1 enables debug mode; any other integer disables it.
DEBUG = int(input('Debug Run (1,0): ')) == 1
CPKT_PATH = input('Paste checkpoints dir: ').strip()

In [3]:
# Connection to the dataset database. It stays open at notebook scope.
# NOTE(review): close it explicitly once no later cell needs `con`.
con = sqlite3.connect(DB_PATH)


def get_query(bug_type: str) -> str:
    """Build the SELECT that returns every row whose bug_type contains `bug_type`.

    Args:
        bug_type: Text to look for inside the `bug_type` column. LIKE
            wildcards (`%`, `_`) in it are passed through unchanged.

    Returns:
        A complete SQL statement against the table named by DB_TABLE.

    Single quotes in `bug_type` are doubled (the SQL string-literal escape)
    so user input cannot terminate the literal and alter the statement. For
    input without quotes the result is unchanged. The table name cannot be
    bound as a parameter, so DB_TABLE is interpolated as-is and must come
    from a trusted source.
    """
    escaped = bug_type.replace("'", "''")
    return f"SELECT * FROM {DB_TABLE} WHERE bug_type like '%{escaped}%'"


# Load the list of known bug types offered to the user.
with open('bug-types.json', 'r', encoding='utf-8') as f:
    bugTypesDir = json.load(f)['data']
bugTypes = [row['type'] for row in bugTypesDir]
bugTypesStr = ", ".join(bugTypes)

selectedBugType = input(f"Select a bug type to train on ({bugTypesStr})").strip()
# The empty string is a substring of every bug type, so it would pass the
# substring check below and then select the whole table; reject it first.
if not selectedBugType or not any(selectedBugType in bT for bT in bugTypes):
    raise ValueError('Invalid Bug Type')

ds_df = pd.read_sql_query(get_query(selectedBugType), con)
# Surface an empty result here instead of as a splitting error further down.
if ds_df.empty:
    raise ValueError(
        f"No samples found for bug type {selectedBugType!r} in table {DB_TABLE!r}"
    )
old_codes = ds_df['old_contents'].tolist()
new_codes = ds_df['new_contents'].tolist()

# Fixed random_state keeps the train/validation split identical across runs.
TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(
    old_codes, new_codes, test_size=VAL_SIZE, random_state=42
)

print(f"Selected Bug Type : {selectedBugType}\nFound {len(ds_df)} samples")

In [4]:
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)


def tokenize_code(texts):
    """Tokenize a list of code strings into fixed-length PyTorch tensors.

    Args:
        texts: List of source-code strings.

    Returns:
        The tokenizer output for `texts`, with every sequence padded or
        truncated to TOKENIZER_MAX_LENGTH and returned as 'pt' tensors.

    The deprecated `pad_to_max_length=True` flag is intentionally not passed:
    `padding='max_length'` is its replacement and was already being supplied,
    so the produced encodings are the same.
    """
    return tokenizer(
        texts,
        max_length=TOKENIZER_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# "encodings" come from the old code, "decodings" from the new code; they are
# paired up when the datasets are built in the training cell.
TRAIN_encodings = tokenize_code(TRAIN_old)
VAL_encodings = tokenize_code(VAL_old)
TRAIN_decodings = tokenize_code(TRAIN_new)
VAL_decodings = tokenize_code(VAL_new)

In [5]:
# Logger, checkpoint callback and trainer all come from the project's
# TrainConfig module and are keyed by bug type, model dir and version.
logger = init_logger(log_path=LOG_PATH, bugType=selectedBugType, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, bugType=selectedBugType, model_dir=MODEL_DIR, version=VERSION)
# NOTE(review): debug=True appears to enable Lightning's fast_dev_run (see the
# recorded output below) - confirm in modules.TrainConfig.
trainer = Trainer(checkpoint, logger, debug=DEBUG, num_epochs=3)

# Resume from a checkpoint only when a path was given and it exists.
if LOAD_FROM_CPKT and os.path.exists(LOAD_FROM_CPKT):
    model = CodeT5.load_from_checkpoint(LOAD_FROM_CPKT)
else:
    if LOAD_FROM_CPKT:
        # Make the fallback visible instead of silently ignoring the input.
        print(f"Checkpoint {LOAD_FROM_CPKT!r} not found; training a fresh CodeT5 model instead.")
    model = CodeT5()
model.model.train()

# Do not request more loader workers than the machine has CPUs; 7 stays the
# upper bound used so far.
NUM_WORKERS = min(7, os.cpu_count() or 1)

TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings)
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
# Validation runs one sample per batch and is not shuffled.
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=NUM_WORKERS)

trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
/home/disras/miniconda3/envs/thesis/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/disras/projects/JSRepair/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
241.969   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
