In [None]:
from transformers import (
    T5Config, 
    T5ForConditionalGeneration,
    RobertaTokenizer,
)   
import os
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from modules.models import CodeT5Custom
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_checkpoint, init_logger, Trainer, read_hparams
from torch.utils.data import DataLoader


# --- Static configuration ----------------------------------------------------
HF_MODEL_DIR = "Salesforce/codet5-small"  # hub id handed to the tokenizer loader
VAL_SIZE = 0.3  # fraction of samples held out by train_test_split
TOKENIZER_MAX_LENGTH = 512  # every sequence is padded/truncated to this length
MODEL_DIR = "CodeT5JS"  # passed to the logger and checkpoint initialisers
CPKT_PATH = "/content/drive/MyDrive/Thesis/checkpoints"
HPARAMS_PATH = "hparams.json"
DEBUG = True

# --- Interactive configuration (asked in this order) -------------------------
LOG_PATH = input("Paste log path: ")
VERSION = int(input("Training Version: "))
BATCH_SIZE = int(input("BATCH SIZE : "))
DB_PATH = input("Paste sqlite3 path: ")
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")

In [None]:
# Load the commit dataset from SQLite. The path typed at the DB_PATH prompt is
# used; an empty answer falls back to 'commitpack-datasets.db' in the working
# directory.
con = sqlite3.connect(DB_PATH.strip() or 'commitpack-datasets.db')
try:
    ds_df = pd.read_sql_query("select * from commitpackft", con).set_index('index')
finally:
    # Release the database handle even if the query raises.
    con.close()

# Whitespace-delimited word counts for the old and new contents columns.
ds_df['num_words_old'] = [len(x.split()) for x in ds_df['old_contents'].tolist()]
ds_df['num_words_new'] = [len(x.split()) for x in ds_df['new_contents'].tolist()]
ds_df.head()

In [None]:
# Paired lists of old/new contents; they are split together so every old
# snippet stays aligned with its new counterpart.
old_codes = ds_df['old_contents'].to_list()
new_codes = ds_df['new_contents'].to_list()

(
    TRAIN_old,
    VAL_old,
    TRAIN_new,
    VAL_new,
) = train_test_split(
    old_codes,
    new_codes,
    test_size=VAL_SIZE,
    random_state=42,  # fixed seed keeps the split reproducible across runs
)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained(HF_MODEL_DIR)


def encode_fixed_length(texts):
    """Tokenize ``texts`` into fixed-length PyTorch tensors.

    Every sequence is padded or truncated to ``TOKENIZER_MAX_LENGTH`` tokens.

    Args:
        texts: list of strings to tokenize.

    Returns:
        The tokenizer's batch output, produced with ``return_tensors='pt'``.
    """
    return tokenizer(
        texts,
        max_length=TOKENIZER_MAX_LENGTH,
        # `padding='max_length'` alone selects fixed-length padding, so the
        # deprecated `pad_to_max_length` flag is not passed.
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# "encodings" come from the old contents, "decodings" from the new contents.
TRAIN_encodings = encode_fixed_length(TRAIN_old)
VAL_encodings = encode_fixed_length(VAL_old)
TRAIN_decodings = encode_fixed_length(TRAIN_new)
VAL_decodings = encode_fixed_length(VAL_new)

In [None]:
# Build the logger and the checkpoint helper from the run configuration, then
# hand both to the trainer.
logger = init_logger(
    log_path=LOG_PATH,
    model_dir=MODEL_DIR,
    version=VERSION,
)
checkpoint = init_checkpoint(CPKT_PATH, MODEL_DIR, VERSION)
trainer = Trainer(checkpoint, logger, debug=DEBUG)

In [None]:
# Model construction, in order of preference:
#   1. resume from the checkpoint typed at the LOAD_FROM_CPKT prompt, if it exists;
#   2. otherwise build a new model from HPARAMS_PATH when that file exists;
#   3. otherwise build a new model from the inline default config below.
if LOAD_FROM_CPKT and os.path.exists(LOAD_FROM_CPKT):
    model = CodeT5Custom.load_from_checkpoint(LOAD_FROM_CPKT)
else:
    if LOAD_FROM_CPKT:
        # A path was given but does not exist: say so instead of silently
        # starting from scratch.
        print(f"Checkpoint not found at {LOAD_FROM_CPKT!r}; building a new model instead.")

    # The existence check must name the hparams file; calling
    # os.path.exists() without an argument raises TypeError.
    if not os.path.exists(HPARAMS_PATH):
        cfg = T5Config(
            dropout_rate=0.01,
            num_heads=16,  # number of attention heads
            num_layers=8,  # number of hidden layers in the encoder
            num_decoder_layers=12,  # number of hidden layers in the decoder
        )
    else:
        hparams = read_hparams(HPARAMS_PATH)
        cfg = T5Config(**hparams)

    model = CodeT5Custom(t5config=cfg)

# Call .train() on the inner model held in model.model (presumably the
# underlying torch module; this switches it to training mode).
model.model.train()

In [None]:
TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings)

# Ask for at most 7 loader workers, but never more than the host has CPUs;
# oversubscribing small machines triggers a DataLoader warning and can slow
# down or stall loading.
NUM_WORKERS = min(7, os.cpu_count() or 1)

dataloader = DataLoader(
    TRAIN_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=True,  # reshuffle training samples every epoch
)
# Validation is run one sample per batch and in dataset order.
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=NUM_WORKERS)

In [None]:
# Hand the model and both dataloaders to the trainer.
fit_loaders = {
    "train_dataloaders": dataloader,
    "val_dataloaders": val_dataloader,
}
trainer.fit(model, **fit_loaders)