In [1]:
import os
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
from modules.TrainConfig import init_checkpoint, init_logger, Trainer, masker
from modules.models import CodeBertJS
from modules.datasets import CodeBertDataset
from torch.utils.data import DataLoader
import sqlite3
import pandas as pd

In [2]:
def _ask(prompt, cast=str):
    """Show `prompt` on stdin and return the reply converted with `cast`."""
    return cast(input(prompt))


# Fixed settings (not prompted for).
HF_DIR = 'microsoft/codebert-base-mlm'  # Hugging Face model id used for the tokenizer
VAL_SIZE = 0.3                          # fraction of samples held out for validation
MODEL_DIR = 'CodeBertJS'                # passed to init_logger / init_checkpoint as model_dir
DEBUG = True                            # forwarded to Trainer(debug=...)

# Interactive settings; the prompts are asked in this order.
TOKENIZER_MAX_LENGTH = _ask('Tokenizer Max length: ', int)

DB_PATH = _ask('Paste sqlite3 path: ')
# Fail early, before the remaining prompts, when the database file is missing.
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')

LOG_PATH = _ask('Paste Log path:')
VERSION = _ask('Training version: ', int)
# An empty reply means "do not load a checkpoint".
LOAD_FROM_CPKT = _ask("Load from existing model (type cpkt path if true): ")
BATCH_SIZE = _ask('Batch size: ', int)
CPKT_PATH = _ask('Paste checkpoints dir: ')
# NOTE(review): MASK_PROB is not referenced by any cell visible in this notebook — confirm it is still needed.
MASK_PROB = _ask("Paste MLM probability: ", float)

In [3]:
# Read the SQL statement that selects the training rows.
with open('query.sql', 'r') as f:
    query = f.read()

# Run the query and make sure the connection is released even if the read
# fails; a sqlite3 connection is not closed automatically when it goes unused.
con = sqlite3.connect(DB_PATH)
try:
    ds_df = pd.read_sql_query(query, con).set_index('index')
finally:
    con.close()

# Source strings used as the ground-truth side of the MLM pairs.
new_codes = ds_df['new_contents'].tolist()

# Tokenizer matching the pretrained checkpoint named by HF_DIR.
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)


def test_masker(code):
    """Apply `masker` to one snippet, echoing the snippet when masking fails.

    Args:
        code: The input handed to `masker` together with the module-level
            `tokenizer`.

    Returns:
        Whatever `masker(code, tokenizer)` returns.

    Raises:
        Exception: Any exception raised by `masker` is re-raised after the
            offending `code` has been printed.
    """
    try:
        masked = masker(code, tokenizer)
    except Exception:
        # Show the input that broke the masker, then let the error propagate.
        print(code)
        raise
    else:
        return masked

def _encode(texts):
    """Tokenize `texts` with the notebook's shared tokenizer settings.

    Every sequence is truncated and padded to TOKENIZER_MAX_LENGTH and the
    result is returned as PyTorch tensors (`return_tensors='pt'`).

    The deprecated `pad_to_max_length=True` flag is intentionally not passed:
    `padding='max_length'` already requests the same padding.
    """
    return tokenizer(
        texts,
        max_length=TOKENIZER_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Mask every snippet; test_masker prints the offending snippet on failure.
masked_new_codes = [test_masker(code) for code in new_codes]

# Split masked inputs and their unmasked originals together so pairs stay aligned.
TRAIN_masked, VAL_masked, TRAIN_gt, VAL_gt = train_test_split(
    masked_new_codes,
    new_codes,
    test_size=VAL_SIZE,
    random_state=42,
)

# Model inputs: full encodings of the masked snippets.
TRAIN_encodings = _encode(TRAIN_masked)
VAL_encodings = _encode(VAL_masked)

# Targets: only the token ids of the unmasked snippets.
TRAIN_GT_input_ids = _encode(TRAIN_gt).input_ids
# The name (sic: "input_ds") is kept because the training cell reads it.
VAL_GT_input_ds = _encode(VAL_gt).input_ids

In [5]:
# Logging, checkpointing and the trainer all share MODEL_DIR / VERSION.
logger = init_logger(log_path=LOG_PATH, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, model_dir=MODEL_DIR, version=VERSION)
trainer = Trainer(checkpoint=checkpoint, logger=logger, debug=DEBUG, num_epochs=3)

# Resume from a checkpoint only when a path was typed and it exists;
# otherwise start from a freshly constructed model.
if LOAD_FROM_CPKT and os.path.exists(LOAD_FROM_CPKT):
    model = CodeBertJS.load_from_checkpoint(LOAD_FROM_CPKT)
else:
    if LOAD_FROM_CPKT:
        # Make the fallback visible instead of silently training from scratch.
        print(f"Checkpoint path '{LOAD_FROM_CPKT}' does not exist; creating a new CodeBertJS model.")
    model = CodeBertJS()
model.encoder.train()

TRAIN_dataset = CodeBertDataset(TRAIN_encodings, TRAIN_GT_input_ids)
VAL_dataset = CodeBertDataset(VAL_encodings, VAL_GT_input_ds)

# Honor the batch size entered in the configuration cell (previously the
# value was read from the user but a literal 4 was used here).
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE, num_workers=7, shuffle=True)
# Validation runs one sample per batch, unshuffled.
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=7)

trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
