In [1]:
import pandas as pd
import sqlite3
from transformers import (
    RobertaTokenizer,
)
from modules.models import CodeT5
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_logger, init_checkpoint, Trainer
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
LOG_PATH = './logs',
MODEL_DIR = 'CodeT5JS'
CPKT_PATH = 'Model Checkpoints/CodeT5JS'
VERSION = 0
DEBUG = True

In [3]:
print('1. Loading Samples..')
con = sqlite3.connect('commitpack-datasets.db')

with open('query.sql', 'r') as f:
    query = f.read()
ds_df = pd.read_sql_query(query,con).set_index('index')[:12]
old_codes = ds_df['old_contents'].tolist()
new_codes = ds_df['new_contents'].tolist()

TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(old_codes, new_codes, test_size=0.15, random_state=42)

1. Loading Samples..


In [4]:
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

TRAIN_encodings = tokenizer(
    TRAIN_old,
    max_length=512,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_encodings = tokenizer(
    VAL_old,
    max_length=512,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

TRAIN_decodings = tokenizer(
    TRAIN_new,
    max_length=512,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_decodings = tokenizer(
    VAL_new,
    max_length=512,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

In [5]:
logger = init_logger(LOG_PATH, MODEL_DIR, VERSION)
checkpoint = init_checkpoint(CPKT_PATH, MODEL_DIR, VERSION)
trainer = Trainer(checkpoint,logger,debug=DEBUG)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [6]:
model = CodeT5()
model.model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [9]:
TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings)
dataloader = DataLoader(TRAIN_dataset, batch_size=4,num_workers=7, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=7)

In [10]:
trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.528   Total estimated model params size (MB)
`Trainer.fit` stopped: `max_steps=1` reached.
