In [1]:
from transformers import (
    T5Config, 
    T5ForConditionalGeneration,
    RobertaTokenizer,
)   
import os
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from modules.models import T5JSRephraser
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_checkpoint, init_logger, Trainer, read_hparams
from torch.utils.data import DataLoader


HF_MODEL_DIR = 'Salesforce/codet5-small'
VAL_SIZE = 0.3
TOKENIZER_MAX_LENGTH = int(input('Paste tokenizer max length: '))
LOG_PATH = input('Paste log path: ')
MODEL_DIR = 'T5JSRephraser'
CPKT_PATH = '/content/drive/MyDrive/Thesis/checkpoints'
VERSION = int(input('Training Version: '))
DEBUG = True
BATCH_SIZE = int(input('BATCH SIZE : '))
DB_PATH = input('Paste sqlite3 path: ')
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
HPARAMS_PATH = input('Paste hparams json path: ')

In [2]:
con = sqlite3.connect(DB_PATH)
ds_df = pd.read_sql_query("select * from commitpackft",con).set_index('index')[:8]
ds_df['num_words_old'] = [len(x.split()) for x  in ds_df['old_contents'].tolist()]
ds_df['num_words_new'] = [len(x.split()) for x  in ds_df['new_contents'].tolist()]
ds_df.head()

Unnamed: 0_level_0,commit,old_file,new_file,old_contents,new_contents,subject,message,lang,license,repos,num_words_old,num_words_new
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,05e68b98a7d38efb95cca834e95ae35387bba730,static/js/entity.js,static/js/entity.js,(function () {\n\nvar $specContainer = $('#spe...,(function () {\n\nvar $body = $('body');\n\nif...,Fix scrollspy bug occured by layout changes,Fix scrollspy bug occured by layout changes\n,JavaScript,apache-2.0,"teampopong/pokr.kr,teampopong/pokr.kr,teampopo...",90,88
1,e9c17be9481632fd5f82b35c0a5782761c051122,week-9/dom-manipulation/home_page.js,week-9/dom-manipulation/home_page.js,// DOM Manipulation Challenge\n\n\n// I worked...,// DOM Manipulation Challenge\n\n\n// I worked...,"Fix text for partner, add questions for reflec...","Fix text for partner, add questions for reflec...",JavaScript,mit,"TimBek2/phase-0,TimBek2/phase-0,TimBek2/phase-0",174,172
2,e98716b1f18ffc2b2b8a6b06e401bda45f41794b,packages/motion/template/main.js,packages/motion/template/main.js,/* @flow */\n\nimport React from 'react'\n\ncl...,/* @flow */\n\nimport Motion from 'react'\n\nc...,Fix a typo in template We need the name to be ...,:bug: Fix a typo in template\nWe need the name...,JavaScript,mpl-2.0,"motion/motion,flintjs/flint,flintjs/flint,moti...",22,22
3,ad6803ee350bc6189e5b4227756a4c201e026a1f,src/schedule.js,src/schedule.js,"""use strict"";\nvar global = require(""./global....","""use strict"";\nvar global = require(""./global....",Use a smaller timeout value,Use a smaller timeout value\n,JavaScript,mit,"tesfaldet/bluebird,arenaonline/bluebird,avinoa...",157,157
4,f01144f381f7e91c5d25d3dbbdf06d23fcd1ae5a,index.js,index.js,/* jshint node: true */\n'use strict';\n\nmodu...,/* jshint node: true */\n'use strict';\n\nmodu...,Remove version when client is used,Remove version when client is used\n\nUpon fur...,JavaScript,mit,"asennikov/ember-g-map,asennikov/ember-g-map",158,153


In [3]:
old_codes = ds_df['old_contents'].tolist()
new_codes = ds_df['new_contents'].tolist()

TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(old_codes, new_codes, test_size=VAL_SIZE, random_state=42)

In [4]:
tokenizer = RobertaTokenizer.from_pretrained(HF_MODEL_DIR)
TRAIN_encodings = tokenizer(
    TRAIN_old,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_encodings = tokenizer(
    VAL_old,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

TRAIN_decodings = tokenizer(
    TRAIN_new,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_decodings = tokenizer(
    VAL_new,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

In [5]:
logger = init_logger(model_dir=MODEL_DIR, version=VERSION, log_path=LOG_PATH)
checkpoint = init_checkpoint(CPKT_PATH, MODEL_DIR, VERSION)
trainer = Trainer(checkpoint,logger,num_epochs=5,debug=DEBUG)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [6]:
if len(LOAD_FROM_CPKT) > 0 and os.path.exists(LOAD_FROM_CPKT):
    model = T5JSRephraser.load_from_checkpoint(LOAD_FROM_CPKT)
else:
    if not os.path.exists(HPARAMS_PATH):
        cfg = T5Config(
            dropout_rate=0.01, 
            num_heads=16, # num of att heads in encoder layer
            num_layers=8, # num of hidden layers in encoder layer
            num_decoder_layers=12, # num of hidden layers in decoder layer,
            # the number of the buckets for the relative att mechanism
            # higher the value is, the longer the range of the word-dependencies
            # the model captures
            relative_attention_num_buckets= 32,
            # The higher range between word-elements
            # the att mechanism will consider
            relative_attention_max_distance=128,
            decoder_start_token_id=tokenizer.pad_token_id
        )
    else:
        hparams = read_hparams(HPARAMS_PATH,tokenizer.pad_token_id)
        cfg = T5Config(**hparams)
        
    model = T5JSRephraser(t5config=cfg)
    
model.model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(64, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [7]:
TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings)
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE,num_workers=7, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=7)

In [8]:
trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 55.8 M
-----------------------------------------------------
55.8 M    Trainable params
0         Non-trainable params
55.8 M    Total params
223.153   Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
