In [None]:
# Mount Google Drive so files under /content/drive/MyDrive become reachable.
# The google.colab package only exists inside a Colab runtime, so the import is
# guarded: on a local machine the mount is skipped instead of aborting the run
# (the configuration cell prefers local paths whenever they exist).
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

In [None]:
!git clone https://github.com/radistoubalidis/JSRepair.git

!pip install pytorch_lightning
!python -m pip install lightning
!pip install datasets
!pip install python-dotenv

In [None]:
# Switch the working directory to the cloned repository; later cells import
# from `modules` and open files through relative paths.
%cd ./JSRepair

In [2]:
import os
import pandas as pd
import sqlite3
from transformers import (
    RobertaTokenizer,
)
from modules.models import CodeT5
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_logger, init_checkpoint, Trainer
from modules.filters import add_labels
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from typing import List
import torch

In [3]:
# ---- Model / tokenizer -------------------------------------------------------
HF_DIR = 'Salesforce/codet5-small'
TOKENIZER_MAX_LENGTH = 420  # fixed value; an interactive prompt was used here before

# ---- Dataset -----------------------------------------------------------------
# Prefer a database in the working directory, otherwise use the Drive copy.
if os.path.exists('commitpack-datasets.db'):
    DB_PATH = 'commitpack-datasets.db'
else:
    DB_PATH = '/content/drive/MyDrive/Thesis/commitpack-datasets.db'
DB_TABLE = 'commitpackft_classified_train'
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')
VAL_SIZE = 0.3

# ---- Logging -----------------------------------------------------------------
if os.path.exists('logs'):
    LOG_PATH = 'logs'
else:
    LOG_PATH = '/content/drive/MyDrive/Thesis/logs'

# ---- Interactive run options ---------------------------------------------------
VERSION = int(input('Training version: '))
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
DEBUG = int(input('Debug Run (1,0): ')) == 1
BATCH_SIZE = 2 if DEBUG else 32  # tiny batches in debug runs

# ---- Checkpoints ---------------------------------------------------------------
if os.path.exists('checkpoints'):
    CPKT_PATH = 'checkpoints'
else:
    CPKT_PATH = '/content/drive/MyDrive/Thesis/checkpoints'
WITH_MOBILE = int(input('Consider mobile class (1,0): ')) == 1

# ---- Classification labels -----------------------------------------------------
# Template mapping every class name to 0.0; the key order defines the label
# order, and "mobile" (when enabled) comes first.
classLabels = dict.fromkeys(
    (["mobile"] if WITH_MOBILE else [])
    + [
        "functionality",
        "ui-ux",
        "compatibility-performance",
        "network-security",
        "general",
    ],
    0.,
)

num_classes = len(classLabels)
MODEL_DIR = f"CodeT5JS_{num_classes}classes"

# Shared SQLite connection used by the cells that query the dataset.
con = sqlite3.connect(DB_PATH)

# Distribution of bug types across samples

In [4]:
# Read the bug-type distribution query from its .sql file, run it on the open
# SQLite connection and show the resulting frame as the cell output.
with open('bug-type-dist-query_train.sql', 'r') as sql_file:
    query = sql_file.read()

info_df = pd.read_sql_query(sql=query, con=con)
info_df

Unnamed: 0,count(*),bug_type
0,90,mobile
1,2862,general
2,3147,ui-ux
3,3159,network-security
4,4396,compatibility-performance
5,4532,functionality


# Create Classification Labels

```json
{
    "mobile" : 0,
    "functionality" : 0,
    "ui-ux" : 0,
    "compatibility-performance" : 0,
    "network-security" : 0,
    "general": 0
}
```

Ένα δείγμα που κατηγοριοποιήθηκε ως σφάλμα λειτουργικότητας (functionality) και ui-ux θα έχει διάνυσμα ταξινόμησης -> `[0,1,1,0,0,0]`


In [5]:
from random import sample


def load_ds() -> pd.DataFrame:
    """Read every row of the configured table into a DataFrame.

    The table name is taken from the module-level ``DB_TABLE`` and the query is
    executed on the module-level SQLite connection ``con``.

    Returns:
        A DataFrame holding the result of ``select *`` on that table.
    """
    return pd.read_sql_query(f"select * from {DB_TABLE}", con)


ds_df = load_ds()

# Build one label vector per sample: the comma-separated ``bug_type`` string is
# split into names and passed to ``add_labels`` together with the label template.
ds_df['class_labels'] = ds_df['bug_type'].map(
    lambda bug_types: add_labels(bug_types.split(','), classLabels)
)

# Debug runs only keep the first ten samples.
if DEBUG:
    ds_df = ds_df.head(10)
ds_df

Unnamed: 0,index,commit,old_file,new_file,old_contents,new_contents,subject,message,lang,license,repos,processed_message,is_bug,bug_type,class_labels
0,14193,225ae89c984227f9c2cfbe0278618758256e577f,server.js,server.js,\n/**\n * @description a string as a paramete...,var express = require('express')\nvar strftime...,Add example code to test Heroku error app not ...,Add example code to test Heroku error app not ...,JavaScript,mit,"kiwi-lifter/timestamp-api,kiwi-lifter/timestam...",add exampl code test heroku error app load,1,"functionality,compatibility-performance","[1.0, 0.0, 1.0, 0.0, 0.0]"
1,17090,e9286911fcaa253b094eea4b532f07c31e9f3ee5,packages/plugins/sentry/server/middlewares/sen...,packages/plugins/sentry/server/middlewares/sen...,'use strict';\n\n/**\n * Programmatic sentry m...,'use strict';\n\n/**\n * Programmatic sentry m...,Fix calling sendError on wrong object,Fix calling sendError on wrong object\n\nFixes...,JavaScript,mit,"wistityhq/strapi,wistityhq/strapi",fix call senderror wrong object fix sentri plu...,1,"mobile,functionality,compatibility-performance","[1.0, 0.0, 1.0, 0.0, 0.0]"
2,3401,8b512427144c1a8ff55c149267708fd783754405,app/components/wallet/backup-recovery/WalletRe...,app/components/wallet/backup-recovery/WalletRe...,"// @flow\nimport React, { Component, PropTypes...","// @flow\nimport React, { Component, PropTypes...",Fix proptype checking in wallet recovery instr...,Fix proptype checking in wallet recovery instr...,JavaScript,apache-2.0,"input-output-hk/daedalus,input-output-hk/daeda...",fix proptyp check wallet recoveri instruct compon,1,functionality,"[1.0, 0.0, 0.0, 0.0, 0.0]"
3,33381,1ea806751405a7bfd7bf388eeabce2de0ad5c50f,templates:array.js,templates:array.js,ReactiveArray;\n\nthis.ReactiveArray = Reactiv...,ReactiveArray;\n\nthis.ReactiveArray = Reactiv...,Fix set function with underscore isEqual,Fix set function with underscore isEqual,JavaScript,mit,meteortemplates/array,fix set function underscor isequ,1,functionality,"[1.0, 0.0, 0.0, 0.0, 0.0]"
4,33412,5551c176fc2c5fa59de1fbd29f36a4b2538ae85e,bin/mai-chai-init.js,bin/mai-chai-init.js,'use strict';\n/* global require */\n/* global...,'use strict';\n/* global require */\n/* global...,Fix regex for win32 platform,Fix regex for win32 platform\n,JavaScript,mit,epsitec-sa/mai-chai,fix regex win32 platform,1,"ui-ux,compatibility-performance","[0.0, 1.0, 1.0, 0.0, 0.0]"
5,12573,1c559c078519a305d7723da0adf0d64f074f2982,src/js/config.js,src/js/config.js,// Parameters for the application\n\nexport de...,// Parameters for the application\n\nexport de...,Fix wrong URL for remote calls,Fix wrong URL for remote calls\n\n,JavaScript,mit,"UNECE/Model-Explorer,UNECE/Model-Explorer",fix wrong url remot call,1,network-security,"[0.0, 0.0, 0.0, 1.0, 0.0]"
6,51075,7f292e1e1c3a7147331a462e2ded5e3a01cdfe7f,server/public/scripts/controllers/home.control...,server/public/scripts/controllers/home.control...,"app.controller('HomeController', ['$http', 'Au...","app.controller('HomeController', ['$http', 'Au...",Refactor carousel image references for better ...,Refactor carousel image references for better ...,JavaScript,mit,"STEMentor/STEMentor,STEMentor/STEMentor",refactor carousel imag refer better readabl,1,general,"[0.0, 0.0, 0.0, 0.0, 1.0]"
7,21493,f2ccefd95ab02a80e905c932d1146728802bab0a,source/get-response.js,source/get-response.js,'use strict';\nconst decompressResponse = requ...,'use strict';\nconst decompressResponse = requ...,Fix `options.encoding` if not decompressing Br...,Fix `options.encoding` if not decompressing Br...,JavaScript,mit,sindresorhus/got,fix optionsencod decompress brotli,1,general,"[0.0, 0.0, 0.0, 0.0, 1.0]"
8,39641,e524763effb091e8b1bd48b1ac890bf7da1578ca,resources/assets/js/app.js,resources/assets/js/app.js,/**\n * First we will load all of this project...,/**\n * First we will load all of this project...,Fix issue with Vue not working,Fix issue with Vue not working\n,JavaScript,mit,"anodyne/aurora,anodyne/aurora,anodyne/aurora",fix issu vue work,1,compatibility-performance,"[0.0, 0.0, 1.0, 0.0, 0.0]"
9,23782,2e05171229b83c3cc0a7decffe9310368f1a3858,app/assets/javascripts/user_dashboard.js,app/assets/javascripts/user_dashboard.js,"(function(){\n ""use strict""\n $(document).re...","(function(){\n ""use strict""\n $(document).re...",Update naming convention to fix bug,Update naming convention to fix bug\n,JavaScript,mit,"TerrenceLJones/not-bored-tonight,TerrenceLJone...",updat name convent fix bug,1,general,"[0.0, 0.0, 0.0, 0.0, 1.0]"


In [6]:
# `old_contents` is tokenized later as the encodings and `new_contents` as the
# decodings; both frames carry the label vector alongside the source text.
old_codes = ds_df[['old_contents', 'class_labels']]
new_codes = ds_df[['new_contents', 'class_labels']]

# Splitting both frames in a single call keeps the old/new rows paired.
TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(
    old_codes,
    new_codes,
    test_size=VAL_SIZE,
    random_state=42,
)

# len(ds_df) counts the whole dataset, not the training share, so report the
# size of each partition explicitly.
print(f"Total samples: {len(ds_df)} (training: {len(TRAIN_old)}, validation: {len(VAL_old)})")

Total training samples: 10


## Dataset

In [7]:
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)


def tokenize_sources(sources):
    """Tokenize a list of source-code strings into fixed-length PyTorch tensors.

    Each sequence is truncated and padded to ``TOKENIZER_MAX_LENGTH`` tokens.
    The deprecated ``pad_to_max_length`` flag is intentionally not passed:
    ``padding='max_length'`` already requests the same padding.

    Args:
        sources: list of strings to tokenize.

    Returns:
        The tokenizer output for ``sources`` with ``return_tensors='pt'``.
    """
    return tokenizer(
        sources,
        max_length=TOKENIZER_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Encodings come from the old file contents, decodings from the new contents.
TRAIN_encodings = tokenize_sources(TRAIN_old['old_contents'].tolist())
VAL_encodings = tokenize_sources(VAL_old['old_contents'].tolist())
TRAIN_decodings = tokenize_sources(TRAIN_new['new_contents'].tolist())
VAL_decodings = tokenize_sources(VAL_new['new_contents'].tolist())

## Convert Class Labels into tensors

In [8]:
# Stack the per-sample label vectors of each split into a single tensor;
# the training tensor is displayed as the cell output.
TRAIN_classes = torch.tensor(list(TRAIN_old['class_labels']))
VAL_classes = torch.tensor(list(VAL_old['class_labels']))
TRAIN_classes

tensor([[1., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])

# Initialize Training Settings

In [9]:
# Logger, checkpoint callback and trainer all share MODEL_DIR and VERSION.
logger = init_logger(log_path=LOG_PATH, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, model_dir=MODEL_DIR, version=VERSION)
trainer = Trainer(checkpoint, logger, debug=DEBUG, num_epochs=3)

# Resume from a checkpoint only when the typed path really exists. A non-empty
# answer that does not point to a file is reported instead of being ignored
# silently, so a mistyped path cannot pass for a resumed run.
ckpt_path = LOAD_FROM_CPKT.strip()
if ckpt_path and os.path.exists(ckpt_path):
    model = CodeT5.load_from_checkpoint(ckpt_path, num_classes=num_classes)
else:
    if ckpt_path:
        print(f"Checkpoint '{ckpt_path}' not found; initializing a fresh model instead.")
    model = CodeT5(num_classes=num_classes)
# Put the wrapped network into training mode.
model.model.train()

TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings, TRAIN_classes)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings, VAL_classes)

# Never request more loader workers than the machine has CPUs (7 at most).
NUM_WORKERS = min(7, os.cpu_count() or 1)
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=NUM_WORKERS)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


In [10]:
# Run training with the training loader and validate with the validation loader.
trainer.fit(model, train_dataloaders=dataloader, val_dataloaders=val_dataloader)

/home/disras/miniconda3/envs/thesis/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/disras/projects/JSRepair/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                       | Params | Mode 
------------------------------------------------------------------
0 | model      | T5ForConditionalGeneration | 222 M  | train
1 | classifier | Linear                     | 3.8 K  | train
------------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.544   Total estimated model params size (MB)
542       Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
