In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!git clone https://github.com/radistoubalidis/JSRepair.git

!pip install pytorch_lightning
!python -m pip install lightning
!pip install datasets
!pip install python-dotenv

In [None]:
%cd ./JSRepair

In [1]:
import os
import pandas as pd
import sqlite3
import torch
import numpy as np
from transformers import (
    RobertaTokenizer,
)
from modules.models import CodeT5
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_logger, init_checkpoint, Trainer
from modules.filters import add_labels
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from typing import List

In [None]:
HF_DIR = 'Salesforce/codet5-large'
TOKENIZER_MAX_LENGTH = 1024 #int(input('Tokenizer Max length: '))
DB_PATH = 'commitpack-datasets.db' if os.path.exists('commitpack-datasets.db') else '/content/drive/MyDrive/Thesis/commitpack-datasets.db'
DB_TABLE = 'commitpackft_classified_train'
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')
VAL_SIZE = 0.3
LOG_PATH = 'logs' if os.path.exists('logs') else '/content/drive/MyDrive/Thesis/logs'
VERSION = int(input('Training version: '))
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
DEBUG = True if int(input('Debug Run (1,0): ')) == 1 else False
BATCH_SIZE = 2 if DEBUG is True else 32
CPKT_PATH = 'checkpoints' if os.path.exists('checkpoints') else '/content/drive/MyDrive/Thesis/checkpoints'
DROPOUT_RATE = float(input('Type dropout rate for classifier: '))
WITH_MOBILE = True if int(input('Consider mobile class (1,0): ')) == 1 else False
WITH_LAYER_NORM = True
WITH_ACTIVATION = True

if WITH_MOBILE:
    classLabels = {
        "mobile" : 0.,
        "functionality" : 0.,
        "ui-ux" : 0.,
        "compatibility-performance" : 0.,
        "network-security" : 0.,
        "general": 0.
    }
else:
    classLabels = {
        "functionality" : 0.,
        "ui-ux" : 0.,
        "compatibility-performance" : 0.,
        "network-security" : 0.,
        "general": 0.
    }
    
num_classes = len(classLabels.keys())
modelSize = HF_DIR.split('-')[-1]
MODEL_DIR = f"CodeT5_{modelSize}_JS_{num_classes}classes_{TOKENIZER_MAX_LENGTH}MaxL"
con = sqlite3.connect(DB_PATH)

# Types of Bugs distribution in samples

In [3]:
with open('bug-type-dist-query_train.sql', 'r') as f:
    query = f.read()

info_df = pd.read_sql_query(query, con)
info_df

Unnamed: 0,count(*),bug_type
0,90,mobile
1,2862,general
2,3147,ui-ux
3,3159,network-security
4,4396,compatibility-performance
5,4532,functionality


# Create Classification Labels

```json
{
    "mobile" : 0,
    "functionality" : 0,
    "ui-ux" : 0,
    "compatibility-performance" : 0,
    "network-security" : 0,
    "general": 0
}

Ένα δείγμα που κατηγοριοποιήθηκε ως σφάλμα λειτουργικότητας(functionality) και ui-ux θα έχει διάνυσμα ταξινόμησης ->
[0,1,1,0,0,0]
```


In [4]:
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)


def load_ds() -> pd.DataFrame:
    query = f"select * from {DB_TABLE}"
    ds_df = pd.read_sql_query(query, con)
    return ds_df

ds_df = load_ds()

ds_df['class_labels'] = ds_df['bug_type'].apply(lambda bT: add_labels(bT.split(','), classLabels))
if DEBUG:
    ds_df = ds_df.iloc[:10]

if not WITH_MOBILE:
    ds_df = ds_df[ds_df['bug_type'] != 'mobile']

ds_df.head()

Downloading tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Unnamed: 0,index,commit,old_file,new_file,old_contents,new_contents,subject,message,lang,license,repos,processed_message,is_bug,bug_type,class_labels
0,14193,225ae89c984227f9c2cfbe0278618758256e577f,server.js,server.js,\n/**\n * @description a string as a paramete...,var express = require('express')\nvar strftime...,Add example code to test Heroku error app not ...,Add example code to test Heroku error app not ...,JavaScript,mit,"kiwi-lifter/timestamp-api,kiwi-lifter/timestam...",add exampl code test heroku error app load,1,"functionality,compatibility-performance","[1.0, 0.0, 1.0, 0.0, 0.0]"
1,17090,e9286911fcaa253b094eea4b532f07c31e9f3ee5,packages/plugins/sentry/server/middlewares/sen...,packages/plugins/sentry/server/middlewares/sen...,'use strict';\n\n/**\n * Programmatic sentry m...,'use strict';\n\n/**\n * Programmatic sentry m...,Fix calling sendError on wrong object,Fix calling sendError on wrong object\n\nFixes...,JavaScript,mit,"wistityhq/strapi,wistityhq/strapi",fix call senderror wrong object fix sentri plu...,1,"mobile,functionality,compatibility-performance","[1.0, 0.0, 1.0, 0.0, 0.0]"
2,3401,8b512427144c1a8ff55c149267708fd783754405,app/components/wallet/backup-recovery/WalletRe...,app/components/wallet/backup-recovery/WalletRe...,"// @flow\nimport React, { Component, PropTypes...","// @flow\nimport React, { Component, PropTypes...",Fix proptype checking in wallet recovery instr...,Fix proptype checking in wallet recovery instr...,JavaScript,apache-2.0,"input-output-hk/daedalus,input-output-hk/daeda...",fix proptyp check wallet recoveri instruct compon,1,functionality,"[1.0, 0.0, 0.0, 0.0, 0.0]"
3,33381,1ea806751405a7bfd7bf388eeabce2de0ad5c50f,templates:array.js,templates:array.js,ReactiveArray;\n\nthis.ReactiveArray = Reactiv...,ReactiveArray;\n\nthis.ReactiveArray = Reactiv...,Fix set function with underscore isEqual,Fix set function with underscore isEqual,JavaScript,mit,meteortemplates/array,fix set function underscor isequ,1,functionality,"[1.0, 0.0, 0.0, 0.0, 0.0]"
4,33412,5551c176fc2c5fa59de1fbd29f36a4b2538ae85e,bin/mai-chai-init.js,bin/mai-chai-init.js,'use strict';\n/* global require */\n/* global...,'use strict';\n/* global require */\n/* global...,Fix regex for win32 platform,Fix regex for win32 platform\n,JavaScript,mit,epsitec-sa/mai-chai,fix regex win32 platform,1,"ui-ux,compatibility-performance","[0.0, 1.0, 1.0, 0.0, 0.0]"


In [5]:
old_codes = ds_df[['message', 'old_contents', 'class_labels']]
old_codes['input_seq'] = old_codes['message'] + ' ' + tokenizer.sep_token + ' ' + old_codes['old_contents']
new_codes = ds_df[['message', 'new_contents', 'class_labels']]

TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(old_codes, new_codes, test_size=VAL_SIZE, random_state=42)

print(f"Total training samples: {len(ds_df)}")

Total training samples: 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  old_codes['input_seq'] = old_codes['message'] + ' ' + tokenizer.sep_token + ' ' + old_codes['old_contents']


## Dataset

In [6]:
TRAIN_encodings = tokenizer(
    TRAIN_old['input_seq'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_encodings = tokenizer(
    VAL_old['input_seq'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

TRAIN_decodings = tokenizer(
    TRAIN_new['new_contents'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_decodings = tokenizer(
    VAL_new['new_contents'].tolist(),
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

## Convert Class Labels into tensors

In [7]:
TRAIN_classes = torch.tensor(TRAIN_old['class_labels'].tolist())
VAL_classes = torch.tensor(VAL_old['class_labels'].tolist())
TRAIN_classes

tensor([[1., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])

### Compute class weights
$pos\ weight[i] = (Number\ of\ negative\ samples\ for\ class\ i) / (Number\ of\ positive\ samples\ for\ class\ i)$

In [8]:
num_samples = TRAIN_classes.size(0)
num_classes = TRAIN_classes.size(1)

pos_counts = torch.sum(TRAIN_classes, dim=0)
neg_counts = num_samples - pos_counts
class_weights = neg_counts / (pos_counts + 1e-6)
class_weights = class_weights.numpy()

# Initialize Training Settings

In [9]:
logger = init_logger(log_path=LOG_PATH, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, model_dir=MODEL_DIR, version=VERSION)
trainer = Trainer(checkpoint,logger,debug=DEBUG, num_epochs=5)

if len(LOAD_FROM_CPKT) > 0 and  os.path.exists(LOAD_FROM_CPKT):
    model = CodeT5.load_from_checkpoint(
        LOAD_FROM_CPKT, 
        class_weights=class_weights, 
        num_classes=num_classes,
        dropout_rate=DROPOUT_RATE,
        with_layer_norm=WITH_LAYER_NORM,
        with_activation=WITH_ACTIVATION
    )
else:
    model = CodeT5(
        class_weights=class_weights, 
        num_classes=num_classes, 
        dropout_rate=DROPOUT_RATE,
        with_layer_norm=WITH_LAYER_NORM,
        with_activation=WITH_ACTIVATION
    )
model.model.train()

TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings, TRAIN_classes)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings, VAL_classes)
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE,num_workers=14, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=14)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


### Save Model Config to CSV 

In [10]:
modelConfigsCSV = f"/content/drive/MyDrive/Thesis/model-configs.csv"
if os.path.exists(modelConfigsCSV):
    modelConfig = {
        'name': MODEL_DIR,
        'tokenizer_max_length': TOKENIZER_MAX_LENGTH,
        'num_classes': num_classes,
        'dropout_rate': DROPOUT_RATE,
        'with_activation': WITH_ACTIVATION,
        'with_layer_norm': WITH_LAYER_NORM
    }
    modelConfig_df = pd.DataFrame([modelConfig])
    modelConfig_df.to_csv(modelConfigsCSV, mode='a', index=False, header=False)

### Run Training

In [11]:
trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader
)

/home/disras/miniconda3/envs/thesis/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/disras/projects/JSRepair/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                       | Params | Mode 
--------------------------------------------------------------------
0 | model        | T5ForConditionalGeneration | 222 M  | train
1 | layer_norm   | LayerNorm                  | 1.5 K  | train
2 | hidden_layer | Linear                     | 295 K  | train
3 | activation   | ReLU                       | 0      | train
4 | dropout      | Dropout                    | 0      | train
5 | classifier   | Linear                     | 3.8 K  | train
--------------------------------------------------------------------
223 M     Trainable params
0         Non-trainable params
223 M     Total params
892.731   Total estimated model params size (MB)
546       Modules in train mode
0         

Training: |          | 0/? [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
