In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells fall back to
# files under /content/drive/MyDrive/Thesis when no local copy is present.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!git clone https://github.com/radistoubalidis/JSRepair.git

!pip install pytorch_lightning
!pip install datasets
!pip install python-dotenv

In [None]:
# Work from inside the cloned repository so the `modules.*` imports in the next
# cell resolve (presumably the relative file paths used later do as well).
%cd ./JSRepair

In [None]:
import os
import pandas as pd
import sqlite3
from transformers import (
    RobertaTokenizer,
)
from modules.models import CodeT5
from modules.datasets import CodeT5Dataset
from modules.TrainConfig import init_logger, init_checkpoint, Trainer
from modules.filters import add_labels
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from typing import List
import torch

In [None]:
# Pretrained checkpoint on the Hugging Face hub and the fixed token length
# every sample is padded/truncated to.
HF_DIR = 'Salesforce/codet5-small'
TOKENIZER_MAX_LENGTH = 420

# Prefer a database in the working directory; otherwise use the copy on Drive.
if os.path.exists('commitpack-datasets.db'):
    DB_PATH = 'commitpack-datasets.db'
else:
    DB_PATH = '/content/drive/MyDrive/Thesis/commitpack-datasets.db'
DB_TABLE = 'commitpackft_classified_train'

# Fail early, before prompting the user, when neither location has the database.
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')

# Fraction of the samples held out for validation.
VAL_SIZE = 0.3

# Same local-first / Drive-fallback rule for the log directory.
if os.path.exists('logs'):
    LOG_PATH = 'logs'
else:
    LOG_PATH = '/content/drive/MyDrive/Thesis/logs'
MODEL_DIR = 'CodeT5JS'

# Interactive run parameters (asked in this order).
VERSION = int(input('Training version: '))
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
DEBUG = int(input('Debug Run (1,0): ')) == 1

# Debug runs use a smaller batch.
BATCH_SIZE = 8 if DEBUG else 32

# Same local-first / Drive-fallback rule for the checkpoint directory.
if os.path.exists('checkpoints'):
    CPKT_PATH = 'checkpoints'
else:
    CPKT_PATH = '/content/drive/MyDrive/Thesis/checkpoints'

# Distribution of Bug Types Across Samples

In [None]:
# Open the SQLite database. `con` is left open on purpose: the dataset-loading
# cell further down reads through it and closes it afterwards.
con = sqlite3.connect(DB_PATH)

# The bug-type distribution query is kept in a standalone .sql file.
with open('bug-type-dist-query.sql') as sql_file:
    query = sql_file.read()

# Run the query and display the result as the cell output.
info_df = pd.read_sql_query(sql=query, con=con)
info_df

# Create Classification Labels

```json
{
    "mobile" : 0,
    "functionality" : 0,
    "ui-ux" : 0,
    "compatibility-performance" : 0,
    "network-security" : 0,
    "general": 0
}
```
A sample classified as both a functionality bug and a ui-ux bug gets the following classification vector (positions follow the key order above):
```[0,1,1,0,0,0]```


In [None]:
def load_ds() -> pd.DataFrame:
    """Read every row of the `DB_TABLE` table into a DataFrame.

    Uses the notebook-level SQLite connection `con`, which must still be open
    when this function is called.

    Returns:
        pd.DataFrame: all rows and columns of the table.
    """
    select_all = f"select * from {DB_TABLE}"
    return pd.read_sql_query(select_all, con)

# Load the samples, then release the SQLite connection even if the read fails;
# nothing after this point needs the database.
try:
    ds_df = load_ds()
finally:
    con.close()

# Attach the classification vector derived from each sample's `bug_type`.
ds_df['class_labels'] = ds_df['bug_type'].apply(add_labels)

# `old_contents` is tokenized as the model input and `new_contents` as the target
# in the next cell; both frames carry the same labels.
old_codes = ds_df[['old_contents', 'class_labels']]
new_codes = ds_df[['new_contents', 'class_labels']]

# Splitting both frames in one call keeps input/target rows aligned; the fixed
# random_state makes the split reproducible.
TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(old_codes, new_codes, test_size=VAL_SIZE, random_state=42)

print(f"Total training samples: {len(ds_df)}")
# The figure above counts the whole table; report the actual split sizes as well.
print(f"Train / validation split: {len(TRAIN_old)} / {len(VAL_old)}")

## Dataset

In [None]:
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)


def _tokenize(texts: List[str]):
    """Tokenize a list of code snippets into fixed-length PyTorch tensors.

    Every sequence is truncated or padded to exactly TOKENIZER_MAX_LENGTH tokens.
    The deprecated `pad_to_max_length=True` flag is intentionally not passed:
    `padding='max_length'` already requests the same behaviour.

    Args:
        texts: the snippets to tokenize.

    Returns:
        The tokenizer output with `return_tensors='pt'`.
    """
    return tokenizer(
        texts,
        max_length=TOKENIZER_MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


# Encodings come from the old contents, decodings from the new contents.
TRAIN_encodings = _tokenize(TRAIN_old['old_contents'].tolist())
VAL_encodings = _tokenize(VAL_old['old_contents'].tolist())
TRAIN_decodings = _tokenize(TRAIN_new['new_contents'].tolist())
VAL_decodings = _tokenize(VAL_new['new_contents'].tolist())

## Convert Class Labels into tensors

In [None]:
def _labels_to_tensor(frame):
    """Stack the per-sample `class_labels` of `frame` into one tensor (one row per sample)."""
    label_rows = frame['class_labels'].tolist()
    return torch.tensor(label_rows)


TRAIN_classes = _labels_to_tensor(TRAIN_old)
VAL_classes = _labels_to_tensor(VAL_old)

# Initialize Training Settings

In [None]:
# Logger, checkpoint callback and trainer all come from the project's TrainConfig helpers.
logger = init_logger(log_path=LOG_PATH, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, model_dir=MODEL_DIR, version=VERSION)
trainer = Trainer(checkpoint,logger,debug=DEBUG, num_epochs=3)

# Resume from a checkpoint only when the typed path really exists. Whitespace is
# stripped so a stray space/newline in the prompt answer does not defeat the check.
resume_path = LOAD_FROM_CPKT.strip()
if resume_path and os.path.exists(resume_path):
    model = CodeT5.load_from_checkpoint(resume_path)
else:
    if resume_path:
        # Make the fallback visible instead of silently starting from scratch.
        print(f"Checkpoint '{resume_path}' not found; training a fresh CodeT5 model instead.")
    model = CodeT5()
# Switch the wrapped model to training mode
# (presumably the underlying Hugging Face model; confirm in modules.models).
model.model.train()

TRAIN_dataset = CodeT5Dataset(TRAIN_encodings, TRAIN_decodings, TRAIN_classes)
VAL_dataset = CodeT5Dataset(VAL_encodings, VAL_decodings, VAL_classes)

# Never request more loader workers than the machine has CPUs (the original
# hard-coded 7); oversubscribing only adds overhead and triggers a PyTorch warning.
NUM_WORKERS = min(7, os.cpu_count() or 1)
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=NUM_WORKERS)

In [None]:
# Fit the model using the training and validation data loaders.
trainer.fit(model, train_dataloaders=dataloader, val_dataloaders=val_dataloader)