# Training

In [1]:
import os
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split
from modules.TrainConfig import init_checkpoint, init_logger, Trainer, masker
from modules.models import CodeBertJS
from modules.datasets import CodeBertDataset
from modules.filters import mask_code_diff, add_labels
from torch.utils.data import DataLoader
import sqlite3
import pandas as pd
import json
import os
import torch

In [2]:
HF_DIR = 'microsoft/codebert-base-mlm'
TOKENIZER_MAX_LENGTH = 512
DB_PATH = 'commitpack-datasets.db' if os.path.exists('commitpack-datasets.db') else '/content/drive/MyDrive/Thesis/commitpack-datasets.db'
DB_TABLE = 'commitpackft_classified_train'
if not os.path.exists(DB_PATH):
    raise RuntimeError('sqlite3 path doesnt exist.')
VAL_SIZE = 0.3
LOG_PATH = 'logs' if os.path.exists('logs') else '/content/drive/MyDrive/Thesis/logs'
VERSION = int(input('Training version: '))
LOAD_FROM_CPKT = input("Load from existing model (type cpkt path if true): ")
DEBUG = True if int(input('Debug Run (1,0): ')) == 1 else False
BATCH_SIZE = 2 if DEBUG else 32
CPKT_PATH = CPKT_PATH = 'checkpoints' if os.path.exists('checkpoints') else '/content/drive/MyDrive/Thesis/checkpoints'
DROPOUT_RATE = float(input('Type dropout rate for classifier: '))
WITH_MOBILE = True if int(input('Consider mobile class (1,0): ')) == 1 else False
WITH_LAYER_NORM = True
WITH_ACTIVATION = True

if WITH_MOBILE:
    classLabels = {
        "mobile" : 0.,
        "functionality" : 0.,
        "ui-ux" : 0.,
        "compatibility-performance" : 0.,
        "network-security" : 0.,
        "general": 0.
    }
else:
    classLabels = {
        "functionality" : 0.,
        "ui-ux" : 0.,
        "compatibility-performance" : 0.,
        "network-security" : 0.,
        "general": 0.
    }

num_classes = len(classLabels.keys())

modelSize = HF_DIR.split('/')[-1]
MODEL_DIR = f"CodeBert_{modelSize}_JS_{num_classes}_classes_{TOKENIZER_MAX_LENGTH}MaxL"
con = sqlite3.connect(DB_PATH)


## Types of Bugs distribution in samples

In [3]:
with open('bug-type-dist-query_train.sql', 'r') as f:
    query = f.read()

info_df = pd.read_sql_query(query, con)
info_df

Unnamed: 0,count(*),bug_type
0,90,mobile
1,2862,general
2,3147,ui-ux
3,3159,network-security
4,4396,compatibility-performance
5,4532,functionality


## Create Classification Labels
```json
{
    "mobile" : 0,
    "functionality" : 0,
    "ui-ux" : 0,
    "compatibility-performance" : 0,
    "network-security" : 0,
    "general": 0
}

Ένα δείγμα που κατηγοριοποιήθηκε ως σφάλμα λειτουργικότητας(functionality) και ui-ux θα έχει διάνυσμα ταξινόμησης ->
[0,1,1,0,0,0]
```

In [4]:
tokenizer = RobertaTokenizer.from_pretrained(HF_DIR)


def load_ds() -> pd.DataFrame:
    query = f"select * from {DB_TABLE}"
    ds_df = pd.read_sql_query(query, con)
    return ds_df

ds_df = load_ds()

ds_df['class_labels'] = ds_df['bug_type'].apply(lambda bT: add_labels(bT.split(','), classLabels))
if DEBUG:
    ds_df = ds_df.iloc[:10]

if not WITH_MOBILE:
    ds_df = ds_df[ds_df['bug_type'] != 'mobile']

## Mask Input Sequences

In [5]:
# ds_df['masked_old_contents'] = ds_df.apply(lambda row: mask_code_diff(row['old_contents'], row['new_contents'], tokenizer), axis=1)
# ds_df.head()

In [6]:
old_codes = ds_df[['message', 'old_contents', 'class_labels']]
old_codes['input_seq'] = old_codes['message'] + ' ' + tokenizer.sep_token + ' ' + old_codes['old_contents']
new_codes = ds_df[['message', 'new_contents', 'class_labels']]

TRAIN_old, VAL_old, TRAIN_new, VAL_new = train_test_split(old_codes, new_codes, test_size=VAL_SIZE, random_state=42)

print(f"Total training samples: {len(ds_df)}")

Total training samples: 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  old_codes['input_seq'] = old_codes['message'] + ' ' + tokenizer.sep_token + ' ' + old_codes['old_contents']


In [7]:
TRAIN_masked, VAL_masked, TRAIN_gt, VAL_gt = train_test_split(ds_df['old_contents'].tolist(), ds_df['new_contents'].tolist(), test_size=VAL_SIZE, random_state=42)

TRAIN_encodings = tokenizer(
    TRAIN_masked,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

VAL_encodings = tokenizer(
    VAL_masked,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

TRAIN_GT_input_ids = tokenizer(
    TRAIN_gt,
    max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
).input_ids

VAL_GT_input_ds = tokenizer(
    VAL_gt,
        max_length=TOKENIZER_MAX_LENGTH,
    pad_to_max_length=True,
    return_tensors='pt',
    padding='max_length',
    truncation=True
).input_ids

In [8]:
TRAIN_classes = torch.tensor(TRAIN_old['class_labels'].tolist())
VAL_classes = torch.tensor(VAL_old['class_labels'].tolist())
TRAIN_classes

tensor([[1., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]])

In [9]:
num_samples = TRAIN_classes.size(0)
num_classes = TRAIN_classes.size(1)

pos_counts = torch.sum(TRAIN_classes, dim=0)
neg_counts = num_samples - pos_counts
class_weights = neg_counts / (pos_counts + 1e-6)
class_weights = class_weights.numpy()

In [10]:
logger = init_logger(log_path=LOG_PATH, model_dir=MODEL_DIR, version=VERSION)
checkpoint = init_checkpoint(cpkt_path=CPKT_PATH, model_dir=MODEL_DIR, version=VERSION)
trainer = Trainer(checkpoint=checkpoint,logger=logger,debug=DEBUG, num_epochs=7)

if len(LOAD_FROM_CPKT) > 0 and  os.path.exists(LOAD_FROM_CPKT):
    model = CodeBertJS.load_from_checkpoint(
        LOAD_FROM_CPKT, 
        class_weights=class_weights,
        num_classes=num_classes,
        dropout_rate=DROPOUT_RATE,
        with_activation=WITH_ACTIVATION,
        with_layer_norm=WITH_LAYER_NORM,
        tokenizer=tokenizer
    )
else:
    model = CodeBertJS(
        class_weights=class_weights,
        num_classes=num_classes,
        dropout_rate=DROPOUT_RATE,
        with_activation=WITH_ACTIVATION,
        with_layer_norm=WITH_LAYER_NORM,
        tokenizer=tokenizer
    )
model.encoder.train()


TRAIN_dataset = CodeBertDataset(TRAIN_encodings, TRAIN_GT_input_ids, TRAIN_classes)
VAL_dataset = CodeBertDataset(VAL_encodings, VAL_GT_input_ds, VAL_classes)
dataloader = DataLoader(TRAIN_dataset, batch_size=BATCH_SIZE,num_workers=14, shuffle=True)
val_dataloader = DataLoader(VAL_dataset, batch_size=1, num_workers=14)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
trainer.fit(
    model,
    train_dataloaders=dataloader,
    val_dataloaders=val_dataloader
)

/home/disras/miniconda3/envs/thesis/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/disras/projects/JSRepair/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type               | Params | Mode 
------------------------------------------------------------
0 | encoder      | RobertaForMaskedLM | 124 M  | train
1 | layer_norm   | LayerNorm          | 1.5 K  | train
2 | hidden_layer | Linear             | 590 K  | train
3 | activation   | ReLU               | 0      | train
4 | dropout      | Dropout            | 0      | train
5 | classifier   | Linear             | 3.8 K  | train
------------------------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params
501.174   Total estimated model params size (MB)
235       Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.
