In [None]:
!pip install transformers
!pip install datasets
!pip install scikit-learn
!pip install to_pandas
!pip install numpy
!pip install tqdm

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd '/content/drive/MyDrive/soft-fault'

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import numpy as np
import random


In [None]:
# Load the training and validation splits. The paths are relative, so they
# resolve against the working directory selected by the earlier `cd` cell.
train_dataset = pd.read_csv("data/train.csv")
val_dataset = pd.read_csv("data/validation.csv")


# NOTE(review): this is not the last expression of its cell, so with default
# notebook settings the summary is computed but not displayed.
train_dataset['bug'].describe()

def create_one_hot_encoding(bug_value):
    """Turn a `bug` value into a two-element one-hot label.

    Returns ``[0, 1]`` when ``bug_value`` is strictly greater than zero and
    ``[1, 0]`` for every other value.
    """
    return [0, 1] if bug_value > 0 else [1, 0]

# Derive the two-class one-hot label from the raw `bug` column.
train_dataset['one-hot'] = train_dataset['bug'].apply(create_one_hot_encoding)
val_dataset['one-hot'] = val_dataset['bug'].apply(create_one_hot_encoding)

# Expose the source text under the column name `Code`.
train_dataset = train_dataset.rename(columns={'source_code': 'Code'})
val_dataset = val_dataset.rename(columns={'source_code': 'Code'})

# Keep only rows that actually carry code: first drop empty strings, then
# drop missing values.
train_dataset = train_dataset.loc[train_dataset['Code'] != ''].dropna(subset=['Code'])
val_dataset = val_dataset.loc[val_dataset['Code'] != ''].dropna(subset=['Code'])


In [None]:
train_dataset

In [None]:
# Number of passes over the training data, and number of samples per batch.
EPOCHS = 10
BATCH_SIZE = 12

# Tokenizer for the same "CAUKiel/JavaBERT" checkpoint that bugClassifier loads.
tokenizer = transformers.AutoTokenizer.from_pretrained("CAUKiel/JavaBERT")

class bugClassifier(nn.Module):
    """Pretrained "CAUKiel/JavaBERT" encoder followed by a linear classification head.

    The head consumes the first-token representation taken from each of the
    last ``N_POOLED_LAYERS`` hidden states, concatenated along the feature
    axis (last layer first), and emits ``N_CLASSES`` raw logits per sample.
    """
    DROPOUT_PROB = 0.1
    N_CLASSES = 2
    # How many of the final hidden states are concatenated for the head.
    N_POOLED_LAYERS = 4

    def __init__(self):
        super().__init__()
        self.model = transformers.AutoModel.from_pretrained("CAUKiel/JavaBERT", output_hidden_states=True)
        self.dropout = nn.Dropout(self.DROPOUT_PROB)
        # Read the encoder width from its config instead of hard-coding 768,
        # so the head always matches the loaded checkpoint.
        hidden_size = self.model.config.hidden_size
        self.linear = nn.Linear(hidden_size * self.N_POOLED_LAYERS, self.N_CLASSES)
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        """Return raw logits computed from the last four hidden states.

        Args:
            ids: token-id tensor passed to the encoder as its first argument.
            mask: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head; one row of ``N_CLASSES`` logits
            per input sequence (no sigmoid/softmax applied).
        """
        hidden_states = self.model(ids, attention_mask=mask)["hidden_states"]

        # Slice out the first token of each selected layer *before*
        # concatenating. This yields the same values as concatenating the
        # full sequences and slicing afterwards, without materialising a
        # stacked copy of every layer or a 4x-wide full-sequence tensor.
        first_token_states = [
            hidden_states[-offset][:, 0]
            for offset in range(1, self.N_POOLED_LAYERS + 1)
        ]
        pooled = torch.cat(first_token_states, dim=-1)

        return self.linear(self.dropout(pooled))


def get_model():
    """Construct and return a new ``bugClassifier`` instance."""
    return bugClassifier()

In [None]:
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` style string.

    The value is rounded to the nearest whole second and formatted through
    ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def tokenize_truncate(tokenizer, text_samples, max_length):
    """Tokenize every sample, truncating to ``max_length`` tokens, without padding.

    Args:
        tokenizer: object exposing ``encode(text=..., add_special_tokens=...,
            max_length=..., truncation=..., padding=...)``.
        text_samples: iterable of texts to encode.
        max_length: maximum number of tokens per sample, special tokens included.

    Returns:
        A list with exactly one token-id list per input sample, in input order.

    Raises:
        ValueError: if a sample cannot be tokenized. Previously the failure was
            only printed and the *previous* sample's ids were appended again
            (or a NameError was raised for the first sample), silently pairing
            wrong tokens with that sample's label. Skipping the sample is not
            an option either, because callers zip the result with the labels.
    """
    full_input_ids = []

    # For each training example...
    for position, text in enumerate(text_samples):
        # Tokenize the sample.
        try:
            input_ids = tokenizer.encode(text=text,               # Text to encode.
                                         add_special_tokens=True, # Do add specials.
                                         max_length=max_length,   # Do Truncate!
                                         truncation=True,         # Do Truncate!
                                         padding=False)           # DO NOT pad.
        except Exception as err:
            # Show the offending sample, then stop instead of corrupting the data.
            print(text)
            print('-'*50)
            raise ValueError(
                'Could not tokenize sample at position {:}'.format(position)
            ) from err

        # Add the tokenized result to our list.
        full_input_ids.append(input_ids)

    print('DONE. {:>10,} samples\n'.format(len(full_input_ids)))
    return full_input_ids


def build_batches(samples, batch_size):
    """Cut ``samples`` into contiguous batches chosen at random start positions.

    ``samples`` is a list of ``(sequence, label)`` pairs. Batches are removed
    from it one at a time, so the list is empty when the function returns.

    Returns:
        ``(batch_ordered_text, batch_ordered_labels)``: two parallel lists,
        each holding one list per batch.
    """
    batch_ordered_text, batch_ordered_labels = [], []

    print('Creating batches of size {:}...'.format(batch_size))

    # Keep carving batches out until nothing is left.
    while samples:
        # Full-size batch, except possibly when fewer samples remain.
        take = min(batch_size, len(samples))

        # Random start position such that the slice still fits.
        start = random.randint(0, len(samples) - take)
        stop = start + take
        chunk = samples[start:stop]

        # Split the pairs into a list of sequences and a list of labels.
        batch_ordered_text.append([pair[0] for pair in chunk])
        batch_ordered_labels.append([pair[1] for pair in chunk])

        # Drop the consumed slice from the pool.
        del samples[start:stop]

    print('\t  DONE - Selected {:,} batches.\n'.format(len(batch_ordered_text)))
    return batch_ordered_text, batch_ordered_labels


def add_padding_per_batch(tokenizer, batch_ordered_text, batch_ordered_labels):
    """Pad each batch to its own longest sequence and convert it to tensors.

    Args:
        tokenizer: object exposing ``pad_token_id``.
        batch_ordered_text: list of batches, each a list of token-id lists.
        batch_ordered_labels: list of batches of labels, parallel to the text.

    Returns:
        A tuple ``(final_input_ids, final_attention_masks, final_labels)`` of
        three lists holding one tensor per batch. Attention masks are ``1`` on
        real tokens and ``0`` on padding.
    """
    print('Padding out sequences within each batch...')

    final_input_ids, final_attention_masks, final_labels = [], [], []

    for sequences, labels in zip(batch_ordered_text, batch_ordered_labels):
        # Lengths already include any special tokens added at tokenization.
        lengths = [len(seq) for seq in sequences]
        widest = max(lengths)

        # Right-pad every sequence up to the widest one in this batch.
        padded_rows = [
            seq + [tokenizer.pad_token_id] * (widest - size)
            for seq, size in zip(sequences, lengths)
        ]
        # Mask: ones over real tokens, zeros over the padding tail.
        mask_rows = [[1] * size + [0] * (widest - size) for size in lengths]

        final_input_ids.append(torch.tensor(padded_rows))
        final_attention_masks.append(torch.tensor(mask_rows))
        final_labels.append(torch.tensor(np.array(labels)))

    print('\t DONE. Returning final smart-batched data.')
    return (final_input_ids, final_attention_masks, final_labels)


def smart_batching(tokenizer, max_length, text_samples, labels, batch_size):
    """Tokenize, length-sort, randomly batch and pad a set of labelled texts.

    Returns the ``(input_ids, attention_masks, labels)`` tuple produced by
    ``add_padding_per_batch``: three lists with one tensor per batch.
    """
    # Tokenize and truncate; padding is applied later, per batch.
    token_ids = tokenize_truncate(tokenizer, text_samples, max_length)

    # Pair each sequence with its label and order the pairs by sequence length,
    # so that contiguous batches hold sequences of similar length.
    by_length = list(zip(token_ids, labels))
    by_length.sort(key=lambda pair: len(pair[0]))

    # Carve contiguous batches out of the sorted pairs at random positions.
    batched_ids, batched_labels = build_batches(by_length, batch_size)

    # Pad every batch only up to its own longest sequence.
    return add_padding_per_batch(tokenizer, batched_ids, batched_labels)



In [None]:
# Build the batches used for the first epoch; sequences are truncated to 512 tokens.
train_input_ids, train_attn_masks, train_labels = smart_batching(tokenizer, 512, train_dataset['Code'], train_dataset['one-hot'], BATCH_SIZE)
val_input_ids, val_attn_masks, val_labels = smart_batching(tokenizer, 512, val_dataset['Code'], val_dataset['one-hot'], BATCH_SIZE)


In [None]:
def get_optimizer(model):
    """Return an AdamW optimizer over all of ``model``'s parameters.

    Uses a learning rate of 5e-5 and an epsilon of 1e-8.
    """
    return torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

def get_scheduler(optimizer, num_train_steps):
    """Return a linear learning-rate schedule with no warmup steps.

    ``num_train_steps`` is passed as the total number of training steps of
    the schedule.
    """
    return transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

In [None]:
def loss_fn(outputs, labels):
    """Binary cross-entropy (with logits) between ``outputs`` and ``labels``.

    Returns ``None`` when no labels are supplied; otherwise the labels are
    cast to float before the loss is computed.
    """
    if labels is not None:
        criterion = nn.BCEWithLogitsLoss()
        return criterion(outputs, labels.float())
    return None

In [None]:
from sklearn.metrics import accuracy_score

def getAccuracy(preds, labels, threshold=0.5):
    """Compute the accuracy of thresholded probabilities against the labels.

    Args:
        preds: list of per-sample probability tensors.
        labels: list of per-sample label tensors, parallel to ``preds``.
        threshold: probabilities greater than or equal to this value count as
            a positive prediction. Defaults to 0.5.

    Returns:
        The value returned by sklearn's ``accuracy_score`` for the stacked
        labels and binarized predictions.
    """
    prob_preds = torch.stack(preds).cpu().detach().numpy()
    flabels = torch.stack(labels).cpu().detach().numpy()

    # Binarize directly. The previous `np.zeros((len(preds), 23))` buffer was
    # dead code: it was overwritten by this comparison on the very next line.
    label_predictions = (prob_preds >= threshold).astype(int)

    # accuracy_score from sklearn calculates subset accuracy
    return accuracy_score(flabels, label_predictions)

In [None]:
import time

def train_fn(train_input_ids, train_attn_masks, train_labels, model, optimizer, scheduler):
    """Run one training epoch over pre-built batches.

    Args:
        train_input_ids: list of per-batch token-id tensors.
        train_attn_masks: list of per-batch attention-mask tensors.
        train_labels: list of per-batch label tensors.
        model: module called as ``model(ids=..., mask=...)``; switched to
            train mode here.
        optimizer: stepped once per batch.
        scheduler: stepped once per batch, right after the optimizer.

    Returns:
        ``(train_loss, final_outputs, final_targets)``: the sum of the
        per-batch losses, a list of per-sample sigmoid outputs and a list of
        per-sample target tensors.
    """
    print("Starting training... ")

    update_interval = 500
    start_time = time.time()
    n_batches = len(train_input_ids)

    # Run on whatever device the model already lives on instead of assuming
    # 'cuda'; for a model moved to the GPU this is the same device as before.
    device = next(model.parameters()).device

    train_loss = 0.0
    model.train()

    final_targets = []
    final_outputs = []

    # for each batch
    for step in range(n_batches):
        # Progress update every `update_interval` batches.
        if step % update_interval == 0 and step != 0:
            elapsed_sec = time.time() - start_time

            # Estimate the time remaining from the average seconds per batch.
            sec_per_step = elapsed_sec / step
            remaining = format_time(sec_per_step * (n_batches - step))
            elapsed = format_time(elapsed_sec)

            # Report progress.
            print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, n_batches, elapsed, remaining))

        ids = train_input_ids[step].to(device, dtype=torch.long)
        mask = train_attn_masks[step].to(device, dtype=torch.long)
        targets = train_labels[step].to(device, dtype=torch.float)

        optimizer.zero_grad()

        outputs = model(ids=ids, mask=mask)

        loss = loss_fn(outputs, targets)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        scheduler.step()

        final_targets.extend(targets)
        # Detach before storing: these values are only used for metrics, and
        # undetached tensors would keep autograd history alive for the epoch.
        final_outputs.extend(torch.sigmoid(outputs.detach()))

    return train_loss, final_outputs, final_targets


In [None]:
import time
def eval_fn(test_input_ids, test_attn_masks, test_labels, model):
    """Evaluate ``model`` on pre-built batches without tracking gradients.

    Args:
        test_input_ids: list of per-batch token-id tensors.
        test_attn_masks: list of per-batch attention-mask tensors.
        test_labels: list of per-batch label tensors.
        model: module called as ``model(ids=..., mask=...)``; switched to
            eval mode here.

    Returns:
        ``(eval_loss, final_outputs, final_targets)``: the sum of the
        per-batch losses, a list of per-sample sigmoid outputs and a list of
        per-sample target tensors.
    """
    print('\nStarting evaluation... ')

    update_interval = 100
    start_time = time.time()
    n_batches = len(test_input_ids)

    # Run on whatever device the model already lives on instead of assuming
    # 'cuda'; for a model moved to the GPU this is the same device as before.
    device = next(model.parameters()).device

    eval_loss = 0.0

    model.eval()

    final_targets = []
    final_outputs = []

    with torch.no_grad():
        for step in range(n_batches):
            # Progress update every `update_interval` batches.
            if step % update_interval == 0 and step != 0:
                elapsed_sec = time.time() - start_time

                # Estimate the time remaining from the average seconds per batch.
                sec_per_step = elapsed_sec / step
                remaining = format_time(sec_per_step * (n_batches - step))
                elapsed = format_time(elapsed_sec)

                # Report progress.
                print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, n_batches, elapsed, remaining))

            ids = test_input_ids[step].to(device, dtype=torch.long)
            mask = test_attn_masks[step].to(device, dtype=torch.long)
            targets = test_labels[step].to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)

            loss = loss_fn(outputs, targets)

            eval_loss += loss.item()
            final_targets.extend(targets)
            final_outputs.extend(torch.sigmoid(outputs))

    return eval_loss, final_outputs, final_targets

In [None]:
def save_checkpoint(epoch, optimizer, scheduler, model, train_loss, test_loss, path="models/model.bin"):
    """Write a resumable training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict``, ``scheduler_state_dict``, ``train_loss`` and
    ``test_loss``. An existing file at ``path`` is overwritten.

    Args:
        epoch: epoch number stored in the checkpoint.
        optimizer, scheduler, model: objects whose ``state_dict()`` is stored.
        train_loss, test_loss: loss values stored alongside the state.
        path: destination file. Defaults to the previously hard-coded
            "models/model.bin".
    """
    import os

    # torch.save does not create missing directories.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_loss': train_loss,
        'test_loss': test_loss
    }, path)

    # Report the file that was actually written; the old message named a
    # per-epoch "hs_checkpoint_<epoch>.bin" file that is never created.
    print("Saved checkpoint for epoch " + str(epoch) + " to " + path)

In [None]:
from tqdm import tqdm

# Total number of scheduler steps: one per batch, for every epoch.
n_train_steps = len(train_input_ids) * EPOCHS

# Build the classifier and move it to the GPU.
model = get_model()
model.to('cuda')

optimizer = get_optimizer(model)
scheduler = get_scheduler(optimizer, n_train_steps)

# Load checkpoint from a previous model
# (uncomment to resume training; `epoch` then continues after the saved one)
# checkpoint = torch.load('/content/drive/MyDrive/path/in/drive/model.bin')
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
# best_eval_loss = checkpoint['test_loss']
# epoch = checkpoint['epoch'] + 1
# Start from the first epoch when no checkpoint is restored.
epoch = 0

In [None]:
# Main training loop: one pass of training plus validation per epoch, with the
# metrics appended to a log file and a checkpoint written after every epoch.
while epoch < EPOCHS:
    print("\t\t epoch: ", epoch)

    if epoch > 0: # This is not cross-validation!
        # Re-batch from the second epoch on so the random batch composition
        # changes between epochs. Use BATCH_SIZE (not a literal) so the number
        # of batches stays in line with the n_train_steps the scheduler was
        # sized for.
        train_input_ids, train_attn_masks, train_labels = smart_batching(tokenizer, 512, train_dataset['Code'], train_dataset['one-hot'], BATCH_SIZE)
        val_input_ids, val_attn_masks, val_labels = smart_batching(tokenizer, 512, val_dataset['Code'], val_dataset['one-hot'], BATCH_SIZE)

    train_loss, train_preds, train_true_labels = train_fn(train_input_ids, train_attn_masks, train_labels, model, optimizer, scheduler)
    eval_loss, eval_preds, eval_true_labels = eval_fn(val_input_ids, val_attn_masks, val_labels, model)

    # Both loss values are sums over batches; report the per-batch average.
    avg_train_loss, avg_val_loss = train_loss / len(train_input_ids), eval_loss / len(val_input_ids)
    train_acc = getAccuracy(train_preds, train_true_labels)
    eval_acc = getAccuracy(eval_preds, eval_true_labels)

    train_info = "Avg Train loss (loss/batch): " + str(avg_train_loss) +  "\t Train accuracy: " + str(train_acc) + "\n"
    val_info = "Avg Valid loss (loss/batch): " + str(avg_val_loss) + "\t Validation accuracy: " + str(eval_acc) + "\n\n"

    # The context manager closes the log file even if a write fails.
    with open("/content/loss.txt", "a") as f:
        f.write(train_info)
        f.write(val_info)
    print(train_info)
    print(val_info)

    # No scheduler.step() here: train_fn already steps the scheduler once per
    # batch, and the schedule was created for exactly
    # len(train_input_ids) * EPOCHS steps. An extra step per epoch would
    # overrun that count.
    save_checkpoint(epoch, optimizer, scheduler, model, train_loss, eval_loss)
    epoch = epoch + 1



From this cell onwards, we just test the final checkpoint:

In [None]:
from tqdm import tqdm

# Rebuild the classifier and move it to the GPU before restoring its weights.
model = get_model()
model.to('cuda')

# Load best model iteration for testing
# NOTE(review): save_checkpoint writes to "models/model.bin" and overwrites it
# after every epoch, so that file holds the most recent epoch, not a selected
# best one. The path below looks like a placeholder -- confirm which file is meant.
checkpoint = torch.load('/content/drive/MyDrive/path/in/drive/model.bin')
model.load_state_dict(checkpoint['model_state_dict'])
# NOTE(review): `optimizer` and `scheduler` are not created in this cell; they
# must already exist from the training-setup cell for these two lines to run.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
epoch = checkpoint['epoch']
best_eval_loss = checkpoint['test_loss']


In [None]:
from ast import literal_eval

# Load the test set. Its `one-hot` column is read from the CSV as text, so
# literal_eval parses each cell back into a Python object.
test_dataset = pd.read_csv("/content/ds.csv")
test_dataset['one-hot'] = test_dataset['one-hot'].apply(literal_eval)

# Batch the test samples (read from the lowercase `code` column) the same way
# as the training data.
test_input_ids, test_attn_masks, test_labels = smart_batching(tokenizer, 512, test_dataset['code'], test_dataset['one-hot'], BATCH_SIZE)

# Run the restored model over the test batches.
test_loss, test_preds, test_true_labels = eval_fn(test_input_ids, test_attn_masks, test_labels, model)

In [None]:
# Accuracy of the thresholded test predictions, as computed by getAccuracy.
test_accuracy = getAccuracy(test_preds, test_true_labels)

# subset accuracy
print(test_accuracy)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Stack the per-sample probabilities into one array.
prob_preds = torch.stack(test_preds)
prob_preds = prob_preds.cpu().detach().numpy()

# Binarize with `>= 0.5`, the same rule getAccuracy applies, so the confusion
# matrix and the reported accuracy are based on identical predictions.
label_predictions = (prob_preds >= 0.5).astype(int)

flabels = torch.stack(test_true_labels)
flabels = flabels.cpu().detach().numpy()


# Build confusion matrix
cf_matrix = multilabel_confusion_matrix(flabels, label_predictions) #use this to get FP, FN, TP, TN to calculate accuracy

cf_matrix

In [None]:
from sklearn.metrics import classification_report
# Text report built from `flabels` and `label_predictions`, which are defined
# in the confusion-matrix cell.
cr = classification_report(flabels, label_predictions)
print(cr)