In [1]:
%pip install transformers
import copy
import math
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset
from transformers import BertTokenizer, get_linear_schedule_with_warmup, BertForSequenceClassification, logging
logging.set_verbosity_error()

# **Global Configuration**

In [2]:
# --- Reproducibility ---------------------------------------------------------
SEED = 20  # Global seed value for reproducibility
# Seed every random number generator used by the notebook with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# --- Conventional data loading settings --------------------------------------
TRAINING_PORTION = 0.8
TESTING_PORTION = 0.2
BATCH_SIZE = 32
NUM_EPOCHS = 6

# --- K-Fold cross validation settings ----------------------------------------
# The number of folds is derived from the test portion (0.2 -> 5 folds).
N_SPLITS = int(1 / TESTING_PORTION)
KFOLD_SHUFFLE = True

# --- Model / tokenizer locations ---------------------------------------------
PROJECT_DIR = '/content/drive/MyDrive/BERT_SIZE_MEASUREMENT'
MODEL_NAME = "bert-base-uncased"

# 'bert-base-uncased' is passed through by name; any other model name is
# resolved to a directory of that name inside PROJECT_DIR.
MODEL_PATH = MODEL_NAME if MODEL_NAME == 'bert-base-uncased' else f"{PROJECT_DIR}/{MODEL_NAME}"
TOKENIZER_PATH = MODEL_PATH

# --- Dataset ------------------------------------------------------------------
DATA_DIR = f"{PROJECT_DIR}/data"
DATASET_FILE = f"{DATA_DIR}/UserStory_1007.csv"
TARGET_COL = 'cosmic_read'
INPUT_COL = 'sample_description'
SAVE_METRIC = 'ACCURACY'  # models will be saved based on this metric (ACCURACY or PRED30 or MSE)

In [None]:
# Mount Google Drive; PROJECT_DIR in the configuration cell points below this
# mount point, so this has to run before any file under PROJECT_DIR is touched.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

**Select the compute device (CUDA GPU if available, otherwise CPU)**

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device('cuda' if gpu_available else 'cpu')

if gpu_available:
    print('Number of available GPUs: {}'.format(torch.cuda.device_count()))
    print('Active GPU: {}'.format(torch.cuda.get_device_name(0)))
else:
    print('GPU resource was not found. Using CPU resource.')

# **Model Preparation**

In [None]:
def initialize_model():
    """Load a fresh sequence-classification model from MODEL_PATH.

    The model is created with ``num_labels=1``, i.e. a single output value
    per sample.

    Returns:
        The object returned by ``BertForSequenceClassification.from_pretrained``.
    """
    fresh_model = BertForSequenceClassification.from_pretrained(
        MODEL_PATH,
        num_labels=1,
    )
    return fresh_model

# Build one model instance and move it to the selected device.
# NOTE: the cross-validation cell rebinds `model` to a freshly initialized
# instance at the start of every fold.
# As the last expression of the cell, `model.to(device)` is also what the
# notebook echoes as the cell output.
model = initialize_model()
model.to(device)

# **Dataset Preparation**

In [None]:
# Load dataset file as pandas dataframe
dataset = pd.read_csv(DATASET_FILE, delimiter=',')
print("Total Samples: {}".format(dataset.shape[0]))

# Keep only the text and target columns. The explicit .copy() makes this an
# independent frame, so the column assignment below cannot trigger pandas'
# SettingWithCopyWarning or write into a temporary slice.
dataset = dataset[[INPUT_COL, TARGET_COL]].copy()
# Missing targets are treated as zero, then the column is cast to integers.
dataset[TARGET_COL] = dataset[TARGET_COL].fillna(0).astype(int)
print("Total Valid Samples: {}".format(len(dataset)))

display(dataset)

# Copy of the data that collects the per-sample predictions; the "prediction"
# column starts out empty (None) and is filled during cross-validation.
result_dataset = dataset.copy()
result_dataset["prediction"] = None


**Split the description and measurement columns for processing**

In [None]:
# Descriptions: cast to str, trim surrounding whitespace and convert line
# breaks to spaces.
raw_descriptions = dataset[INPUT_COL].values.astype("str")
inputs = [text.strip().replace("\n", " ") for text in raw_descriptions]

# Targets: the measurement column as an array.
targets = dataset[TARGET_COL].values

# The relative-error metrics (MRE / PRED30) divide by the label, so they are
# only enabled when every target is strictly positive.
CALCULATE_MRE = all(dataset[TARGET_COL] > 0)
print("Calculate MRE =", CALCULATE_MRE)

**Encode samples and create a TensorDataset**

In [7]:
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

# Find the max sample length in tokens (special tokens included).
# The generator keeps its loop variable local, so the built-in `input` is not
# shadowed in the notebook namespace and no scratch `input_ids` is left behind;
# default=0 keeps the result defined when there are no samples.
max_input_length = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in inputs),
    default=0,
)

print("Max input length: {}".format(max_input_length))

Max input length: 95


In [8]:
# Encode all samples in a single batched tokenizer call. Every sample is
# truncated / padded to max_input_length, so the returned tensors are already
# stacked as (number of samples, max_input_length); no per-sample loop
# (which would shadow the built-in `input`) or torch.cat is needed.
encoded_inputs = tokenizer(
    inputs,
    add_special_tokens=True,
    max_length=max_input_length,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_inputs["input_ids"]
attention_masks = encoded_inputs["attention_mask"]
labels = torch.tensor(targets, dtype=torch.float)

# NOTE: from here on `dataset` names the TensorDataset, not the DataFrame
# loaded earlier; each item is (input_ids, attention_mask, label).
dataset = TensorDataset(input_ids, attention_masks, labels)

# **Training & Evaluation Functions**

In [9]:
def train(model, optimizer, scheduler, train_loader):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Called as ``model(input_ids, attention_mask=..., labels=...)``;
            the returned object must expose ``.loss``.
        optimizer: Stepped once per batch.
        scheduler: Stepped once per batch, right after the optimizer.
        train_loader: Sized iterable of batches; ``batch[0]``, ``batch[1]`` and
            ``batch[2]`` are used as input ids, attention mask and labels.

    Returns:
        The sum of the per-batch loss values divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        model.zero_grad()
        # Move the three tensors of the batch to the active device.
        token_ids, mask, batch_labels = (batch[k].to(device) for k in range(3))
        result = model(token_ids, attention_mask=mask, labels=batch_labels)
        loss = result.loss
        batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
    return sum(batch_losses) / len(train_loader)

def evaluate(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Called as ``model(input_ids, attention_mask=...)``; the returned
            object must expose ``.logits``.
        data_loader: Sized iterable of batches; ``batch[0]``, ``batch[1]`` and
            ``batch[2]`` are used as input ids, attention mask and labels.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            scalar tensor; used to compute the reported loss.

    Returns:
        A tuple ``(label_data, prediction_data, mean_loss)``: the labels and the
        flattened logits of every sample as Python lists, and the sum of the
        per-batch criterion values divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` has length zero.
    """
    model.eval()
    total_loss = 0
    label_data = []
    prediction_data = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            # The labels are deliberately not forwarded to the model: the loss
            # reported here comes from `criterion`, so a loss computed inside
            # the model would be thrown away.
            output = model(input_ids, attention_mask=attention_mask)
            loss = criterion(output.logits.squeeze(-1), labels.float())
            total_loss += loss.item()
            label_data += labels.to('cpu').numpy().tolist()
            prediction_data += output.logits.to('cpu').numpy().flatten().tolist()

    return label_data, prediction_data, total_loss / len(data_loader)

In [None]:
# Results are written to <PROJECT_DIR>/results/<model>-<dataset file stem>-<target column>.
dataset_stem = DATASET_FILE.split("/")[-1].split(".")[0]
folder_name = MODEL_NAME + "-" + dataset_stem + "-" + TARGET_COL
results_folder = PROJECT_DIR + "/results/" + folder_name
print(results_folder)
# Create the folder (and any missing parents) from Python instead of a shell
# escape, so paths containing spaces or shell metacharacters are handled
# correctly; exist_ok=True makes the cell safe to re-run.
os.makedirs(results_folder, exist_ok=True)

# **Run Cross-Validation**

In [None]:
# K-fold cross-validation driver.
#
# For every fold a fresh model is trained for NUM_EPOCHS epochs and evaluated
# on the held-out part after each epoch. Depending on SAVE_METRIC, the best
# epoch of a fold provides the predictions stored in `result_dataset`, and the
# best epoch across all folds provides `overall_best_model`. The text log is
# accumulated in `out`; per-fold summary metrics are collected in
# `pred30s`, `mres`, `mses`, `maes` and `nmaes`.

def _store_fold_predictions(frame, row_indices, fold_preds):
    """Write the predictions of one fold into the "prediction" column of ``frame``."""
    for position, row_index in enumerate(row_indices):
        frame.at[row_index, "prediction"] = fold_preds[position]


# Fail before any training starts when the configuration can never select a
# model; otherwise all folds would be trained and the notebook would only
# break afterwards, with no predictions and no model saved.
if SAVE_METRIC not in ("ACCURACY", "PRED30", "MSE"):
    raise ValueError(f"Unsupported SAVE_METRIC {SAVE_METRIC!r}; expected ACCURACY, PRED30 or MSE")
if SAVE_METRIC == "PRED30" and not CALCULATE_MRE:
    raise ValueError("SAVE_METRIC 'PRED30' requires strictly positive targets (CALCULATE_MRE is False)")

# scikit-learn rejects a random_state when shuffle is disabled, so the seed is
# only passed along when the folds are actually shuffled.
kf = KFold(n_splits=N_SPLITS,
           random_state=SEED if KFOLD_SHUFFLE else None,
           shuffle=KFOLD_SHUFFLE)

out = ""

header_lines = (
    "              Model Name: {}".format(MODEL_NAME),
    "        Number of Splits: {}".format(N_SPLITS),
    "                 Shuffle: {}".format(KFOLD_SHUFFLE),
)
for header_line in header_lines:
    out += header_line + "\n"
    print(header_line)

# Per-fold summary metrics (one entry per fold).
pred30s = []
mres = []
mses = []
maes = []
nmaes = []

# Best values seen across all folds; used to pick `overall_best_model`.
overall_min_mse = float("inf")
overall_max_pred30 = -1
overall_max_accuracy = -1
overall_best_model = None

for fold, (train_index, test_index) in enumerate(kf.split(dataset)):
    # RESET WEIGHTS IN EACH FOLD:
    model = initialize_model().to(device)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

    # GENERATE DATALOADERS IN EACH FOLD:
    train_subset = torch.utils.data.Subset(dataset, train_index)
    test_subset = torch.utils.data.Subset(dataset, test_index)
    train_dataloader = torch.utils.data.DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_subset, batch_size=BATCH_SIZE, shuffle=False)

    # The scheduler is stepped once per batch, hence batches * epochs steps.
    total_steps = len(train_dataloader) * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    fold_text = "\n                    FOLD: {}\n".format(fold+1)
    out += fold_text + "\n"
    print(fold_text)

    # Best values seen within this fold; used to pick the stored predictions.
    fold_min_mse = float("inf")
    fold_max_pred30 = -1
    fold_max_accuracy = -1
    results = []

    for epoch in range(NUM_EPOCHS):
        train_loss = train(model, optimizer, scheduler, train_dataloader)
        labels, preds, test_loss = evaluate(model, test_dataloader, criterion)

        errors = [pred - label for pred, label in zip(preds, labels)]
        mse = np.mean([math.pow(error, 2) for error in errors])
        mae = np.mean([np.abs(error) for error in errors])
        nmae = mae / np.mean(labels)
        # A prediction counts as correct when it rounds to the label.
        rounded_preds = [round(pred) for pred in preds]
        accuracy = sum(1 for rounded, label in zip(rounded_preds, labels) if rounded == label) / len(preds)
        result = {"epoch": epoch+1, "train_loss": train_loss, "test_loss": test_loss, "mse": mse, "mae": mae, "nmae": nmae, "accuracy": accuracy}
        epoch_text = f"Epoch {epoch+1}:  train_loss={train_loss:.4f}  test_loss={test_loss:.4f}  test_mae={mae:.4f}  test_nmae={nmae:.4f}  test_mse={mse:.4f}  test_accuracy={accuracy:.4f}"

        if CALCULATE_MRE:
            # Relative error per sample; PRED30 is the share of samples whose
            # relative error is below 0.3.
            res = [np.abs(error) / label for error, label in zip(errors, labels)]
            pred30 = len([re for re in res if re < 0.3]) / len(res)
            mre = np.mean(res)
            result["mre"] = mre
            result["pred30"] = pred30
            epoch_text += f"  test_mre={mre:.4f}  test_pred30={pred30:.4f}"

        if SAVE_METRIC == "PRED30":
            # save model based on PRED30 (CALCULATE_MRE is guaranteed by the check above)
            if pred30 > fold_max_pred30:
                # use the predictions of this epoch
                fold_max_pred30 = pred30
                _store_fold_predictions(result_dataset, test_index, preds)
            if pred30 > overall_max_pred30:
                # select this epoch as the best model
                overall_max_pred30 = pred30
                overall_best_model = copy.deepcopy(model)
                print(f"New overall max PRED30 ({pred30}). Saving as the best model...")

        elif SAVE_METRIC == "MSE":
            # save model based on MSE
            if mse < fold_min_mse:
                fold_min_mse = mse
                _store_fold_predictions(result_dataset, test_index, preds)
            # Compare the same per-sample MSE that is used for the fold-level
            # decision and printed in the message below, so that the stored
            # predictions and the saved model are chosen by one criterion.
            if mse < overall_min_mse:
                overall_min_mse = mse
                overall_best_model = copy.deepcopy(model)
                print(f"New overall min MSE ({mse}). Saving as the best model...")

        elif SAVE_METRIC == "ACCURACY":
            # save model based on accuracy
            if accuracy > fold_max_accuracy:
                fold_max_accuracy = accuracy
                _store_fold_predictions(result_dataset, test_index, preds)
            if accuracy > overall_max_accuracy:
                overall_max_accuracy = accuracy
                overall_best_model = copy.deepcopy(model)
                print(f"New overall max accuracy ({accuracy}). Saving as the best model...")

        results.append(result)
        out += epoch_text + "\n"
        print(epoch_text)

    # Fold summary: report the metrics of the fold's best epoch, i.e. the first
    # epoch reaching the highest PRED30 when relative errors are available,
    # otherwise the first epoch reaching the lowest MSE. Using max()/min()
    # over the epoch results avoids a sentinel start value that no epoch can
    # beat (which would silently select the last epoch instead).
    if CALCULATE_MRE:
        best_epoch = max(results, key=lambda epoch_result: epoch_result["pred30"])
        pred30s.append(best_epoch["pred30"])
        mres.append(best_epoch["mre"])
    else:
        best_epoch = min(results, key=lambda epoch_result: epoch_result["mse"])
    mses.append(best_epoch["mse"])
    maes.append(best_epoch["mae"])
    nmaes.append(best_epoch["nmae"])

## PRINT AVERAGES

In [None]:
# Build the summary footer: fold-averaged error metrics followed by the
# accuracy of the stored cross-validation predictions.
print()

summary_lines = [
    f"   Average MAE: {np.mean(maes):.4f}",
    f"  Average NMAE: {np.mean(nmaes):.4f}",
    f"   Average MSE: {np.mean(mses):.4f}",
    f"      MAE/Mean: {np.mean(maes)/np.mean(targets):.4f}",
]

# The relative-error averages exist only when they were computed per fold.
if CALCULATE_MRE:
    summary_lines.append(f"   Average MRE: {np.mean(mres):.4f}")
    summary_lines.append(f"Average PRED30: {np.mean(pred30s):.4f}")

# Accuracy: share of samples whose rounded prediction equals the target.
predictions = result_dataset["prediction"].values
rounded_predictions = [round(pred) for pred in predictions]
actuals = result_dataset[TARGET_COL].values
accuracy = np.mean(rounded_predictions == actuals)
summary_lines.append(f"      Accuracy: {accuracy:.4f}")

# One metric per line, no trailing newline after the last one.
out_footer = "\n".join(summary_lines)

for caption, value in (
    ("         Model:", MODEL_NAME),
    ("        Target:", TARGET_COL),
    ("  Dataset Size:", len(dataset)),
    ("  Num of Folds:", N_SPLITS),
):
    print(caption, value)
print(out_footer)


In [None]:
# Persist the training log (`out`) followed by the summary (`out_footer`).
# The context manager closes the file even when the write raises, and avoids
# leaving a module-level handle behind.
with open(results_folder+"/result.txt", "w+") as result_file:
    result_file.write(out + out_footer)
print("RUN FINISHED...")

In [None]:
# Store the per-sample predictions next to the text report.
predictions_path = results_folder + "/predictions.csv"
result_dataset.to_csv(predictions_path, index=False)
print("PREDICTIONS SAVED FINISHED...")

# Store the best model in a sub-folder named <folder_name>_<accuracy>, where
# the accuracy is the current value of the notebook-level `accuracy` variable
# formatted with four decimals.
best_model_dir = "{}/{}_{:.4f}".format(results_folder, folder_name, accuracy)
overall_best_model.save_pretrained(best_model_dir)
print("MODEL SAVED FINISHED...")

## Merge the predictions.csv files of all run folders under /results/


In [None]:
# Merge the "prediction" column of every run folder under ./results into one
# CSV file with one column per run.
# NOTE: this cell rebinds PROJECT_DIR and results_folder to paths relative to
# the current working directory.
import os
import pandas as pd

PROJECT_DIR = '.'
results_folder = PROJECT_DIR + "/results"
DATASET_TAG = "UserStory_1007-"  # dataset part of the folder name, dropped from the column names

# Candidate run folders: entries whose name starts with "BERT" or "bert".
candidates = sorted(
    entry for entry in os.listdir(results_folder)
    if entry.startswith(("BERT", "bert"))
)

# Keep only the entries that actually contain a predictions file, so a stray
# file or an unfinished run does not abort the merge; skipped entries are
# reported instead of being dropped silently.
folders = []
for folder in candidates:
    if os.path.isfile(results_folder + "/" + folder + "/predictions.csv"):
        folders.append(folder)
        print(folder)
    else:
        print("Skipping {} (no predictions.csv found)".format(folder))

# Select the column by name rather than by position, so the merge does not
# depend on the column order of the CSV files.
dfs = [
    pd.read_csv(results_folder + "/" + folder + "/predictions.csv")["prediction"]
    for folder in folders
]

# Column name = folder name without the dataset tag; a folder name that does
# not contain the tag is used unchanged.
new_col_names = [folder.replace(DATASET_TAG, "", 1) for folder in folders]

merged_df = pd.concat(dfs, axis=1)
merged_df.columns = new_col_names
merged_df.to_csv(results_folder + "/merged_predictions.csv", index=False)
print("MERGED PREDICTIONS SAVED FINISHED...")
