In [1]:
# %pip install transformers
import copy
import torch
import random
import numpy as np
import pandas as pd
import math
from torch.utils.data import TensorDataset
from sklearn.model_selection import KFold
from transformers import BertTokenizer, get_linear_schedule_with_warmup, BertForSequenceClassification, logging
logging.set_verbosity_error()

# **Global Configuration**

In [2]:
USE_CV = True # True => Use K-Fold cross validation split | False => Use conventional train/validate/test split
SEED = 20 # Global seed value for reproducibility

# Conventional data loading settings
TRAINING_PORTION = 0.8
TESTING_PORTION = 0.2
BATCH_SIZE = 32
NUM_EPOCHS = 10

# K-Fold cross validation settings
N_SPLITS = int(1/(TESTING_PORTION))
KFOLD_SHUFFLE = True


PROJECT_DIR = '.'
MODEL_NAME = "SE-BERT" # bert-base-uncased or dbmdz/bert-base-turkish-uncased or BERT-SE or SE-BERT or SE-BERTurk

if MODEL_NAME == 'bert-base-uncased' or MODEL_NAME == 'dbmdz/bert-base-turkish-uncased':
    MODEL_PATH = MODEL_NAME
    TOKENIZER_PATH = MODEL_NAME
else:
    MODEL_PATH = PROJECT_DIR + '/' + MODEL_NAME
    TOKENIZER_PATH = PROJECT_DIR + '/' + MODEL_NAME

DATA_DIR = PROJECT_DIR + '/data'
DATASET_FILE = DATA_DIR + '/UserStory_1007.csv'
TRAINING_DATA_TYPE = "Cosmic Point"
TARGET_COL = 'cosmic_exit'
INPUT_COL = 'sample_description' # 'sample_description' or 'TEXT_TR'
SAVE_METRIC = 'ACCURACY' # models will be saved based on this metric (ACCURACY or PRED30 or MSE)

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

**Initialize the CUDA device**

In [4]:
if torch.cuda.is_available():
  device = torch.device('cuda')
  print('Number of available GPUs: {}'.format(torch.cuda.device_count()))
  print('Active GPU: {}'.format(torch.cuda.get_device_name(0)))
else:
  device = torch.device('cpu')
  print('GPU resource was not found. Using CPU resource.')

Number of available GPUs: 1
Active GPU: NVIDIA RTX 6000 Ada Generation


# **Dataset Preparation**

In [5]:
# Load dataset file as pandas dataframe
dataset = pd.read_csv(DATASET_FILE, delimiter=',')
print("Total Samples: {}".format(dataset.shape[0]))

# Count Nan values in TARGET_COL
print("Total Nan Values in TARGET_COL: {}".format(dataset[TARGET_COL].isnull().sum()))
# Fill Nan values in TARGET_COL with 0
dataset[TARGET_COL] = dataset[TARGET_COL].fillna(0)
print("Total Nan Values in TARGET_COL: {}".format(dataset[TARGET_COL].isnull().sum()))

dataset = dataset[[INPUT_COL, TARGET_COL]]
dataset = dataset.dropna(axis=0, subset=[TARGET_COL]).reset_index(drop=True)
dataset[TARGET_COL] = dataset[TARGET_COL].astype(int)
print("Total Valid Samples: {}".format(len(dataset)))

NUM_LABELS = len(dataset[TARGET_COL].unique())
print("Number of Unique Labels: {}".format(NUM_LABELS))
display(dataset)

result_dataset = dataset.copy()
result_dataset["prediction"] = None


Total Samples: 1007
Total Nan Values in TARGET_COL: 0
Total Nan Values in TARGET_COL: 0
Total Valid Samples: 1007
Number of Unique Labels: 4


Unnamed: 0,sample_description,cosmic_exit
0,"As a anonymoususer, I want to view a list of s...",1
1,"As a anonymoususer, I want to view a list of u...",1
2,"As a trainingcoordinator, I want to email all ...",1
3,"As a attendee, I want to have a very clear map...",1
4,"As a trainer, I want to edit my training node ...",0
...,...,...
1002,"As a website user, I want to see updated finan...",1
1003,"As a stakeholder, I want to see the results of...",1
1004,"As a team member, I want to have a prioritized...",1
1005,"As a content editor, I want to be able to easi...",1


**Split the description and measurement columns for processing**

In [6]:
inputs = dataset[INPUT_COL].values.astype("str")
# Convert line breaks to the spaces
inputs = [input.strip().replace("\n"," ") for input in inputs]
targets = dataset[TARGET_COL].values
CALCULATE_MRE = all(dataset[TARGET_COL] > 0)
print("Calculate MRE =", CALCULATE_MRE)

Calculate MRE = False


# **Model Preparation**

In [7]:
def initialize_model():
    return BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels = 1)

model = initialize_model()
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

**Encode samples and create a TensorDataset**

In [8]:
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

# Find the max sample length
max_input_length = 0
for input in inputs:
  input_ids = tokenizer.encode(input, add_special_tokens=True)
  max_input_length = max(max_input_length, len(input_ids))

print("Max input length: {}".format(max_input_length))

Max input length: 95


In [9]:
input_ids = []
attention_masks = []

for input in inputs:
  encoded_input = tokenizer.encode_plus(input,
                                         add_special_tokens=True,
                                         max_length=max_input_length,
                                         truncation=True,
                                         padding="max_length",
                                         return_attention_mask=True,
                                         return_tensors='pt',)
  input_ids.append(encoded_input["input_ids"])
  attention_masks.append(encoded_input["attention_mask"])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(targets, dtype=torch.float)
dataset = TensorDataset(input_ids, attention_masks, labels)

# **Training & Evaluation Functions**

In [10]:
def train(model, optimizer, scheduler, train_loader):
    model.train()
    total_loss = 0
    for batch in train_loader:
        model.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        total_loss += output.loss.item()
        output.loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(train_loader)

def evaluate(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    label_data = []
    prediction_data = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            output = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = criterion(output.logits.squeeze(-1), labels.float())
            total_loss += loss.item()
            logits = output.logits.to('cpu').numpy()
            label_ids = labels.to('cpu').numpy()
            label_data += label_ids.tolist()
            prediction_data += logits.flatten().tolist()

    return label_data, prediction_data, total_loss / len(data_loader)

In [11]:
folder_name = MODEL_NAME + "-" + DATASET_FILE.split("/")[-1].split(".")[0] + "-" + TARGET_COL
results_folder = PROJECT_DIR + "/results/" + folder_name
print(results_folder)
%mkdir -p {results_folder}

./results/SE-BERT-UserStory_1007-cosmic_exit


# **Run Cross-Validation**

In [12]:
kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=KFOLD_SHUFFLE)

out = ""

model_name = "              Model Name: {}".format(MODEL_NAME)
out += model_name+"\n"
print(model_name)

training_scope = "          Training Scope: {}".format(TRAINING_DATA_TYPE)
out += training_scope+"\n"
print(training_scope)

number_of_splits = "        Number of Splits: {}".format(N_SPLITS)
out += number_of_splits+"\n"
print(number_of_splits)

shuffle = "                 Shuffle: {}".format(KFOLD_SHUFFLE)
out += shuffle+"\n"
print(shuffle)

pred30s = []
mres = []
mses = []
maes = []
nmaes = []

overall_min_mse = 999999
overall_max_pred30 = -1
overall_max_accuracy = -1
overall_best_model = None

for fold, (train_index, test_index) in enumerate(kf.split(dataset)):
  # RESET WEIGHTS IN EACH FOLD:
  model = initialize_model().to(device)
  criterion = torch.nn.MSELoss()
  optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

  # GENERATE DATALOADERS IN EACH FOLD:
  train_subset = torch.utils.data.Subset(dataset, train_index)
  test_subset = torch.utils.data.Subset(dataset, test_index)
  train_dataloader = torch.utils.data.DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
  test_dataloader = torch.utils.data.DataLoader(test_subset, batch_size=BATCH_SIZE, shuffle=False)

  total_steps = len(train_dataloader) * NUM_EPOCHS
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

  fold_text = "\n                    FOLD: {}\n".format(fold+1)
  out += fold_text + "\n"
  print(fold_text)

  fold_min_mse = 999999
  fold_max_pred30 = -1
  fold_max_accuracy = -1
  results = []

  for epoch in range(NUM_EPOCHS):
      train_loss = train(model, optimizer, scheduler, train_dataloader)
      labels, preds, test_loss = evaluate(model, test_dataloader, criterion)

      result = pd.DataFrame.from_dict({"label": labels, "prediction": preds})
      ses = [math.pow(preds[i]-labels[i], 2) for i in range(len(preds))]
      aes = [np.abs(preds[i]-labels[i]) for i in range(len(preds))]
      mse = np.mean(ses)
      mae = np.mean(aes)
      nmae = mae / np.mean(labels)
      rounded_preds = [round(pred) for pred in preds]
      accuracy = sum([1 for i in range(len(preds)) if rounded_preds[i] == labels[i]]) / len(preds)
      result = {"epoch": epoch+1, "train_loss": train_loss, "test_loss": test_loss, "mse": mse, "mae": mae, "nmae": nmae, "accuracy": accuracy}
      epoch_text = f"Epoch {epoch+1:02d}:  train_loss={train_loss:.4f}  test_loss={test_loss:.4f}  mae={mae:.4f}  nmae={nmae:.4f}  mse={mse:.4f}  acc={accuracy:.4f}"


      if CALCULATE_MRE:
        res = [(np.abs(preds[i]-labels[i]) / labels[i]) for i in range(len(preds))]
        resUnder30 = [re for re in res if re < 0.3]
        pred30 = len(resUnder30)/len(res)
        mre = np.mean(res)
        result["mre"] = mre
        result["pred30"] = pred30
        epoch_text += f"  mre={mre:.4f}  pred30={pred30:.4f}"

      print(epoch_text)
      if CALCULATE_MRE and SAVE_METRIC == "PRED30":
        # save model based on PRED30
        if pred30 > fold_max_pred30:
          # use the predictions of this epoch
          fold_max_pred30 = pred30
          for i, index in enumerate(test_index):
            result_dataset.at[index, "prediction"] = preds[i]

        if pred30 > overall_max_pred30:
          # select this epoch as the best model
          overall_max_pred30 = pred30
          overall_best_model = copy.deepcopy(model)
          print(f"New overall max PRED30 ({pred30:.4f}). Saving as the best model...")

      elif SAVE_METRIC == "MSE":
        # save model based on MSE
        if mse < fold_min_mse:
          fold_min_mse = mse
          for i, index in enumerate(test_index):
            result_dataset.at[index, "prediction"] = preds[i]
        if test_loss < overall_min_mse:
          overall_min_mse = test_loss
          overall_best_model = copy.deepcopy(model)
          print(f"New overall min MSE ({mse:.4f}). Saving as the best model...")
      
      elif SAVE_METRIC == "ACCURACY":
        # save model based on accuracy
        if accuracy > fold_max_accuracy:
          fold_max_accuracy = accuracy
          for i, index in enumerate(test_index):
            result_dataset.at[index, "prediction"] = preds[i]
        if accuracy > overall_max_accuracy:
          overall_max_accuracy = accuracy
          overall_best_model = copy.deepcopy(model)
          print(f"New overall max accuracy ({accuracy:.4f}). Saving as the best model...")

      results.append(result)
      out += epoch_text + "\n"

  if CALCULATE_MRE:
    max_pred30 = 0
    max_index = -1
    for i, result in enumerate(results):
      if result["pred30"] > max_pred30:
        max_pred30 = result["pred30"]
        max_index = i

    pred30s.append(results[max_index]["pred30"])
    mres.append(results[max_index]["mre"])
    mses.append(results[max_index]["mse"])
    maes.append(results[max_index]["mae"])
    nmaes.append(results[max_index]["nmae"])
  else:
    min_mse = 0
    min_index = -1
    for i, result in enumerate(results):
      if result["mse"] < min_mse:
        min_mse = result["mse"]
        min_index = i
    mses.append(results[min_index]["mse"])
    maes.append(results[min_index]["mae"])
    nmaes.append(results[min_index]["nmae"])

              Model Name: SE-BERT
          Training Scope: Cosmic Point
        Number of Splits: 5
                 Shuffle: True

                    FOLD: 1

Epoch 01:  train_loss=0.2019  test_loss=0.1390  mae=0.2314  nmae=0.2077  mse=0.1337  acc=0.8713
New overall max accuracy (0.8713). Saving as the best model...
Epoch 02:  train_loss=0.1093  test_loss=0.1382  mae=0.2421  nmae=0.2173  mse=0.1321  acc=0.8614
Epoch 03:  train_loss=0.0777  test_loss=0.1349  mae=0.2126  nmae=0.1908  mse=0.1259  acc=0.8762
New overall max accuracy (0.8762). Saving as the best model...
Epoch 04:  train_loss=0.0566  test_loss=0.1488  mae=0.2675  nmae=0.2402  mse=0.1433  acc=0.8317
Epoch 05:  train_loss=0.0345  test_loss=0.1666  mae=0.2598  nmae=0.2332  mse=0.1622  acc=0.8168
Epoch 06:  train_loss=0.0243  test_loss=0.1509  mae=0.2305  nmae=0.2069  mse=0.1441  acc=0.8416
Epoch 07:  train_loss=0.0166  test_loss=0.1376  mae=0.2047  nmae=0.1838  mse=0.1276  acc=0.8663
Epoch 08:  train_loss=0.0139  test_loss=

## PRINT AVERAGES

In [13]:
out_footer = ""
print()

mae_text= f"   Average MAE: {np.mean(maes):.4f}"
out_footer += mae_text+"\n"
nmae_text= f"  Average NMAE: {np.mean(nmaes):.4f}"
out_footer += nmae_text+"\n"
mse_text=  f"   Average MSE: {np.mean(mses):.4f}"
out_footer += mse_text+"\n"

if CALCULATE_MRE:
  mre_text= f"   Average MRE: {np.mean(mres):.4f}"
  out_footer += mre_text+"\n"
  pred30_text= f"Average PRED30: {np.mean(pred30s):.4f}"
  out_footer += pred30_text+"\n"

predictions = result_dataset["prediction"].values
rounded_predictions = [round(pred) for pred in predictions]
actuals = result_dataset[TARGET_COL].values
accuracy = np.mean(rounded_predictions == actuals)
accuracy_text = f"   Average ACC: {accuracy:.4f}"
out_footer += accuracy_text

print("         Model:", MODEL_NAME)
print("        Target:", TARGET_COL)
print("  Dataset Size:", len(dataset))
print("  Num of Folds:", N_SPLITS)
print(out_footer)



         Model: SE-BERT
        Target: cosmic_exit
  Dataset Size: 1007
  Num of Folds: 5
   Average MAE: 0.1860
  Average NMAE: 0.1658
   Average MSE: 0.1134
   Average ACC: 0.8858


In [14]:
f = open(results_folder+"/result.txt", "w+")
f.write(out + out_footer)
f.close()
print("RUN FINISHED...")

RUN FINISHED...


In [15]:
result_dataset.to_csv(results_folder + "/predictions.csv", index=False)
print("PREDICTIONS SAVED FINISHED...")
overall_best_model.save_pretrained(results_folder + "/{}_{:.4f}".format(folder_name, accuracy))
print("MODEL SAVED FINISHED...")

PREDICTIONS SAVED FINISHED...
MODEL SAVED FINISHED...


## Merge all predictions.csv under /results/*


In [16]:
# import os
# import pandas as pd
# PROJECT_DIR = '.'
# results_folder = PROJECT_DIR + "/results"
# folders = os.listdir(results_folder)
# folders = [folder for folder in folders if folder.startswith("BERT") or folder.startswith("bert") or folder.startswith("SE-BERT")]
# folders = sorted(folders)
# for f in folders:
#     print(f)

# dfs = []
# for folder in folders:
#   folder_path = results_folder + "/" + folder
#   prediction_file = folder_path + "/predictions.csv"
#   df = pd.read_csv(prediction_file)
#   # get only the predictions column
#   df = df.iloc[:,2]
#   dfs.append(df)

# new_col_names = []
# for f in folders:
#     new_col_names.append(f.split("UserStory_1007-")[0] + f.split("UserStory_1007-")[1])

# merged_df = pd.concat(dfs, axis=1)
# merged_df.columns = new_col_names
# merged_df.to_csv(results_folder + "/merged_predictions.csv", index=False)
# print("MERGED PREDICTIONS SAVED FINISHED...")