In [23]:
pip install liac-arff



DATA PREPROCESSING

In [24]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [25]:
import arff
import pandas as pd

In [26]:
with open("nfr.arff", "r") as f:
    arff_data = arff.load(f)

In [27]:
data = arff_data['data']
columns = [attr[0] for attr in arff_data['attributes']]

df = pd.DataFrame(data, columns=columns)

In [28]:
df.head(5)

Unnamed: 0,ProjectID,RequirementText,class
0,1,The system shall refresh the display every 60 ...,PE
1,1,The application shall match the color of the s...,LF
2,1,If projected the data must be readable. On ...,US
3,1,The product shall be available during normal ...,A
4,1,If projected the data must be understandable...,US


In [29]:
df['class'] = df['class'].apply(lambda x: "FR" if x == "F" else "NFR")

In [30]:
df.head(10)

Unnamed: 0,ProjectID,RequirementText,class
0,1,The system shall refresh the display every 60 ...,NFR
1,1,The application shall match the color of the s...,NFR
2,1,If projected the data must be readable. On ...,NFR
3,1,The product shall be available during normal ...,NFR
4,1,If projected the data must be understandable...,NFR
5,1,The product shall ensure that it can only be a...,NFR
6,1,The product shall be intuitive and self-explan...,NFR
7,1,The product shall respond fast to keep up-to-d...,NFR
8,1,The system shall have a MDI form that allows f...,FR
9,1,The system shall display Events in a vertical ...,FR


In [31]:
label_mapping = {"FR": 0, "NFR": 1}
df["class"] = df["class"].map(label_mapping)

In [32]:
df.head(10)

Unnamed: 0,ProjectID,RequirementText,class
0,1,The system shall refresh the display every 60 ...,1
1,1,The application shall match the color of the s...,1
2,1,If projected the data must be readable. On ...,1
3,1,The product shall be available during normal ...,1
4,1,If projected the data must be understandable...,1
5,1,The product shall ensure that it can only be a...,1
6,1,The product shall be intuitive and self-explan...,1
7,1,The product shall respond fast to keep up-to-d...,1
8,1,The system shall have a MDI form that allows f...,0
9,1,The system shall display Events in a vertical ...,0


In [33]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(df["RequirementText"].to_list(), df["class"].to_list(), test_size=0.2, random_state=42)

In [34]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_data, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_data, truncation=True, padding=True, max_length=512)

In [35]:
import torch
from torch.utils.data import Dataset

class RequirementDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [36]:
train_dataset = RequirementDataset(train_encodings, train_label)
test_dataset = RequirementDataset(test_encodings, test_label)

In [37]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
training_args = TrainingArguments(
    output_dir="./bert_promise",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)


In [42]:
!pip install wandb --upgrade

Collecting wandb
  Downloading wandb-0.19.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading wandb-0.19.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.18.7
    Uninstalling wandb-0.18.7:
      Successfully uninstalled wandb-0.18.7
Successfully installed wandb-0.19.0


In [45]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.252407
2,No log,0.262837
3,No log,0.198403


TrainOutput(global_step=189, training_loss=0.2411895025344122, metrics={'train_runtime': 23.3067, 'train_samples_per_second': 64.359, 'train_steps_per_second': 8.109, 'total_flos': 80937482850000.0, 'train_loss': 0.2411895025344122, 'epoch': 3.0})

In [46]:
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.19840320944786072, 'eval_runtime': 0.2088, 'eval_samples_per_second': 598.726, 'eval_steps_per_second': 76.637, 'epoch': 3.0}


In [50]:
from sklearn.metrics import classification_report

predictions, labels, _ = trainer.predict(test_dataset)
preds = predictions.argmax(axis=1)
print(classification_report(test_label, preds))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        47
           1       0.99      0.94      0.96        78

    accuracy                           0.95       125
   macro avg       0.94      0.96      0.95       125
weighted avg       0.95      0.95      0.95       125



In [51]:
model.save_pretrained("./promise_fine_tuned_bert")
tokenizer.save_pretrained("./promise_fine_tuned_bert")

('./promise_fine_tuned_bert/tokenizer_config.json',
 './promise_fine_tuned_bert/special_tokens_map.json',
 './promise_fine_tuned_bert/vocab.txt',
 './promise_fine_tuned_bert/added_tokens.json')

In [53]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./promise_fine_tuned_bert", tokenizer="./promise_fine_tuned_bert")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [54]:
df = pd.read_csv("medical_req_dataset.csv")

texts = df['Requirement Text'].tolist()
true_labels = df['Label'].tolist()

predicted_labels = []
for text in texts:
    result = classifier(text)
    predicted_label = int(result[0]['label'].split('_')[-1])
    predicted_labels.append(predicted_label)

label_mapping = {0: "FR", 1: "NFR"}

predicted_text_labels = [label_mapping[label] for label in predicted_labels]

In [55]:
from sklearn.metrics import classification_report

report = classification_report(true_labels, predicted_text_labels, target_names=["FR", "NFR"])
print(report)

              precision    recall  f1-score   support

          FR       1.00      0.37      0.54       158
         NFR       0.62      1.00      0.76       162

    accuracy                           0.69       320
   macro avg       0.81      0.68      0.65       320
weighted avg       0.81      0.69      0.65       320



In [56]:
df['predicted_label'] = predicted_text_labels
df.to_csv("bert_promise_predicted_dataset.csv", index=False)

In [57]:
!cp -r /content/promise_fine_tuned_bert/ /content/drive/My\ Drive/

cp: cannot create directory '/content/drive/My Drive/': No such file or directory
