In [2]:
!pip install liac-arff

Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
  Created wheel for liac-arff: filename=liac_arff-2.5.0-py3-none-any.whl size=11716 sha256=2311784f3d5f6c0aacec0284ff02d2b56e17eb788fbd10463c662afa7749f188
  Stored in directory: /root/.cache/pip/wheels/5d/2a/9c/3895d9617f8f49a0883ba686326d598e78a1c2f54fe3cae86d
Successfully built liac-arff
Installing collected packages: liac-arff
Successfully installed liac-arff-2.5.0


In [3]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [6]:
import arff
import pandas as pd

with open("nfr.arff", "r") as f:
    arff_data = arff.load(f)

data = arff_data['data']
columns = [attr[0] for attr in arff_data['attributes']]

df = pd.DataFrame(data, columns=columns)
df.head(5)

Unnamed: 0,ProjectID,RequirementText,class
0,1,The system shall refresh the display every 60 ...,PE
1,1,The application shall match the color of the s...,LF
2,1,If projected the data must be readable. On ...,US
3,1,The product shall be available during normal ...,A
4,1,If projected the data must be understandable...,US


In [7]:
df['class'] = df['class'].apply(lambda x: "FR" if x == "F" else "NFR")
df.head(10)

Unnamed: 0,ProjectID,RequirementText,class
0,1,The system shall refresh the display every 60 ...,NFR
1,1,The application shall match the color of the s...,NFR
2,1,If projected the data must be readable. On ...,NFR
3,1,The product shall be available during normal ...,NFR
4,1,If projected the data must be understandable...,NFR
5,1,The product shall ensure that it can only be a...,NFR
6,1,The product shall be intuitive and self-explan...,NFR
7,1,The product shall respond fast to keep up-to-d...,NFR
8,1,The system shall have a MDI form that allows f...,FR
9,1,The system shall display Events in a vertical ...,FR


In [8]:
label_mapping = {"FR": 0, "NFR": 1}
df["class"] = df["class"].map(label_mapping)
df.head(10)

Unnamed: 0,ProjectID,RequirementText,class
0,1,The system shall refresh the display every 60 ...,1
1,1,The application shall match the color of the s...,1
2,1,If projected the data must be readable. On ...,1
3,1,The product shall be available during normal ...,1
4,1,If projected the data must be understandable...,1
5,1,The product shall ensure that it can only be a...,1
6,1,The product shall be intuitive and self-explan...,1
7,1,The product shall respond fast to keep up-to-d...,1
8,1,The system shall have a MDI form that allows f...,0
9,1,The system shall display Events in a vertical ...,0


In [9]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_label, test_label = train_test_split(df["RequirementText"].to_list(), df["class"].to_list(), test_size=0.2, random_state=42)

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_texts(texts):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

train_encodings = tokenize_texts(train_data)
test_encodings = tokenize_texts(test_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
import torch

class RequirementDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = RequirementDataset(train_encodings, train_label)
test_dataset = RequirementDataset(test_encodings, test_label)

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./promise_roberta",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
)



In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [16]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.5711,0.35194
2,0.3064,0.171886
3,0.1463,0.16407


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=96, training_loss=0.36277757957577705, metrics={'train_runtime': 39.3159, 'train_samples_per_second': 38.153, 'train_steps_per_second': 2.442, 'total_flos': 98666645760000.0, 'train_loss': 0.36277757957577705, 'epoch': 3.0})

In [17]:
results = trainer.evaluate()
print("Evaluation Results:", results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation Results: {'eval_loss': 0.1640702188014984, 'eval_runtime': 0.3462, 'eval_samples_per_second': 361.085, 'eval_steps_per_second': 23.109, 'epoch': 3.0}


In [18]:
model.save_pretrained("./promise_fine_tuned_roberta")
tokenizer.save_pretrained("./promise_fine_tuned_roberta")

('./promise_fine_tuned_roberta/tokenizer_config.json',
 './promise_fine_tuned_roberta/special_tokens_map.json',
 './promise_fine_tuned_roberta/vocab.json',
 './promise_fine_tuned_roberta/merges.txt',
 './promise_fine_tuned_roberta/added_tokens.json',
 './promise_fine_tuned_roberta/tokenizer.json')

In [21]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./promise_fine_tuned_roberta", tokenizer="./promise_fine_tuned_roberta")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [22]:
from sklearn.metrics import classification_report

predictions, labels, _ = trainer.predict(test_dataset)
preds = predictions.argmax(axis=1)
print(classification_report(test_label, preds))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0       0.94      0.94      0.94        47
           1       0.96      0.96      0.96        78

    accuracy                           0.95       125
   macro avg       0.95      0.95      0.95       125
weighted avg       0.95      0.95      0.95       125



In [23]:
df = pd.read_csv("medical_req_dataset.csv")

texts = df['Requirement Text'].tolist()
true_labels = df['Label'].tolist()

predicted_labels = []
for text in texts:
    result = classifier(text)
    predicted_label = int(result[0]['label'].split('_')[-1])
    predicted_labels.append(predicted_label)

In [24]:
label_mapping = {0: "FR", 1: "NFR"}

predicted_text_labels = [label_mapping[label] for label in predicted_labels]

In [25]:
from sklearn.metrics import classification_report

report = classification_report(true_labels, predicted_text_labels, target_names=["FR", "NFR"])
print(report)

              precision    recall  f1-score   support

          FR       0.98      0.54      0.70       158
         NFR       0.69      0.99      0.81       162

    accuracy                           0.77       320
   macro avg       0.83      0.77      0.76       320
weighted avg       0.83      0.77      0.76       320



In [26]:
df['predicted_label'] = predicted_text_labels
df.to_csv("predicted_dataset.csv", index=False)

In [28]:
from google.colab import drive
drive.mount('/content/drive')
!cp -r /content/promise_fine_tuned_roberta/ /content/drive/My\ Drive/

Mounted at /content/drive
