In [1]:
!pip install transformers torch scikit-learn



DATA PRE-PROCESSING

In [2]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_excel("FR_NFR_Dataset.xlsx", dtype=str)
df['Type'] = df['Type'].apply(lambda x: x.strip() if isinstance(x, str) else "Unknown")
if df['Type'].isna().sum() > 0:
    print("NaN values found after processing:", df[df['Type'].isna()])

In [4]:
df = df[df["Type"] != "Unknown"]

In [5]:
label_mapping = {"FR": 0, "NFR": 1}
df["Type"] = df["Type"].map(label_mapping)

In [6]:
train_data, test_data, train_label, test_label = train_test_split(df["Requirement Text"].to_list(), df["Type"].to_list(), test_size=0.2, random_state=42)

TOKENIZE DATA

In [7]:
from transformers import BertTokenizer

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_data, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_data, truncation=True, padding=True, max_length=512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

PYTORCH DATASET

In [9]:
import torch
from torch.utils.data import Dataset

class RequirementDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = RequirementDataset(train_encodings, train_label)
test_dataset = RequirementDataset(test_encodings, test_label)

TRAINING

In [11]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [12]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import torch
import transformers
import accelerate

print("PyTorch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)


PyTorch version: 2.5.1+cu121
Transformers version: 4.46.2
Accelerate version: 1.1.1


In [14]:
training_args = TrainingArguments(
    output_dir="./pure_bert",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.4605,0.380681
2,0.3703,0.386677
3,0.3,0.392105


TrainOutput(global_step=1827, training_loss=0.35679832337394596, metrics={'train_runtime': 174.1693, 'train_samples_per_second': 83.849, 'train_steps_per_second': 10.49, 'total_flos': 487814063302800.0, 'train_loss': 0.35679832337394596, 'epoch': 3.0})

In [16]:
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.39210471510887146, 'eval_runtime': 1.978, 'eval_samples_per_second': 615.787, 'eval_steps_per_second': 77.353, 'epoch': 3.0}


In [17]:
model.save_pretrained("./fine_tuned_pure_bert")
tokenizer.save_pretrained("./fine_tuned_pure_bert")

('./fine_tuned_pure_bert/tokenizer_config.json',
 './fine_tuned_pure_bert/special_tokens_map.json',
 './fine_tuned_pure_bert/vocab.txt',
 './fine_tuned_pure_bert/added_tokens.json')

In [18]:
from sklearn.metrics import classification_report

predictions, labels, _ = trainer.predict(test_dataset)
preds = predictions.argmax(axis=1)
print(classification_report(test_label, preds))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       790
           1       0.84      0.84      0.84       428

    accuracy                           0.89      1218
   macro avg       0.87      0.87      0.87      1218
weighted avg       0.89      0.89      0.89      1218



In [19]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./fine_tuned_pure_bert", tokenizer="./fine_tuned_pure_bert")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [20]:
df = pd.read_csv("medical_req_dataset.csv")

texts = df['Requirement Text'].tolist()
true_labels = df['Label'].tolist()

predicted_labels = []
for text in texts:
    result = classifier(text)
    predicted_label = int(result[0]['label'].split('_')[-1])
    predicted_labels.append(predicted_label)

label_mapping = {0: "FR", 1: "NFR"}

predicted_text_labels = [label_mapping[label] for label in predicted_labels]

In [21]:
from sklearn.metrics import classification_report

report = classification_report(true_labels, predicted_text_labels, target_names=["FR", "NFR"])
print(report)

              precision    recall  f1-score   support

          FR       0.98      0.85      0.91       158
         NFR       0.87      0.98      0.92       162

    accuracy                           0.92       320
   macro avg       0.93      0.92      0.92       320
weighted avg       0.93      0.92      0.92       320

