# Task
Explain the step-by-step workflow for training a RoBERTa model for risk estimation using provided training data in a CSV format. The output should be an ordinal scale (1=Very Low Risk, 2=Low Risk, 3=Medium Risk, 4=High Risk, 5=Very High Risk). Do not include data augmentation in the workflow.

## Load data

### Subtask:
Load the training data from the provided Excel workbook (`Combined_Training_Data_Final.xlsx`) into a pandas DataFrame.


**Reasoning**:
Import the pandas library and load the Excel data into a DataFrame.



In [None]:
import pandas as pd

# Load the training data from the Excel workbook.
# Only the read itself sits inside the `try`, so that a failure while displaying
# the frame is not misreported as a file-reading error.
try:
    df = pd.read_excel("Combined_Training_Data_Final.xlsx")
except FileNotFoundError:
    print("Error: Combined_Training_Data_Final.xlsx not found. Please upload the file.")
except Exception as e:
    # Best-effort notebook cell: report the problem instead of raising.
    print(f"An error occurred while reading the Excel file: {e}")
else:
    print("Data loaded successfully!")
    print("First 5 rows of the DataFrame:")
    display(df.head())
    print("\nDataFrame Info:")
    # DataFrame.info() prints its summary itself and returns None; wrapping it
    # in display() only added a stray "None" to the cell output.
    df.info()

Data loaded successfully!
First 5 rows of the DataFrame:


Unnamed: 0,Requirement_ID,Requirement_Text,Risk,Urgency,Complexity,Business_Value,Implementation_Effort,Technical_Risk,Stakeholder_Criticality,Requirement_Stability,Security_Sensitivity,Num_Dependencies,Num_Supported_By,Is_Leaf_Requirement,Is_Root_Requirement,Priority_Score
0,RQ-01-01,The system shall allow users to manually log t...,4,4,1,4,1,1,5,4,4,3,5,0,0,0.86
1,RQ-01-02,The system shall generate medication reminders...,4,4,3,4,3,3,5,4,3,1,2,0,0,0.86
2,RQ-01-03,The system shall allow users to export health ...,1,3,1,3,1,1,4,4,4,4,4,0,0,0.66
3,RQ-01-04,The system shall restrict access to patient da...,5,5,3,5,3,3,5,4,5,0,10,0,1,1.0
4,RQ-01-05,The app shall notify users if their vital sign...,4,4,3,4,3,3,5,4,4,4,3,0,0,0.86



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Requirement_ID           290 non-null    object 
 1   Requirement_Text         290 non-null    object 
 2   Risk                     290 non-null    int64  
 3   Urgency                  290 non-null    int64  
 4   Complexity               290 non-null    int64  
 5   Business_Value           290 non-null    int64  
 6   Implementation_Effort    290 non-null    int64  
 7   Technical_Risk           290 non-null    int64  
 8   Stakeholder_Criticality  290 non-null    int64  
 9   Requirement_Stability    290 non-null    int64  
 10  Security_Sensitivity     290 non-null    int64  
 11  Num_Dependencies         290 non-null    int64  
 12  Num_Supported_By         290 non-null    int64  
 13  Is_Leaf_Requirement      290 non-null    int64  
 14  Is_Root_R

None

In [None]:
# Ensure the 'Requirement_Text' and 'Risk' columns are present
required_columns = ['Requirement_Text', 'Risk']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The loaded data must contain the following columns: {required_columns}")

# Drop rows where either the label or the text is missing. Without the text
# filter, astype(str) below would turn a missing requirement into the literal
# string "nan" and it would be trained on as if it were a real sentence.
df_cleaned = df.dropna(subset=required_columns).copy()

# Show the class balance of the cleaned data
print("Risk level distribution before training:")
print(df_cleaned['Risk'].value_counts())
print("-" * 30)

# Prepare texts and labels
texts = df_cleaned['Requirement_Text'].astype(str).tolist()

# Build the categorical once so the category list and the integer codes are
# guaranteed to come from the same encoding.
risk_categorical = df_cleaned['Risk'].astype('category')
risk_categories_mapping = risk_categorical.cat.categories.tolist()  # position = encoded label
labels = risk_categorical.cat.codes.tolist()  # Encode risk levels as integers

# Print the code -> original risk level mapping so it can be checked by eye
print("Encoded label to original category mapping used for training:")
for code, category in enumerate(risk_categories_mapping):
    print(f"  Code {code}: {category}")
print("-" * 30)

print(f"Number of data points after cleaning: {len(df_cleaned)}")
print(f"Number of texts: {len(texts)}")
print(f"Number of labels: {len(labels)}")

Risk level distribution before training:
Risk
3    86
4    76
1    57
5    42
2    29
Name: count, dtype: int64
------------------------------
Encoded label to original category mapping used for training:
  Code 0: 1
  Code 1: 2
  Code 2: 3
  Code 3: 4
  Code 4: 5
------------------------------
Number of data points after cleaning: 290
Number of texts: 290
Number of labels: 290


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation.
# stratify=labels keeps the class proportions the same in both subsets, and
# random_state pins the shuffle so the split is reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Relative class frequencies of each subset, to confirm the stratification.
train_label_share = pd.Series(train_labels).value_counts(normalize=True)
val_label_share = pd.Series(val_labels).value_counts(normalize=True)

print(f"Number of training samples: {len(train_texts)}")
print(f"Number of validation samples: {len(val_texts)}")
print(f"Training labels distribution: {train_label_share}")
print(f"Validation labels distribution: {val_label_share}")

Number of training samples: 232
Number of validation samples: 58
Training labels distribution: 2    0.297414
3    0.262931
0    0.193966
4    0.146552
1    0.099138
Name: proportion, dtype: float64
Validation labels distribution: 2    0.293103
3    0.258621
0    0.206897
4    0.137931
1    0.103448
Name: proportion, dtype: float64


In [None]:
from transformers import RobertaTokenizer
from torch.utils.data import Dataset
import torch

# Tokenizer
# Load the tokenizer that ships with the pretrained 'roberta-base' checkpoint;
# this same object is handed to the dataset constructors further down.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Custom Dataset
class RequirementsDataset(Dataset):
    """Dataset of requirement texts and their integer-encoded risk labels.

    All texts are tokenized once in the constructor; indexing returns a dict
    with one entry per tokenizer output plus a 'labels' entry.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Tokenize `texts` and store them next to `labels`.

        Args:
            texts: list of strings to tokenize in one batch call.
            labels: integer class ids, one per text.
            tokenizer: callable invoked with the batch and padding/truncation
                options; its result must expose `.items()` yielding tensors.
            max_length: length every sequence is padded/truncated to.
        """
        print("Tokenizing data...")
        # token_type_ids are switched off explicitly (RoBERTa does not use them).
        batch = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            return_token_type_ids=False,
        )
        # Keep every encoded tensor on the CPU.
        cpu_encodings = {}
        for name, tensor in batch.items():
            cpu_encodings[name] = tensor.cpu()
        self.encodings = cpu_encodings
        # Labels are stored as a long tensor.
        self.labels = torch.tensor(labels, dtype=torch.long)
        print("Tokenization finished.")

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at `idx` as independent (cloned) tensors."""
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx].clone()
        sample['labels'] = self.labels[idx].clone()
        return sample

# Wrap each half of the split in a RequirementsDataset.
# Tokenization happens inside the constructor, once per dataset.
train_dataset = RequirementsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = RequirementsDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

print("Training and validation datasets created.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizing data...
Tokenization finished.
Tokenizing data...
Tokenization finished.
Training and validation datasets created.


In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Model
# One output logit per distinct risk level found in the training data.
num_labels = len(risk_categories_mapping)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same schedule so that
    # load_best_model_at_end has a checkpoint for every evaluation.
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    # The *string* "none" turns off all logging integrations (wandb, etc.).
    # Passing the Python value None does not: on transformers 4.x it falls
    # back to the library default, which enables every installed integration.
    report_to="none",
)

# Trainer
# To report extra metrics (e.g. accuracy), pass a `compute_metrics` callable
# here; it is an argument of Trainer, not of TrainingArguments.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Model, Training Arguments, and Trainer initialized.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model, Training Arguments, and Trainer initialized.


In [None]:
# Train
# Fine-tune the model with the Trainer built from `training_args`; evaluation
# and checkpointing follow the per-epoch strategies configured there.
print("Starting training...")
trainer.train()
print("Training finished.")

Starting training...


Epoch,Training Loss,Validation Loss
1,1.5938,1.518891
2,1.5339,1.266705


Epoch,Training Loss,Validation Loss
1,1.5938,1.518891
2,1.5339,1.266705
3,1.0078,0.895417
4,0.5613,0.789407
5,0.37,0.806254


Training finished.


## Evaluate the model
Evaluate the trained model on the validation dataset to assess its performance.

In [11]:
# Evaluate the model
# Runs evaluation on the eval_dataset the Trainer was constructed with and
# returns a dict of metrics. Because training used load_best_model_at_end=True,
# the weights evaluated here should be the best checkpoint, not necessarily the
# last epoch (the reported eval_loss equals the epoch-4 validation loss).
print("Evaluating the model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluating the model...


Evaluation results: {'eval_loss': 0.7894072532653809, 'eval_runtime': 21.6557, 'eval_samples_per_second': 2.678, 'eval_steps_per_second': 0.369, 'epoch': 5.0}


## Risk estimation from requirement statement
Use the trained model to estimate the risk of new requirement statements.

In [13]:
# Example usage of the estimate_risk function.
# NOTE(review): `estimate_risk` is not defined in the cells shown here; it is
# assumed to come from an earlier cell - confirm before running.
# Each sentence is listed once and reused for both the echo and the call, so
# the printed requirement can never drift from the text that was scored.
example_requirements = [
    'The system must allow users to reset their passwords via email.',
    'The system shall use two factor authentication for login.',
    'User data shall be encrypted at rest and in transit.',
    'The system will have a simple user interface.',
]

print("\nEstimating risk for example sentences after training:")
for requirement_text in example_requirements:
    print(f"Requirement: '{requirement_text}'")
    print(f"Estimated Risk: {estimate_risk(requirement_text, model, tokenizer, risk_categories_mapping)}")
    print("-" * 30)


Estimating risk for example sentences after training:
Requirement: 'The system must allow users to reset their passwords via email.'
Raw logits for 'The system must allow users to reset their passwords via email.': tensor([[-1.8501, -2.3746, -0.8856,  2.5192,  1.7479]])
Predicted index: 3, Mapped Risk Level: 4
Estimated Risk: 4
------------------------------
Requirement: 'The system shall use two factor authentication for login.'
Raw logits for 'The system shall use two factor authentication for login.': tensor([[-1.8744, -2.1752, -0.8506,  2.5516,  1.5820]])
Predicted index: 3, Mapped Risk Level: 4
Estimated Risk: 4
------------------------------
Requirement: 'User data shall be encrypted at rest and in transit.'
Raw logits for 'User data shall be encrypted at rest and in transit.': tensor([[-1.6687, -1.8952, -1.2958,  1.1263,  3.0838]])
Predicted index: 4, Mapped Risk Level: 5
Estimated Risk: 5
------------------------------
Requirement: 'The system will have a simple user interface

In [14]:
# Spot checks: five requirements marked "Existing" (with the risk recorded in
# the dataset) followed by five marked "New" (with the author's own estimate).
# Each sentence is listed once and reused for both the echo and the call; the
# printed output format is unchanged (requirement, estimate, separator).
# NOTE(review): `estimate_risk` is not defined in the cells shown here; it is
# assumed to come from an earlier cell - confirm before running.
spot_check_requirements = [
    # Requirement 1: Existing (Healthcare, Non-Functional, RQ-04-07)
    # Actual Risk: 5 (from dataset, RQ-04-07, high due to security sensitivity)
    'The system shall support two-factor authentication for user logins.',

    # Requirement 2: Existing (Logistics, Functional, RQ-05-14)
    # Actual Risk: 3 (from dataset, RQ-05-14, moderate due to user interaction and data upload)
    'The system shall allow customers to upload proof of delivery.',

    # Requirement 3: Existing (Smart City, Functional, RQ-08-02)
    # Actual Risk: 1 (from dataset, RQ-08-02, low due to simple public data display)
    'The system shall provide public dashboards for traffic conditions.',

    # Requirement 4: Existing (Education, Functional, RQ-04-05)
    # Actual Risk: 3 (from dataset, RQ-04-05, moderate due to sensitive data access)
    'The system shall allow students to view their grades online.',

    # Requirement 5: Existing (Agriculture, Non-Functional, RQ-09-07)
    # Actual Risk: 5 (from dataset, RQ-09-07, high due to encryption and security)
    'The system shall encrypt sensitive farm data at rest and in transit.',

    # Requirement 6: New (Finance, Non-Functional)
    # Estimated Risk: 5 (based on similar requirements like RQ-06-15, high due to real-time performance and financial impact)
    'The system shall ensure transaction processing latency under 1 second.',

    # Requirement 7: New (Aviation, Functional)
    # Estimated Risk: 4 (based on RQ-13-12, high due to real-time data and safety implications)
    'The system shall allow pilots to view real-time weather updates during flights.',

    # Requirement 8: New (Gaming, Functional)
    # Estimated Risk: 1 (based on RQ-14-09, low due to simple UI customization)
    'The system shall allow players to customize in-game character appearances.',

    # Requirement 9: New (Healthcare, Functional)
    # Estimated Risk: 3 (based on RQ-01-03, moderate due to sensitive data handling)
    # NOTE(review): the DataFrame preview lists RQ-01-03 with Risk 1 - confirm
    # which value this expectation should be based on.
    'The system shall allow patients to download their lab results in PDF format.',

    # Requirement 10: New (Smart City, Non-Functional)
    # Estimated Risk: 4 (based on RQ-13-97, high due to secure storage and public infrastructure)
    'The system shall support secure storage of public transport schedules.',
]

for requirement_text in spot_check_requirements:
    print(f"Requirement: '{requirement_text}'")
    print(f"Estimated Risk: {estimate_risk(requirement_text, model, tokenizer, risk_categories_mapping)}")
    print("-" * 30)

Requirement: 'The system shall support two-factor authentication for user logins.'
Raw logits for 'The system shall support two-factor authentication for user logins.': tensor([[-1.9523, -2.2133, -0.7271,  2.6427,  1.4540]])
Predicted index: 3, Mapped Risk Level: 4
Estimated Risk: 4
------------------------------
Requirement: 'The system shall allow customers to upload proof of delivery.'
Raw logits for 'The system shall allow customers to upload proof of delivery.': tensor([[ 0.1035, -1.5858,  1.7426,  1.0838, -1.1443]])
Predicted index: 2, Mapped Risk Level: 3
Estimated Risk: 3
------------------------------
Requirement: 'The system shall provide public dashboards for traffic conditions.'
Raw logits for 'The system shall provide public dashboards for traffic conditions.': tensor([[ 2.1594,  1.3898,  0.0134, -1.1900, -1.9954]])
Predicted index: 0, Mapped Risk Level: 1
Estimated Risk: 1
------------------------------
Requirement: 'The system shall allow students to view their grades on

## Save the Trained Model and Tokenizer
Save the trained RoBERTa model and tokenizer to disk so they can be loaded later for inference.

In [15]:
# Save model and tokenizer
# Both are written to the same directory so that from_pretrained() can later
# load them from a single path.
# NOTE(review): the label-index -> risk-level mapping (risk_categories_mapping)
# is not persisted here; whatever serves the model has to recreate it.
model.save_pretrained('./risk_estimation_roberta')
tokenizer.save_pretrained('./risk_estimation_roberta')

print("Model and tokenizer saved successfully to './risk_estimation_roberta'")

Model and tokenizer saved successfully to './risk_estimation_roberta'


## Load and Use the Model in FastAPI

To use the saved model in a FastAPI application, you'll need to:

1.  **Install necessary libraries:** Make sure you have `fastapi`, `uvicorn`, `transformers`, `torch`, and `pandas` installed in your environment.

In [16]:
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd # Import pandas to get the risk_categories_mapping

# Risk levels in label-index order: RISK_CATEGORIES[i] is the risk level the
# model's i-th output logit stands for. This must match the mapping printed
# during training (code 0 -> 1, ..., code 4 -> 5). It is hard-coded here
# because the mapping is not saved alongside the model; ideally it would be
# saved and loaded together with it.
RISK_CATEGORIES = [1, 2, 3, 4, 5]

# Load the saved model and tokenizer
model_path = './risk_estimation_roberta'
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

# Fail fast at startup if the mapping and the classification head disagree,
# rather than serving wrongly labelled predictions.
if model.config.num_labels != len(RISK_CATEGORIES):
    raise ValueError(
        f"RISK_CATEGORIES has {len(RISK_CATEGORIES)} entries but the loaded model "
        f"predicts {model.config.num_labels} labels."
    )

# Set model to evaluation mode (disables dropout for inference)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

app = FastAPI()

# Request body schema for the /estimate_risk/ endpoint.
class Requirement(BaseModel):
    # The requirement statement whose risk should be estimated.
    text: str

@app.post("/estimate_risk/")
def estimate_risk_endpoint(requirement: Requirement):
    """
    Estimates the risk level of a given requirement text using the trained RoBERTa model.

    Declared as a plain ``def`` on purpose: the forward pass is blocking,
    compute-bound work, and FastAPI runs non-async path functions in a worker
    thread instead of on the event loop.

    Args:
        requirement: request body carrying the requirement text.

    Returns:
        A dict with the echoed ``requirement_text`` and the ``estimated_risk``
        taken from ``RISK_CATEGORIES``.

    Raises:
        HTTPException: status 500 if the predicted index has no entry in
            ``RISK_CATEGORIES``.
    """
    # Imported locally because the file-level import only brings in FastAPI.
    from fastapi import HTTPException

    inputs = tokenizer(requirement.text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)

    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits.cpu()
    pred = torch.argmax(logits, dim=1).item()

    # Map the predicted index back to the original risk level. A prediction
    # outside the mapping is a server-side misconfiguration, so report it as
    # an HTTP error instead of hiding an error string in a 200 response.
    if not 0 <= pred < len(RISK_CATEGORIES):
        raise HTTPException(status_code=500, detail="Error: Could not map prediction to a risk level.")

    return {"requirement_text": requirement.text, "estimated_risk": RISK_CATEGORIES[pred]}

# To run this FastAPI application:
# 1. Save the code above as main.py
# 2. Open your terminal, navigate to the directory where you saved main.py
# 3. Run the command: uvicorn main:app --reload
# The API will be running on http://127.0.0.1:8000
# You can then send POST requests to http://127.0.0.1:8000/estimate_risk/