In [1]:
import pandas as pd
from openai import OpenAI
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [None]:
df = pd.read_csv("medical_req_dataset.csv")

In [12]:
def get_openai_client():
    with open("../openai_api_key.txt", "r") as file:
        openai_api_key = file.read().strip()
        return OpenAI(api_key=openai_api_key)

In [4]:
def generate_label(model, client, text):
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": f"Classify the following requirement as either 'FR' (Functional "
                                                  f"Requirement) or 'NFR' (Non-Functional Requirement): {text}"}]
        )
        label = response.choices[0].message.content.strip()
        return label
    except Exception as e:
        print(f"Error generating label for text: {text}\n{e}")
        return None

In [5]:
def generate_labeled_dataset(model):
    client = get_openai_client()
    model_id = model.split("::")[1]

    for index, row in df.iterrows():
        text = row["Requirement Text"]
        gen_label = generate_label(model, client, text)
        df.at[index, "Generated_Label"] = gen_label

    df.to_csv(f"{model_id}_labeled_dataset.csv", index=False)

In [9]:
def metrics(model):
    df = pd.read_csv(f"{model}_labeled_dataset.csv")
    true_labels = df['Label']
    generated_labels = df['Generated_Label']

    accuracy = accuracy_score(true_labels, generated_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, generated_labels, average="weighted")

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    print("\nClassification Report:")
    print(classification_report(true_labels, generated_labels))

In [10]:
model1 = "ft:gpt-4o-mini-2024-07-18:personal::APIxXqIF" #PURE
model2 = "ft:gpt-4o-mini-2024-07-18:personal::APdkJsuS" #PROMISE

generate_labeled_dataset(model1)
model_id = model1.split("::")[1]
metrics(model_id)


generate_labeled_dataset(model2)
model_id = model2.split("::")[1]
metrics(model_id)

Accuracy: 0.95625
Precision: 0.9569125234521575
Recall: 0.95625
F1 Score: 0.9562448728465955

Classification Report:
              precision    recall  f1-score   support

          FR       0.94      0.97      0.96       158
         NFR       0.97      0.94      0.96       162

    accuracy                           0.96       320
   macro avg       0.96      0.96      0.96       320
weighted avg       0.96      0.96      0.96       320
Accuracy: 0.765625
Precision: 0.8098115560949299
Recall: 0.765625
F1 Score: 0.7590278478458279

Classification Report:
              precision    recall  f1-score   support

          FR       0.69      0.95      0.80       158
         NFR       0.92      0.59      0.72       162
  proposalFR       0.00      0.00      0.00         0

    accuracy                           0.77       320
   macro avg       0.54      0.51      0.51       320
weighted avg       0.81      0.77      0.76       320


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
