In [1]:
!pip install scikit-learn seaborn matplotlib coral-ordinal tensorflow tensorflow_hub

Collecting coral-ordinal
  Downloading coral_ordinal-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Downloading coral_ordinal-0.1.8-py3-none-any.whl (9.1 kB)
Installing collected packages: coral-ordinal
Successfully installed coral-ordinal-0.1.8


In [2]:
!pip show coral-ordinal
!ls /usr/local/lib/python*/dist-packages/coral_ordinal

Name: coral-ordinal
Version: 0.1.8
Summary: Tensorflow Keras implementation of CORAL ordinal regression output layer, loss, activation, and metrics
Home-page: https://github.com/ck37/coral-ordinal
Author: Chris Kennedy, Stephen Matthews, Georg M. Goerg
Author-email: chrisken@gmail.com
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: numpy, tensorflow
Required-by: 
activations.py	layer.py  metrics.py   version.py
__init__.py	loss.py   __pycache__


In [3]:
!pip install numpy pandas scikit-learn tensorflow keras keras-tuner sentence-transformers

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sen

In [4]:
!pip install coral-ordinal



In [7]:
# --- Script 1: Hyperparameter Tuning with CoralOrdinal ---
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import train_test_split
import keras_tuner as kt
from sentence_transformers import SentenceTransformer
from coral_ordinal import CoralOrdinal, OrdinalCrossEntropy

# 1) Load data
# Changed to read_excel for .xlsx file
df = pd.read_excel('Combined_Training_Data_Final.xlsx')
df['Requirement_Text'] = df['Requirement_Text'].apply(lambda t: re.sub(r'[^a-z0-9\s]', '', str(t).lower()))
X_texts = df['Requirement_Text'].tolist()
y_ord = df['Requirement_Stability'].astype(np.int32).values.flatten() - 1  # 0-based

# 2) Embed
print("Loading HuggingFace model...")
embedder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print("Embedding...")
X_embeddings = embedder.encode(X_texts, batch_size=32, show_progress_bar=True)

# 3) Stratified split for tuning
X_train, X_val, y_train, y_val = train_test_split(
    X_embeddings, y_ord, test_size=0.2, stratify=y_ord, random_state=42
)

# 4) Define model builder
def build_model(hp):
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    units1 = hp.Int('units_1', 64, 256, step=32)
    model.add(Dense(units1, activation='relu'))
    if hp.Boolean('use_second'):
        units2 = hp.Int('units_2', 32, 128, step=32)
        model.add(Dense(units2, activation='relu'))
    dropout = hp.Float('dropout', 0.0, 0.4, step=0.1)
    model.add(Dropout(dropout))
    model.add(CoralOrdinal(num_classes=5))
    lr = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss=OrdinalCrossEntropy())
    return model

tuner = kt.BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=15,
    directory='kt_tuner_dir',
    project_name='ordinal_complexity_coral'
)

print("\n🔎 Starting Bayesian hyperparameter search...")
tuner.search(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=32,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=2
)

best_hp = tuner.get_best_hyperparameters(1)[0]
print("\n✅ Best hyperparameters found:", best_hp.values)

import pickle
with open('best_hyperparameters.pkl', 'wb') as f:
    pickle.dump(best_hp.values, f)
print("\n✅ Best hyperparameters saved to best_hyperparameters.pkl.")

Trial 15 Complete [00h 00m 10s]
val_loss: 1.301100254058838

Best val_loss So Far: 1.301100254058838
Total elapsed time: 00h 02m 22s

✅ Best hyperparameters found: {'units_1': 224, 'use_second': False, 'dropout': 0.4, 'learning_rate': 0.008235855327692132, 'units_2': 32}

✅ Best hyperparameters saved to best_hyperparameters.pkl.


In [8]:
# --- Script 2: K-Fold Evaluation with CoralOrdinal ---
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
from scipy.stats import spearmanr
from sentence_transformers import SentenceTransformer
import pickle
from coral_ordinal import CoralOrdinal, OrdinalCrossEntropy
import openpyxl # Import openpyxl engine

# 1) Load data
# Changed to read_excel for .xlsx file
df = pd.read_excel('Combined_Training_Data_Final.xlsx')
df['Requirement_Text'] = df['Requirement_Text'].apply(lambda t: re.sub(r'[^a-z0-9\s]', '', str(t).lower()))
X_texts = df['Requirement_Text'].tolist()
y_ord = df['Requirement_Stability'].astype(np.int32).values.flatten() - 1

# 2) Embed texts
print("Loading HuggingFace model...")
embedder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print("Embedding...")
X_embeddings = embedder.encode(X_texts, batch_size=32, show_progress_bar=True)

# 3) Load best hyperparameters
with open('best_hyperparameters.pkl', 'rb') as f:
    best_hp_values = pickle.load(f)
print("\n✅ Loaded best hyperparameters:", best_hp_values)

# 4) Utility: Convert cumulative probabilities to predicted ordinal labels
def prob_to_label(cum_probs):
    return np.sum(cum_probs > 0.5, axis=1)

# 5) Stratified K-Fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
mae_list, qwk_list, spearman_list = [], [], []

fold = 1
for train_idx, val_idx in kf.split(X_embeddings, y_ord):
    print(f"\n🚀 Fold {fold}/5")
    X_train, X_val = X_embeddings[train_idx], X_embeddings[val_idx]
    y_train, y_val = y_ord[train_idx], y_ord[val_idx]

    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(best_hp_values['units_1'], activation='relu'))
    if best_hp_values['use_second']:
        model.add(Dense(best_hp_values['units_2'], activation='relu'))
    model.add(Dropout(best_hp_values['dropout']))
    model.add(CoralOrdinal(num_classes=5))

    model.compile(optimizer=tf.keras.optimizers.Adam(best_hp_values['learning_rate']), loss=OrdinalCrossEntropy())

    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=30,
        batch_size=32,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
        verbose=0
    )

    logits_val = model.predict(X_val)
    predicted_probs = tf.sigmoid(logits_val).numpy()
    y_pred = prob_to_label(predicted_probs)

    y_val_1b, y_pred_1b = y_val + 1, y_pred + 1

    mae = mean_absolute_error(y_val_1b, y_pred_1b)
    qwk = cohen_kappa_score(y_val_1b, y_pred_1b, weights='quadratic')
    spearman_corr, _ = spearmanr(y_val_1b, y_pred_1b)

    print(f"Fold {fold} - MAE: {mae:.3f}, QWK: {qwk:.3f}, Spearman: {spearman_corr:.3f}")
    mae_list.append(mae)
    qwk_list.append(qwk)
    spearman_list.append(spearman_corr)
    fold += 1

print("\n✅ Cross-validation complete!")
print(f"Average MAE: {np.mean(mae_list):.3f} ± {np.std(mae_list):.3f}")
print(f"Average QWK: {np.mean(qwk_list):.3f} ± {np.std(qwk_list):.3f}")
print(f"Average Spearman: {np.mean(spearman_list):.3f} ± {np.std(spearman_list):.3f}")

Loading HuggingFace model...
Embedding...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]


✅ Loaded best hyperparameters: {'units_1': 224, 'use_second': False, 'dropout': 0.4, 'learning_rate': 0.008235855327692132, 'units_2': 32}

🚀 Fold 1/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Fold 1 - MAE: 0.448, QWK: 0.205, Spearman: 0.291

🚀 Fold 2/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Fold 2 - MAE: 0.448, QWK: 0.496, Spearman: 0.585

🚀 Fold 3/5




[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 44ms/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Fold 3 - MAE: 0.483, QWK: 0.349, Spearman: 0.362

🚀 Fold 4/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Fold 4 - MAE: 0.500, QWK: 0.120, Spearman: 0.208

🚀 Fold 5/5
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Fold 5 - MAE: 0.362, QWK: 0.516, Spearman: 0.628

✅ Cross-validation complete!
Average MAE: 0.448 ± 0.048
Average QWK: 0.337 ± 0.156
Average Spearman: 0.415 ± 0.165


In [9]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from sentence_transformers import SentenceTransformer
import pickle
import json
from coral_ordinal import CoralOrdinal, OrdinalCrossEntropy
import openpyxl # Import openpyxl engine
# --- 1. Load expert-annotated dataset ---
df = pd.read_excel('Combined_Training_Data_Final.xlsx')
df['Requirement_Text'] = df['Requirement_Text'].apply(lambda t: re.sub(r'[^a-z0-9\s]', '', str(t).lower()))
X_texts = df['Requirement_Text'].tolist()
y_ord = df['Requirement_Stability'].astype(np.int32).values.flatten() - 1  # 0-based

# --- 2. Embed texts using online HuggingFace SentenceTransformer ---
print("Loading HuggingFace sentence-transformer model...")
embedder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print("Embedding all requirement statements...")
X_embeddings = embedder.encode(X_texts, batch_size=32, show_progress_bar=True)

# --- 3. Load best hyperparameters ---
with open('best_hyperparameters.pkl', 'rb') as f:
    best_hp_values = pickle.load(f)
print("\n✅ Loaded best hyperparameters:", best_hp_values)

# --- 4. Build final model with best hyperparameters ---
model = Sequential()
model.add(Input(shape=(X_embeddings.shape[1],)))
model.add(Dense(best_hp_values['units_1'], activation='relu'))
if best_hp_values['use_second']:
    model.add(Dense(best_hp_values['units_2'], activation='relu'))
model.add(Dropout(best_hp_values['dropout']))
model.add(CoralOrdinal(num_classes=5))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_hp_values['learning_rate']),
    loss=OrdinalCrossEntropy()
)

# --- 5. Train final model on full dataset ---
print("\n🚀 Training final model on the full dataset...")
model.fit(
    X_embeddings, y_ord,
    epochs=30,
    batch_size=32,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

# --- 6. Save model & artifacts ---
print("\n💾 Saving model and related files...")

# ✅ Save the model in native Keras format (Keras 3 compatible)
model.save('final_complexity_model.keras')

# ✅ Save hyperparameters
with open('final_best_hyperparameters.pkl', 'wb') as f:
    pickle.dump(best_hp_values, f)

# ✅ Save preprocessing details
preprocessing_info = {
    "label_offset": -1,
    "cleaning": "lowercase + remove non-alphanumerics",
    "embedding_model": "sentence-transformers/all-mpnet-base-v2 (online)"
}
with open('preprocessing_info.json', 'w') as f:
    json.dump(preprocessing_info, f, indent=2)

print("\n✅ All artifacts saved:")
print("  - final_complexity_model.keras")
print("  - final_best_hyperparameters.pkl")
print("  - preprocessing_info.json")

Loading HuggingFace sentence-transformer model...
Embedding all requirement statements...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]


✅ Loaded best hyperparameters: {'units_1': 224, 'use_second': False, 'dropout': 0.4, 'learning_rate': 0.008235855327692132, 'units_2': 32}

🚀 Training final model on the full dataset...
Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 2.6435
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 2.4367
Epoch 3/30


  current = self.get_monitor_value(logs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 2.3369
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.2006  
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2.1036 
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.9971 
Epoch 7/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.9017 
Epoch 8/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.8496 
Epoch 9/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.7867 
Epoch 10/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 1.7217
Epoch 11/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.6335 
Epoch 12/30
[1m 1/10[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 73ms/step - loss: 1.5695

  current = self.get_monitor_value(logs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.5708 
Epoch 13/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.5216 
Epoch 14/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.4514 
Epoch 15/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.4272 
Epoch 16/30
[1m 1/10[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 27ms/step - loss: 1.3536

  current = self.get_monitor_value(logs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.3598 
Epoch 17/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.3538
Epoch 18/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.2981 
Epoch 19/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.2784 
Epoch 20/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.2325 
Epoch 21/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.1861 
Epoch 22/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.1513 
Epoch 23/30
[1m 1/10[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 81ms/step - loss: 1.1254

  current = self.get_monitor_value(logs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.1274 
Epoch 24/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0720 
Epoch 25/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.0729 
Epoch 26/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0533 
Epoch 27/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0444 
Epoch 28/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0138 
Epoch 29/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.9809 
Epoch 30/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.9759 

💾 Saving model and related files...

✅ All artifacts saved:
  - final_complexity_model.keras
  - final_best_hyperparameters.pkl
  - preprocessing_in