In [3]:
# Install necessary libraries
!pip install gensim transformers torch scikit-learn xgboost nltk

import pandas as pd
import numpy as np
import random
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from transformers import AutoTokenizer, AutoModel
from gensim.models import Doc2Vec
from tqdm import tqdm
import torch
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight
from huggingface_hub import hf_hub_download
from google.colab import files

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Download NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# Upload the CSV file
uploaded = files.upload()

# Load the dataset into a DataFrame
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Check if the 'labels' column was created correctly
print(df[['RequirementType', 'labels']].head())

# Text preprocessing function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join back to string
    text = ' '.join(words)
    return text

# Apply preprocessing to the 'content' column
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Check class distribution
print("Original class distribution:")
print(df['labels'].value_counts())

# Split data into train, validation, and test sets (70% train, 15% validation, 15% test)
y = df['labels'].values
df_train, df_temp, y_train, y_temp = train_test_split(df, y, test_size=0.3, stratify=y, random_state=42)
df_val, df_test, y_val, y_test = train_test_split(df_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Download and load the fine-tuned Doc2Vec model from Hugging Face
model_path = hf_hub_download(repo_id="RafidMehda/doc2vec_model", filename="doc2vec_model")
doc2vec_model = Doc2Vec.load(model_path)

# Function to get Doc2Vec embeddings
def get_doc2vec_embeddings(text):
    words = text.split()  # Assuming text is already preprocessed
    return doc2vec_model.infer_vector(words)

# Generate Doc2Vec embeddings for training, validation, and test sets
print("Generating Doc2Vec embeddings...")
doc2vec_embeddings_train = [get_doc2vec_embeddings(doc) for doc in df_train['cleaned_content']]
doc2vec_embeddings_val = [get_doc2vec_embeddings(doc) for doc in df_val['cleaned_content']]
doc2vec_embeddings_test = [get_doc2vec_embeddings(doc) for doc in df_test['cleaned_content']]

# Load tokenizer and fine-tuned DistilBERT model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("RafidMehda/fined-distilBERT")
hf_model = AutoModel.from_pretrained("RafidMehda/fined-distilBERT")

# Function to get DistilBERT embeddings with alternative pooling strategies
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = hf_model(**inputs)
        hidden_states = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
        cls_embedding = hidden_states[:,0,:]  # CLS token embedding
        mean_pooling = torch.mean(hidden_states, dim=1)
        max_pooling = torch.max(hidden_states, dim=1).values
    return cls_embedding.squeeze().numpy(), mean_pooling.squeeze().numpy(), max_pooling.squeeze().numpy()

# Generate embeddings for training set
cls_embeddings_train = []
mean_embeddings_train = []
max_embeddings_train = []

print("Generating DistilBERT embeddings for training set...")
for text in tqdm(df_train['cleaned_content']):
    cls_emb, mean_emb, max_emb = get_embeddings(text)
    cls_embeddings_train.append(cls_emb)
    mean_embeddings_train.append(mean_emb)
    max_embeddings_train.append(max_emb)

# Generate embeddings for validation set
cls_embeddings_val = []
mean_embeddings_val = []
max_embeddings_val = []

print("Generating DistilBERT embeddings for validation set...")
for text in tqdm(df_val['cleaned_content']):
    cls_emb, mean_emb, max_emb = get_embeddings(text)
    cls_embeddings_val.append(cls_emb)
    mean_embeddings_val.append(mean_emb)
    max_embeddings_val.append(max_emb)

# Generate embeddings for test set
cls_embeddings_test = []
mean_embeddings_test = []
max_embeddings_test = []

print("Generating DistilBERT embeddings for test set...")
for text in tqdm(df_test['cleaned_content']):
    cls_emb, mean_emb, max_emb = get_embeddings(text)
    cls_embeddings_test.append(cls_emb)
    mean_embeddings_test.append(mean_emb)
    max_embeddings_test.append(max_emb)

# Convert embeddings to numpy arrays
doc2vec_embeddings_train = np.array(doc2vec_embeddings_train)
cls_embeddings_train = np.array(cls_embeddings_train)
mean_embeddings_train = np.array(mean_embeddings_train)
max_embeddings_train = np.array(max_embeddings_train)

doc2vec_embeddings_val = np.array(doc2vec_embeddings_val)
cls_embeddings_val = np.array(cls_embeddings_val)
mean_embeddings_val = np.array(mean_embeddings_val)
max_embeddings_val = np.array(max_embeddings_val)

doc2vec_embeddings_test = np.array(doc2vec_embeddings_test)
cls_embeddings_test = np.array(cls_embeddings_test)
mean_embeddings_test = np.array(mean_embeddings_test)
max_embeddings_test = np.array(max_embeddings_test)

# Combine embeddings
combined_embeddings_train = np.concatenate((doc2vec_embeddings_train, cls_embeddings_train, mean_embeddings_train, max_embeddings_train), axis=1)
combined_embeddings_val = np.concatenate((doc2vec_embeddings_val, cls_embeddings_val, mean_embeddings_val, max_embeddings_val), axis=1)
combined_embeddings_test = np.concatenate((doc2vec_embeddings_test, cls_embeddings_test, mean_embeddings_test, max_embeddings_test), axis=1)

# Apply PCA for dimensionality reduction (retain 95% variance)
pca = PCA(0.95, random_state=42)
pca.fit(combined_embeddings_train)

X_train_pca = pca.transform(combined_embeddings_train)
X_val_pca = pca.transform(combined_embeddings_val)
X_test_pca = pca.transform(combined_embeddings_test)

# Compute class weights for handling class imbalance
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i : class_weights[i] for i in range(len(class_weights))}
print("Class weights:", class_weight_dict)

# Initialize XGBoost classifier with class weights
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=class_weight_dict[0]/class_weight_dict[1],
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train_pca, y_train)

# Function to compute predictions and evaluate performance
def evaluate_model(model, X, y, dataset_name):
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:,1] if hasattr(model, "predict_proba") else None
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(y, y_pred, target_names=['Non-Functional', 'Functional'])
    print(f"\n{dataset_name} Classification Report:")
    print(report)
    print(f"{dataset_name} Accuracy: {accuracy * 100:.2f}%")
    if y_pred_proba is not None:
        auc = roc_auc_score(y, y_pred_proba)
        print(f"{dataset_name} AUC-ROC: {auc:.2f}")
    cm = confusion_matrix(y, y_pred)
    print(f"{dataset_name} Confusion Matrix:\n{cm}")

# Evaluate XGBoost model on validation and test sets
print("\nEvaluating XGBoost Model on Validation Set")
evaluate_model(xgb_model, X_val_pca, y_val, "Validation Set")

print("\nEvaluating XGBoost Model on Test Set")
evaluate_model(xgb_model, X_test_pca, y_test, "Test Set")

# Ensemble modeling using VotingClassifier
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
svc = SVC(probability=True, class_weight='balanced', random_state=42)
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Fit individual models
lr.fit(X_train_pca, y_train)
svc.fit(X_train_pca, y_train)
rf.fit(X_train_pca, y_train)

voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('svc', svc), ('rf', rf), ('xgb', xgb_model)],
    voting='soft'
)
voting_clf.fit(X_train_pca, y_train)

# Evaluate Voting Classifier on validation and test sets
print("\nEvaluating Voting Classifier on Validation Set")
evaluate_model(voting_clf, X_val_pca, y_val, "Validation Set")

print("\nEvaluating Voting Classifier on Test Set")
evaluate_model(voting_clf, X_test_pca, y_test, "Test Set")

# Ensemble modeling using StackingClassifier
estimators = [
    ('lr', lr),
    ('svc', svc),
    ('rf', rf)
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(class_weight='balanced', random_state=42),
    cv=5
)
stacking_clf.fit(X_train_pca, y_train)

# Evaluate Stacking Classifier on validation and test sets
print("\nEvaluating Stacking Classifier on Validation Set")
evaluate_model(stacking_clf, X_val_pca, y_val, "Validation Set")

print("\nEvaluating Stacking Classifier on Test Set")
evaluate_model(stacking_clf, X_test_pca, y_test, "Test Set")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews (1).csv
  RequirementType  labels
0               F       1
1              NF       0
2               F       1
3              NF       0
4              NF       0
Original class distribution:
labels
0    6943
1    5552
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


doc2vec_model:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

Generating Doc2Vec embeddings...


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Generating DistilBERT embeddings for training set...


100%|██████████| 8746/8746 [09:10<00:00, 15.90it/s]


Generating DistilBERT embeddings for validation set...


100%|██████████| 1874/1874 [01:58<00:00, 15.80it/s]


Generating DistilBERT embeddings for test set...


100%|██████████| 1875/1875 [01:57<00:00, 15.90it/s]


Class weights: {0: 0.8997942386831276, 1: 1.1253216675244466}


Parameters: { "use_label_encoder" } are not used.




Evaluating XGBoost Model on Validation Set

Validation Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.93      0.91      0.92      1041
    Functional       0.89      0.91      0.90       833

      accuracy                           0.91      1874
     macro avg       0.91      0.91      0.91      1874
  weighted avg       0.91      0.91      0.91      1874

Validation Set Accuracy: 91.36%
Validation Set AUC-ROC: 0.97
Validation Set Confusion Matrix:
[[951  90]
 [ 72 761]]

Evaluating XGBoost Model on Test Set

Test Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.92      0.92      0.92      1042
    Functional       0.90      0.90      0.90       833

      accuracy                           0.91      1875
     macro avg       0.91      0.91      0.91      1875
  weighted avg       0.91      0.91      0.91      1875

Test Set Accuracy: 91.15%
Test Set AUC-ROC: 0.97
Te

Parameters: { "use_label_encoder" } are not used.




Evaluating Voting Classifier on Validation Set

Validation Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.96      0.88      0.91      1041
    Functional       0.86      0.95      0.90       833

      accuracy                           0.91      1874
     macro avg       0.91      0.91      0.91      1874
  weighted avg       0.91      0.91      0.91      1874

Validation Set Accuracy: 90.93%
Validation Set AUC-ROC: 0.97
Validation Set Confusion Matrix:
[[914 127]
 [ 43 790]]

Evaluating Voting Classifier on Test Set

Test Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.94      0.88      0.91      1042
    Functional       0.86      0.94      0.90       833

      accuracy                           0.91      1875
     macro avg       0.90      0.91      0.91      1875
  weighted avg       0.91      0.91      0.91      1875

Test Set Accuracy: 90.56%
Test Set AUC-ROC: