In [1]:
# Import necessary libraries
!pip install gensim transformers torch scikit-learn tqdm xgboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModel
from gensim.models import Doc2Vec
from tqdm import tqdm
import torch  # Make sure to import torch for Hugging Face models
from xgboost import XGBClassifier  # XGBoost for classification
from huggingface_hub import hf_hub_download
from google.colab import files

# Upload the CSV file
uploaded = files.upload()
if not uploaded:
    # A cancelled upload yields nothing to iterate; fail with a readable
    # message instead of a bare StopIteration from next(iter(...)).
    raise ValueError("No file was uploaded; upload the dataset CSV to continue.")

# Load the dataset into a DataFrame
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset

# Fail fast if the columns this notebook reads are absent, before any model download
missing_columns = {'RequirementType', 'content'} - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Values outside the mapping become NaN under .map(); surface them here rather
# than letting NaN labels reach the classifier.
unmapped_rows = df['labels'].isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, 'RequirementType'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a RequirementType outside "
        f"{list(label_mapping)}: {unexpected_values}"
    )
df['labels'] = df['labels'].astype(int)

# Check if the 'labels' column was created correctly
print(df[['RequirementType', 'labels']].head())

# Download and load the Doc2Vec model from Hugging Face
# NOTE(review): gensim's load() unpickles the file, which can execute arbitrary
# code; only load models from repositories you trust.
model_path = hf_hub_download(repo_id="RafidMehda/doc2vec_model", filename="doc2vec_model")
doc2vec_model = Doc2Vec.load(model_path)

# Extract Doc2Vec embeddings for each document in the dataset
def get_doc2vec_embeddings(index):
    """Return the stored Doc2Vec document vector tagged with ``str(index)``.

    Args:
        index: Row position of the document; it is converted to a string and
            used as the lookup key in ``doc2vec_model.dv``.

    Returns:
        Whatever ``doc2vec_model.dv`` holds under that tag.
    """
    tag = str(index)
    return doc2vec_model.dv[tag]

# Row positions double as the Doc2Vec tags, so fetch one stored vector per row
doc2vec_embeddings = list(map(get_doc2vec_embeddings, range(len(df))))

# Fetch the tokenizer for the fine-tuned model from the Hugging Face Hub ...
tokenizer = AutoTokenizer.from_pretrained("RafidMehda/fined-distilBERT")
# ... and the matching encoder weights from the same repository
hf_model = AutoModel.from_pretrained("RafidMehda/fined-distilBERT")

# Function to get embeddings from the fine-tuned model with average pooling
def get_finetuned_embeddings(text):
    """Embed text with the fine-tuned model using attention-masked mean pooling.

    Args:
        text: A single string, or a list of strings, to embed. Inputs are
            truncated to 128 tokens.

    Returns:
        A NumPy array holding the pooled embedding with size-1 dimensions
        squeezed out, so a single string yields a 1-D vector.

    Notes:
        Padding positions are excluded from the average through the
        tokenizer's attention mask. For one unpadded string the mask is all
        ones, so the result is the plain mean over tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Keep the inputs on whatever device the model lives on (CPU or GPU).
    inputs = {name: tensor.to(hf_model.device) for name, tensor in inputs.items()}
    with torch.no_grad():  # Inference only: skip gradient tracking
        last_hidden_state = hf_model(**inputs).last_hidden_state
        # Broadcast the mask over the hidden dimension so padded tokens add 0.
        mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
        token_sum = (last_hidden_state * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled_embedding = token_sum / token_count  # Masked average pooling
    # .cpu() is a no-op on CPU and is required before .numpy() on GPU tensors.
    return pooled_embedding.squeeze().cpu().numpy()

# Generate embeddings using the fine-tuned model for the dataset
finetuned_embeddings = [get_finetuned_embeddings(doc) for doc in df['content']]

# Combine Doc2Vec and fine-tuned model embeddings
combined_embeddings = [np.concatenate((doc2vec_emb, finetuned_emb)) for doc2vec_emb, finetuned_emb in zip(doc2vec_embeddings, finetuned_embeddings)]

# Convert to numpy arrays for input
X = np.array(combined_embeddings)
y = df['labels'].values
# zip() above truncates silently, so confirm every row kept its feature vector
assert X.shape[0] == len(y), "feature rows and labels are misaligned"

# Split data into train, validation, and test sets (70% train, 15% validation, 15% test)
# NOTE(review): both embedding models are loaded pre-trained; if either was
# fitted on these same documents, validation/test scores will be optimistic — confirm.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize and train XGBoost classifier
# - `use_label_encoder` is dropped: the installed XGBoost ignores it and only
#   emits a "Parameters ... are not used" warning.
# - 'logloss' is the binary log-loss metric; 'mlogloss' is its multiclass variant.
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, scale_pos_weight=1, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Function to compute predictions and evaluate performance
def get_predictions_and_evaluate(X, y, dataset_name, model=None):
    """Predict labels for one data split and print its report and accuracy.

    Args:
        X: Feature matrix passed to the classifier's ``predict``.
        y: True labels for ``X`` (0 = Non-Functional, 1 = Functional).
        dataset_name: Split name used in the printed headings.
        model: Optional fitted classifier exposing ``predict``. Defaults to
            the notebook-level ``xgb_model``.

    Returns:
        None. The classification report and accuracy are printed only.
    """
    classifier = xgb_model if model is None else model
    y_pred = classifier.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print(f"\n{dataset_name} Classification Report:")
    # Pin the label order so the two target names always line up, even when a
    # split happens to contain only one class (otherwise the report raises).
    print(classification_report(y, y_pred, labels=[0, 1], target_names=['Non-Functional', 'Functional']))
    print(f"\n{dataset_name} Accuracy: {accuracy * 100:.2f}%")

# Report metrics for each split in turn: training, then validation, then test
for split_features, split_labels, split_name in (
    (X_train, y_train, "Training Set"),
    (X_val, y_val, "Validation Set"),
    (X_test, y_test, "Test Set"),
):
    get_predictions_and_evaluate(split_features, split_labels, split_name)




Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv
  RequirementType  labels
0               F       1
1              NF       0
2               F       1
3              NF       0
4              NF       0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


doc2vec_model:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Parameters: { "use_label_encoder" } are not used.




Training Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00      4862
    Functional       1.00      1.00      1.00      3884

      accuracy                           1.00      8746
     macro avg       1.00      1.00      1.00      8746
  weighted avg       1.00      1.00      1.00      8746


Training Set Accuracy: 100.00%

Validation Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.97      0.98      0.97      1045
    Functional       0.97      0.97      0.97       829

      accuracy                           0.97      1874
     macro avg       0.97      0.97      0.97      1874
  weighted avg       0.97      0.97      0.97      1874


Validation Set Accuracy: 97.17%

Test Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.98      0.98      0.98      1036
    Functional       0.98      0.97