In [1]:
!pip install gensim transformers torch scikit-learn tqdm xgboost nltk
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Doc2Vec
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from xgboost import XGBClassifier
from huggingface_hub import hf_hub_download




In [2]:
# Import required library
from google.colab import files

# Upload the file
uploaded = files.upload()

# Assuming the uploaded file is named 'reviews.csv'
df = pd.read_csv(next(iter(uploaded)))  # Load the uploaded CSV into a DataFrame

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Tokenize documents into sentences
nltk.download('punkt')
df['sentences'] = df['content'].apply(sent_tokenize)

# Display sample
print(df[['content', 'sentences', 'labels']].head())


Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                             content  \
0                      I cannot open the app anymore   
1  I have been begging for a refund from this app...   
2  Very costly for the premium version (approx In...   
3  Used to keep me organized, but all the 2020 UP...   
4                                Dan Birthday Oct 28   

                                           sentences  labels  
0                    [I cannot open the app anymore]       1  
1  [I have been begging for a refund from this ap...       0  
2  [Very costly for the premium version (approx I...       1  
3  [Used to keep me organized, but all the 2020 U...       0  
4                              [Dan Birthday Oct 28]       0  


In [3]:
# Download and load the Doc2Vec model from Hugging Face
model_path = hf_hub_download(repo_id="RafidMehda/doc2vec_model", filename="doc2vec_model")
doc2vec_model = Doc2Vec.load(model_path)

# Load pre-trained DistilBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("RafidMehda/fined-distilBERT")
distilbert_model = AutoModel.from_pretrained("RafidMehda/fined-distilBERT")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


doc2vec_model:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:
# Function to get Doc2Vec embeddings
def get_doc2vec_embeddings(index):
    doc2vec_emb = doc2vec_model.dv[str(index)]
    return doc2vec_emb

# Apply the function to extract Doc2Vec embeddings
doc2vec_embeddings = [get_doc2vec_embeddings(i) for i in range(len(df))]


In [5]:
# Function to get DistilBERT embeddings for sentences in a document
def get_distilbert_sentence_embeddings(sentences):
    sentence_embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = distilbert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            pooled_embedding = torch.mean(last_hidden_state, dim=1)  # Average pooling
        sentence_embeddings.append(pooled_embedding.squeeze().numpy())
    return sentence_embeddings

# Apply the function to get DistilBERT embeddings for each document's sentences
df['sentence_embeddings'] = df['sentences'].apply(get_distilbert_sentence_embeddings)


In [6]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn_weights = nn.Parameter(torch.Tensor(hidden_dim, 1))
        nn.init.xavier_uniform_(self.attn_weights)

    def forward(self, hidden_states):
        # hidden_states shape: (batch_size, num_sentences, hidden_dim)
        attn_scores = torch.tanh(torch.matmul(hidden_states, self.attn_weights)).squeeze(-1)
        attn_weights = F.softmax(attn_scores, dim=1)
        weighted_sum = torch.bmm(attn_weights.unsqueeze(1), hidden_states).squeeze(1)
        return weighted_sum, attn_weights


In [9]:
import torch

# Define the attention model for sentence-level embeddings
sentence_attention = Attention(hidden_dim=768)  # Assuming DistilBERT's hidden dim is 768

# Convert the list of sentence embeddings into a tensor in one step
sentence_attention_outputs = []
for emb_list in df['sentence_embeddings']:
    # Convert list of numpy arrays to a single numpy array, then to tensor
    sentence_embs = torch.tensor(np.array(emb_list))  # Efficient tensor conversion

    # Ensure the tensor has the shape (num_sentences, hidden_dim)
    if len(sentence_embs.shape) == 2:  # Correct shape (num_sentences, hidden_dim)
        sentence_embs = sentence_embs.unsqueeze(0)  # Add batch dimension if necessary

    # Apply attention mechanism
    weighted_sum, _ = sentence_attention(sentence_embs)

    # Store the attention-weighted sentence embedding (detach from computation graph)
    sentence_attention_outputs.append(weighted_sum.detach().numpy())


In [11]:
# Concatenate Doc2Vec and attention-based DistilBERT embeddings
combined_embeddings = []  # List to store the combined embeddings

# Iterate over the Doc2Vec embeddings and sentence attention outputs
for doc2vec_emb, sentence_attn_emb in zip(doc2vec_embeddings, sentence_attention_outputs):
    # Flatten the sentence attention embedding (if it's 2D)
    sentence_attn_emb_flat = sentence_attn_emb.flatten()  # Flatten the 2D array to 1D

    # Concatenate the flattened sentence attention embedding with the 1D Doc2Vec embedding
    combined_embedding = np.concatenate((doc2vec_emb, sentence_attn_emb_flat))

    # Append the result to the list
    combined_embeddings.append(combined_embedding)

# Convert to numpy arrays for input
X = np.array(combined_embeddings)

# Labels from the dataset (already prepared earlier in your code)
y = df['labels'].values

# Split data into train, validation, and test sets (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Now X_train, X_val, X_test are the features (combined embeddings),
# and y_train, y_val, y_test are the labels (Functional vs. Non-Functional).


In [12]:
# Initialize and train XGBoost classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, scale_pos_weight=1, use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Function to compute predictions and evaluate performance
def get_predictions_and_evaluate(X, y, dataset_name):
    y_pred = xgb_model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print(f"\n{dataset_name} Classification Report:")
    print(classification_report(y, y_pred, target_names=['Non-Functional', 'Functional']))
    print(f"\n{dataset_name} Accuracy: {accuracy * 100:.2f}%")

# Evaluate on the train, validation, and test sets
get_predictions_and_evaluate(X_train, y_train, "Training Set")
get_predictions_and_evaluate(X_val, y_val, "Validation Set")
get_predictions_and_evaluate(X_test, y_test, "Test Set")


Parameters: { "use_label_encoder" } are not used.




Training Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00      4862
    Functional       1.00      1.00      1.00      3884

      accuracy                           1.00      8746
     macro avg       1.00      1.00      1.00      8746
  weighted avg       1.00      1.00      1.00      8746


Training Set Accuracy: 100.00%

Validation Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.98      0.97      0.98      1045
    Functional       0.97      0.97      0.97       829

      accuracy                           0.97      1874
     macro avg       0.97      0.97      0.97      1874
  weighted avg       0.97      0.97      0.97      1874


Validation Set Accuracy: 97.28%

Test Set Classification Report:
                precision    recall  f1-score   support

Non-Functional       0.98      0.98      0.98      1036
    Functional       0.97      0.97