In [2]:
# Step 1: Install necessary libraries
!pip install gensim transformers torch scikit-learn tqdm

import pandas as pd
from google.colab import files
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from huggingface_hub import hf_hub_download
from gensim.models import Doc2Vec
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification

# Upload the CSV file (Colab returns a dict keyed by uploaded filename)
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled upload surfaces as a bare StopIteration below.
    raise ValueError("No file was uploaded; a CSV dataset is required.")

# Load the dataset into a DataFrame
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Series.map() yields NaN for any value missing from label_mapping; fail here
# with a clear message instead of much later inside the metric functions.
unmapped_rows = df['labels'].isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, 'RequirementType'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a RequirementType outside "
        f"{sorted(label_mapping)}: {unexpected}"
    )
df['labels'] = df['labels'].astype(int)

# Check if the 'labels' column was created correctly
print(df[['RequirementType', 'labels']].head())

# Fetch the fine-tuned Doc2Vec checkpoint from the Hugging Face Hub, then load it with gensim
model_path = hf_hub_download(
    repo_id="RafidMehda/doc2vec_model",
    filename="doc2vec_model",
)
doc2vec_model = Doc2Vec.load(model_path)

# Look up one stored document vector per DataFrame row.
# NOTE(review): this assumes the model's document tags are the stringified row
# positions "0".."len(df)-1" of this same CSV — confirm against how the model was trained.
doc2vec_embeddings = [doc2vec_model.dv[tag] for tag in map(str, range(len(df)))]

# Base (not fine-tuned) DistilBERT tokenizer and encoder, used only to produce embeddings
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_distilbert_embeddings(text):
    """Return a mean-pooled DistilBERT embedding for ``text``.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array of the pooled last-hidden-state vectors. Singleton
        dimensions are squeezed out, so a single input yields a 1-D vector.

    The mean is taken over real tokens only: positions that the tokenizer
    added as padding (attention mask 0) are excluded. For a single string no
    padding is added, so this equals a plain mean over the sequence axis.
    """
    # Tokenize the text (truncated to 128 tokens; padded to the longest item in the batch)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Get DistilBERT embeddings without tracking gradients
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
        last_hidden_state = outputs.last_hidden_state

        # Weight each token by its attention mask so pad tokens do not dilute the average
        token_mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled_embedding = summed / token_counts

    # .cpu() is a no-op on CPU tensors and keeps .numpy() valid if the model is ever moved to GPU
    return pooled_embedding.squeeze().cpu().numpy()

# Embed every entry of the 'content' column with DistilBERT (one call per document)
distilbert_embeddings = list(map(get_distilbert_embeddings, df['content']))

# Concatenate each row's Doc2Vec vector with its DistilBERT vector into one feature vector.
# NOTE: combined_embeddings is not referenced again in the visible part of this cell.
combined_embeddings = [
    np.concatenate(embedding_pair)
    for embedding_pair in zip(doc2vec_embeddings, distilbert_embeddings)
]

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fine-tuned tokenizer and sequence-classification model from the Hugging Face Hub;
# the model is placed on the selected device immediately after loading
classifier_tokenizer = DistilBertTokenizer.from_pretrained('RafidMehda/app_review_model')
classifier_model = DistilBertForSequenceClassification.from_pretrained(
    'RafidMehda/app_review_model'
).to(device)

# Ten shuffled folds with a fixed seed so the split is reproducible
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Number of documents sent through the classifier per forward pass
batch_size = 16

# Per-fold validation accuracies, filled in by the cross-validation loop
fold_accuracies = []

# Helper function for prediction
def get_predictions(loader):
    """Run the classifier over ``loader`` and return the predicted class ids.

    Each batch from ``loader`` must unpack into ``(input_ids, attention_mask)``.
    The model is put in eval mode and run without gradient tracking; the
    per-batch argmax results are moved to the CPU and concatenated into a
    single 1-D tensor.
    """
    classifier_model.eval()
    batch_preds = []
    with torch.no_grad():
        for ids, mask in tqdm(loader, desc="Processing Batches"):
            model_output = classifier_model(
                input_ids=ids.to(device),
                attention_mask=mask.to(device),
            )
            # Highest-scoring class per example
            batch_preds.append(torch.argmax(model_output.logits, dim=1).cpu())
    return torch.cat(batch_preds)

# Cross-validation loop
# NOTE: classifier_model is loaded already fine-tuned and is never trained in this
# loop, so each fold only evaluates the same fixed model on a disjoint slice of the
# data. The training indices are therefore not needed, and the training split is no
# longer tokenized or batched (it was previously built on every fold and never used).
# NOTE(review): if the checkpoint was fine-tuned on this same CSV, these scores are
# not held-out estimates — confirm how the checkpoint was produced.
for fold, (_, val_index) in enumerate(kf.split(df)):
    print(f"Fold {fold + 1}/{n_splits}")

    # Select this fold's validation texts and labels
    X_val = df['content'].iloc[val_index]
    y_val = df['labels'].iloc[val_index]

    # Tokenize the validation texts for the classifier model
    val_inputs = classifier_tokenizer(list(X_val), return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Batch the tokenized inputs; get_predictions moves each batch to the device itself
    val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'])
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Get predictions for the validation set (already returned on the CPU)
    val_preds = get_predictions(val_loader).numpy()

    # Evaluate accuracy for this fold
    fold_accuracy = accuracy_score(y_val, val_preds)
    fold_accuracies.append(fold_accuracy)

    print(f"Fold {fold + 1} Validation Accuracy: {fold_accuracy * 100:.2f}%")
    print(f"Fold {fold + 1} Classification Report:\n")
    # labels=[0, 1] pins the label/name pairing so the report does not raise
    # when a fold happens to contain (or predict) only one of the two classes.
    print(classification_report(y_val, val_preds, labels=[0, 1], target_names=['Non-Functional', 'Functional']))

# Calculate the average accuracy across all folds
average_accuracy = np.mean(fold_accuracies)
print(f"\nAverage Validation Accuracy across {n_splits} folds: {average_accuracy * 100:.2f}%")




Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv
  RequirementType  labels
0               F       1
1              NF       0
2               F       1
3              NF       0
4              NF       0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


doc2vec_model:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Fold 1/10


Processing Batches: 100%|██████████| 79/79 [00:05<00:00, 15.18it/s]


Fold 1 Validation Accuracy: 99.76%
Fold 1 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       690
    Functional       1.00      1.00      1.00       560

      accuracy                           1.00      1250
     macro avg       1.00      1.00      1.00      1250
  weighted avg       1.00      1.00      1.00      1250

Fold 2/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.64it/s]


Fold 2 Validation Accuracy: 99.28%
Fold 2 Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.99      0.99      0.99       712
    Functional       0.99      0.99      0.99       538

      accuracy                           0.99      1250
     macro avg       0.99      0.99      0.99      1250
  weighted avg       0.99      0.99      0.99      1250

Fold 3/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.59it/s]


Fold 3 Validation Accuracy: 100.00%
Fold 3 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       680
    Functional       1.00      1.00      1.00       570

      accuracy                           1.00      1250
     macro avg       1.00      1.00      1.00      1250
  weighted avg       1.00      1.00      1.00      1250

Fold 4/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.62it/s]


Fold 4 Validation Accuracy: 99.84%
Fold 4 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       697
    Functional       1.00      1.00      1.00       553

      accuracy                           1.00      1250
     macro avg       1.00      1.00      1.00      1250
  weighted avg       1.00      1.00      1.00      1250

Fold 5/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.43it/s]


Fold 5 Validation Accuracy: 100.00%
Fold 5 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       709
    Functional       1.00      1.00      1.00       541

      accuracy                           1.00      1250
     macro avg       1.00      1.00      1.00      1250
  weighted avg       1.00      1.00      1.00      1250

Fold 6/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.13it/s]


Fold 6 Validation Accuracy: 99.84%
Fold 6 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       682
    Functional       1.00      1.00      1.00       567

      accuracy                           1.00      1249
     macro avg       1.00      1.00      1.00      1249
  weighted avg       1.00      1.00      1.00      1249

Fold 7/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.20it/s]


Fold 7 Validation Accuracy: 99.92%
Fold 7 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       680
    Functional       1.00      1.00      1.00       569

      accuracy                           1.00      1249
     macro avg       1.00      1.00      1.00      1249
  weighted avg       1.00      1.00      1.00      1249

Fold 8/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.10it/s]


Fold 8 Validation Accuracy: 99.92%
Fold 8 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       713
    Functional       1.00      1.00      1.00       536

      accuracy                           1.00      1249
     macro avg       1.00      1.00      1.00      1249
  weighted avg       1.00      1.00      1.00      1249

Fold 9/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 19.06it/s]


Fold 9 Validation Accuracy: 100.00%
Fold 9 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       692
    Functional       1.00      1.00      1.00       557

      accuracy                           1.00      1249
     macro avg       1.00      1.00      1.00      1249
  weighted avg       1.00      1.00      1.00      1249

Fold 10/10


Processing Batches: 100%|██████████| 79/79 [00:04<00:00, 18.87it/s]

Fold 10 Validation Accuracy: 100.00%
Fold 10 Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00       688
    Functional       1.00      1.00      1.00       561

      accuracy                           1.00      1249
     macro avg       1.00      1.00      1.00      1249
  weighted avg       1.00      1.00      1.00      1249


Average Validation Accuracy across 10 folds: 99.86%



