In [3]:
# Step 1: Install necessary libraries
# NOTE: the leading "!" is IPython/Colab shell syntax, so this line only works inside a notebook.
!pip install gensim transformers torch scikit-learn tqdm

import pandas as pd
from google.colab import files

# Upload the CSV file
# (files.upload() presumably returns a mapping keyed by uploaded filename - confirm against the Colab docs)
uploaded = files.upload()

# Load the dataset into a DataFrame
# next(iter(uploaded)) takes the first key of `uploaded` and passes it to read_csv as the path.
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Series.map yields NaN for any value that is not a key of label_mapping
# (including missing values). Fail fast here rather than letting NaN labels
# reach the metric calls at the end of the script, after the slow inference.
unmapped_mask = df['labels'].isna()
if unmapped_mask.any():
    unexpected_values = df.loc[unmapped_mask, 'RequirementType'].unique().tolist()
    raise ValueError(f"Unmapped RequirementType values: {unexpected_values}")

# Every row is mapped at this point, so store the labels as plain integers.
df['labels'] = df['labels'].astype(int)

# Check if the 'labels' column was created correctly
print(df[['RequirementType', 'labels']].head())

from huggingface_hub import hf_hub_download
from gensim.models import Doc2Vec

# Step 5: Download the model file from Hugging Face
# The value returned by hf_hub_download is handed to Doc2Vec.load below as the model path.
model_path = hf_hub_download(repo_id="RafidMehda/doc2vec_model", filename="doc2vec_model")

# Step 6: Load the fine-tuned Doc2Vec model
doc2vec_model = Doc2Vec.load(model_path)

# Extract Doc2Vec embeddings for each document in the dataset
# Vectors are looked up by the string keys "0" .. str(len(df) - 1).
# NOTE(review): this assumes the model was trained with exactly those tags, in this CSV's row order - confirm.
doc2vec_embeddings = [doc2vec_model.dv[str(i)] for i in range(len(df))]

from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Step 8: Load DistilBERT tokenizer and model (the 'distilbert-base-uncased' checkpoint)
# NOTE: the global name `tokenizer` is rebound further down in this file to the
# 'RafidMehda/app_review_model' tokenizer; get_distilbert_embeddings reads the global,
# so it uses whichever tokenizer is bound at the time it is called.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_distilbert_embeddings(text):
    """Return a mean-pooled DistilBERT embedding for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 128 tokens) and passed through the module-level ``distilbert_model``
    with gradients disabled. Token vectors from the last hidden state are
    averaged using the attention mask, so padding positions do not contribute
    to the result when several texts of different lengths are passed at once.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array holding the pooled embedding, with singleton dimensions
        squeezed out (so a single string yields a 1-D vector).
    """
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)  # Reduced max_length to 128

    # Get DistilBERT embeddings
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
        last_hidden_state = outputs.last_hidden_state

        # Mask-aware average pooling: zero out padded positions, then divide by
        # the number of real tokens instead of the padded sequence length.
        # For a single string the mask is all ones, so this equals a plain mean.
        token_mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled_embedding = summed / token_counts
    return pooled_embedding.squeeze().numpy()

# Step 9: Embed every entry of the 'content' column with DistilBERT (one call per document)
distilbert_embeddings = list(map(get_distilbert_embeddings, df['content']))

# Join each document's Doc2Vec vector with its DistilBERT vector into one feature vector
import numpy as np
combined_embeddings = [
    np.concatenate(vector_pair)
    for vector_pair in zip(doc2vec_embeddings, distilbert_embeddings)
]

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report

# Step 10: Split data into training (60%), validation (20%), and testing (20%) sets
# The split is done on the raw 'content' text and the integer 'labels' column.
# random_state=42 is passed to both calls so the partitions are reproducible;
# no `stratify` argument is given, so class proportions may differ between splits.
X_train, X_temp, y_train, y_temp = train_test_split(df['content'], df['labels'], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 50% of 40% = 20%

# Load the fine-tuned tokenizer and model from Hugging Face
# NOTE: this rebinds the global `tokenizer`, replacing the 'distilbert-base-uncased'
# tokenizer loaded earlier in this file (the one get_distilbert_embeddings reads).
tokenizer = DistilBertTokenizer.from_pretrained('RafidMehda/app_review_model')
model = DistilBertForSequenceClassification.from_pretrained('RafidMehda/app_review_model')

# Ensure the model is on GPU if available
# `device` is also used below when moving the tokenized inputs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Ensure the model is in evaluation mode
# (the model is only used for inference here; no training step appears in the code shown)
model.eval()

# Batch size shared by all three loaders (kept small to limit memory per forward pass)
batch_size = 4  # Smaller batch size for efficiency

# Each split goes through the same four steps:
#   1. tokenize the texts (padded, truncated to at most 128 tokens),
#   2. move every resulting tensor to the selected device (GPU/CPU),
#   3. wrap input_ids + attention_mask in a TensorDataset,
#   4. expose the dataset through a DataLoader for batched inference.

# Training split
train_inputs = tokenizer(X_train.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=128)
train_inputs = {name: tensor.to(device) for name, tensor in train_inputs.items()}
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'])
train_loader = DataLoader(train_dataset, batch_size=batch_size)

# Validation split
val_inputs = tokenizer(X_val.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=128)
val_inputs = {name: tensor.to(device) for name, tensor in val_inputs.items()}
val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'])
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Test split
test_inputs = tokenizer(X_test.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=128)
test_inputs = {name: tensor.to(device) for name, tensor in test_inputs.items()}
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Import tqdm for progress bar
from tqdm import tqdm

# Optimized helper function for prediction
def get_predictions(loader):
    """Run the classifier over every batch in ``loader`` and collect predicted classes.

    Uses the module-level ``model`` and ``device``. Gradients are disabled and
    the model is switched to evaluation mode before iterating.

    Args:
        loader: Iterable yielding ``(input_ids, attention_mask)`` tensor pairs,
            such as a DataLoader built over a two-tensor TensorDataset.

    Returns:
        A 1-D CPU tensor with the argmax class index for each example, in the
        order the loader produced them. An empty int64 tensor is returned when
        the loader yields no batches.
    """
    predicted_classes = []

    # Ensure the model is in evaluation mode and not computing gradients
    model.eval()

    # Use tqdm to show a progress bar
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader, desc="Processing Batches"):
            # Move inputs to the correct device (CPU/GPU)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Perform model inference
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # The predicted class is the index of the largest logit per example
            preds = torch.argmax(outputs.logits, dim=1)

            # Accumulate predictions on the CPU so results do not pile up on the GPU
            predicted_classes.append(preds.cpu())

    # torch.cat raises on an empty list, so handle an empty loader explicitly
    if not predicted_classes:
        return torch.empty(0, dtype=torch.long)

    # Concatenate all the predicted classes into a single tensor
    return torch.cat(predicted_classes)

# Get predictions for all sets
# No training happens in the code shown here: the "training" set is simply the
# 60% partition, scored with the already fine-tuned model like the other two.
# NOTE(review): if 'RafidMehda/app_review_model' was fine-tuned on rows from this
# CSV, the scores below overstate how well the model generalizes - confirm.
train_preds = get_predictions(train_loader).cpu().numpy()
val_preds = get_predictions(val_loader).cpu().numpy()
test_preds = get_predictions(test_loader).cpu().numpy()

# Evaluate accuracy and classification report for all sets
# `labels=[0, 1]` pins the class order to match `target_names`; without it,
# classification_report raises if a split (or its predictions) contains only
# one of the two classes, because the names no longer line up with the labels.
print("Training Set Classification Report:\n")
print(classification_report(y_train, train_preds, labels=[0, 1], target_names=['Non-Functional', 'Functional']))

print("Validation Set Classification Report:\n")
print(classification_report(y_val, val_preds, labels=[0, 1], target_names=['Non-Functional', 'Functional']))

print("Test Set Classification Report:\n")
print(classification_report(y_test, test_preds, labels=[0, 1], target_names=['Non-Functional', 'Functional']))

# Print accuracy scores for each set
train_accuracy = accuracy_score(y_train, train_preds)
val_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")




Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews (2).csv
  RequirementType  labels
0               F       1
1              NF       0
2               F       1
3              NF       0
4              NF       0


Processing Batches: 100%|██████████| 1875/1875 [28:22<00:00,  1.10it/s]
Processing Batches: 100%|██████████| 625/625 [09:27<00:00,  1.10it/s]
Processing Batches: 100%|██████████| 625/625 [09:25<00:00,  1.11it/s]

Training Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00      4166
    Functional       1.00      1.00      1.00      3331

      accuracy                           1.00      7497
     macro avg       1.00      1.00      1.00      7497
  weighted avg       1.00      1.00      1.00      7497

Validation Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00      1411
    Functional       0.99      1.00      1.00      1088

      accuracy                           1.00      2499
     macro avg       1.00      1.00      1.00      2499
  weighted avg       1.00      1.00      1.00      2499

Test Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      1.00      1.00      1366
    Functional       1.00      1.00      1.00      1133

      accuracy                           1.


