In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
from gensim.models import Doc2Vec
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from huggingface_hub import hf_hub_download
from tqdm import tqdm
from google.colab import files

# Upload the CSV file
uploaded = files.upload()

# Load the dataset into a DataFrame
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Check if the 'labels' column was created correctly
print(df[['RequirementType', 'labels']].head())

# Step 5: Download and load the Doc2Vec model from Hugging Face
model_path = hf_hub_download(repo_id="RafidMehda/doc2vec_model", filename="doc2vec_model")
doc2vec_model = Doc2Vec.load(model_path)

# Extract Doc2Vec embeddings for each document in the dataset
def get_doc2vec_embeddings(index):
    doc2vec_emb = doc2vec_model.dv[str(index)]
    return torch.tensor(doc2vec_emb).numpy()

doc2vec_embeddings = [get_doc2vec_embeddings(i) for i in range(len(df))]

# Load tokenizer and model from the fine-tuned Hugging Face model
tokenizer = AutoTokenizer.from_pretrained("RafidMehda/fined-distilBERT")
model = AutoModel.from_pretrained("RafidMehda/fined-distilBERT")

# Function to get embeddings from the fine-tuned model with average pooling
def get_finetuned_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        pooled_embedding = torch.mean(last_hidden_state, dim=1)  # Average pooling
    return pooled_embedding.squeeze().numpy()

# Generate embeddings using the fine-tuned model for the dataset
finetuned_embeddings = [get_finetuned_embeddings(doc) for doc in df['content']]

# Combine Doc2Vec and fine-tuned model embeddings
combined_embeddings = [np.concatenate((doc2vec_emb, finetuned_emb)) for doc2vec_emb, finetuned_emb in zip(doc2vec_embeddings, finetuned_embeddings)]

# Convert to numpy arrays for input
X = np.array(combined_embeddings)
y = df['labels'].values

# Use PCA to reduce the dimensionality (if needed)
pca = PCA(n_components=200)  # Reduce to 200 dimensions
X_reduced = pca.fit_transform(X)

# Convert the reduced embeddings to torch tensors
X_tensor = torch.tensor(X_reduced).float()
y_tensor = torch.tensor(y).long()

# Split data into training, validation, and test sets (70% train, 15% val, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_reduced, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

# Define a neural network classifier with reduced units and increased dropout
class CombinedEmbeddingClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super(CombinedEmbeddingClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)  # Reduced from 512 to 256
        self.dropout1 = nn.Dropout(0.5)  # Increased dropout to 0.5
        self.fc2 = nn.Linear(256, 128)  # Reduced from 256 to 128
        self.dropout2 = nn.Dropout(0.5)  # Increased dropout to 0.5
        self.fc3 = nn.Linear(128, num_labels)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Instantiate the classifier model
input_dim = X_reduced.shape[1]  # The size of the combined embeddings (after PCA)
num_labels = 2  # We have two labels: Functional and Non-Functional
model = CombinedEmbeddingClassifier(input_dim=input_dim, num_labels=num_labels)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)  # AdamW optimizer with weight decay
loss_fn = nn.CrossEntropyLoss()

# Create PyTorch datasets and dataloaders
train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).long())
val_dataset = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).long())
test_dataset = TensorDataset(torch.tensor(X_test).float(), torch.tensor(y_test).long())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Training function
def train_model(epochs=3):  # Reduced epochs to 3 instead of 5
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch + 1}/{epochs} | Loss: {total_loss/len(train_loader):.4f}')

# Train the classifier
train_model(epochs=3)  # Train for 3 epochs instead of 5

# Evaluation function
def evaluate_model(dataloader, set_name):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"{set_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{set_name} Classification Report:\n")
    print(classification_report(all_labels, all_preds, target_names=['Non-Functional', 'Functional']))
    return accuracy

# Evaluate on training, validation, and test sets
print("Training Set Classification Report:")
train_accuracy = evaluate_model(train_loader, "Training")

print("\nValidation Set Classification Report:")
val_accuracy = evaluate_model(val_loader, "Validation")

print("\nTest Set Classification Report:")
test_accuracy = evaluate_model(test_loader, "Test")

# Print final accuracies
print(f"\nTraining Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews (3).csv
  RequirementType  labels
0               F       1
1              NF       0
2               F       1
3              NF       0
4              NF       0


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 274/274 [00:00<00:00, 310.28it/s]


Epoch 1/3 | Loss: 0.3294


100%|██████████| 274/274 [00:01<00:00, 244.61it/s]


Epoch 2/3 | Loss: 0.1125


100%|██████████| 274/274 [00:01<00:00, 222.06it/s]


Epoch 3/3 | Loss: 0.0925
Training Set Classification Report:
Training Accuracy: 97.21%
Training Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.98      0.97      0.97      4862
    Functional       0.96      0.97      0.97      3884

      accuracy                           0.97      8746
     macro avg       0.97      0.97      0.97      8746
  weighted avg       0.97      0.97      0.97      8746


Validation Set Classification Report:
Validation Accuracy: 96.42%
Validation Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.98      0.96      0.97      1045
    Functional       0.95      0.97      0.96       829

      accuracy                           0.96      1874
     macro avg       0.96      0.97      0.96      1874
  weighted avg       0.96      0.96      0.96      1874


Test Set Classification Report:
Test Accuracy: 95.95%
Test Classification Report:

                