In [1]:
# Step 1: Install necessary libraries
!pip install gensim scikit-learn huggingface_hub

import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from google.colab import files

# Step 2: Upload the CSV file in Google Colab
uploaded = files.upload()  # Opens a file dialog for file upload

# Step 3: Load the dataset into a DataFrame
df = pd.read_csv(next(iter(uploaded)))  # Load the uploaded file into a DataFrame

# Step 4: Download NLTK resources
nltk.download('punkt')

# Step 5: Map labels to integers (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Step 6: Prepare tagged documents for Doc2Vec
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i, doc in enumerate(df['content'])]

from gensim.models import Doc2Vec

# Initialize the Doc2Vec model
model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=20)

# Build the vocabulary from the tagged documents
model.build_vocab(tagged_data)

# Train the model
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Extract document vectors
doc_vectors = [model.dv[str(i)] for i in range(len(tagged_data))]

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 7: Split data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(doc_vectors, df['labels'], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 50% of 40% = 20%

# Step 8: Train a logistic regression classifier on the training set
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Step 9: Make predictions for all sets
y_train_pred = classifier.predict(X_train)
y_val_pred = classifier.predict(X_val)
y_test_pred = classifier.predict(X_test)

# Step 10: Evaluate and print classification report for all sets
print("Training Set Classification Report:\n")
print(classification_report(y_train, y_train_pred, target_names=['Non-Functional', 'Functional']))

print("Validation Set Classification Report:\n")
print(classification_report(y_val, y_val_pred, target_names=['Non-Functional', 'Functional']))

print("Test Set Classification Report:\n")
print(classification_report(y_test, y_test_pred, target_names=['Non-Functional', 'Functional']))

# Optional: Print accuracy for each set
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")






Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Training Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.74      0.83      0.78      4166
    Functional       0.75      0.64      0.69      3331

      accuracy                           0.74      7497
     macro avg       0.74      0.73      0.74      7497
  weighted avg       0.74      0.74      0.74      7497

Validation Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.74      0.82      0.78      1411
    Functional       0.73      0.62      0.67      1088

      accuracy                           0.73      2499
     macro avg       0.73      0.72      0.72      2499
  weighted avg       0.73      0.73      0.73      2499

Test Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.74      0.82      0.77      1366
    Functional       0.74      0.65      0.69      1133

      accuracy                           0.