In [1]:
!pip install gensim




In [13]:
import io

import nltk
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from google.colab import files
from nltk.tokenize import word_tokenize

# Step 1: Upload the CSV file in Google Colab
# files.upload() opens a file dialog and returns a mapping of filename -> file bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the CSV dataset.")

# Step 2: Load the dataset into a DataFrame
# Parse the uploaded bytes directly instead of re-opening the file by name: Colab
# renames the saved copy when a file with the same name already exists
# (e.g. "name (1).csv"), so reading by name risks loading a stale copy from disk.
# Only the first uploaded file is used.
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

# Step 3: Download NLTK resources needed by word_tokenize
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' rather than 'punkt'.
nltk.download('punkt_tab')

# Step 4: Map labels to integers (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Rows whose RequirementType is not in label_mapping become NaN after .map(), and
# rows without text cannot be tokenized. Drop both *before* tagging so that the
# positional tags created below stay aligned with df['labels'].
usable_rows = df['labels'].notna() & df['content'].notna()
if not usable_rows.all():
    print(f"Dropping {int((~usable_rows).sum())} rows with missing text or unmapped labels")
df = df.loc[usable_rows].reset_index(drop=True)
df['labels'] = df['labels'].astype(int)

# Step 5: Prepare tagged documents for Doc2Vec
# Each document is lower-cased, tokenized, and tagged with its row position as a string.
# str() guards against non-string cells (e.g. purely numeric text parsed as a number).
tagged_data = [
    TaggedDocument(words=word_tokenize(str(doc).lower()), tags=[str(i)])
    for i, doc in enumerate(df['content'])
]


Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews (1).csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
from gensim.models import Doc2Vec

# Hyperparameters for the Doc2Vec model, gathered in one place so they are easy to tune.
doc2vec_params = dict(
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=20,
)

# Create the (still untrained) model.
model = Doc2Vec(**doc2vec_params)

# Scan the tagged corpus once to build the vocabulary.
model.build_vocab(tagged_data)

# Train on the same corpus, using the document count gathered by build_vocab
# and the epoch count configured on the model.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)


In [15]:
# Look up the trained vector of every document, in corpus order.
# Documents were tagged with their position rendered as a string ("0", "1", ...).
document_tags = map(str, range(len(tagged_data)))
doc_vectors = [model.dv[tag] for tag in document_tags]


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the documents for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors,
    df['labels'],
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training vectors (fit() returns the estimator).
classifier = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out documents.
y_pred = classifier.predict(X_test)

# Overall accuracy on the held-out set.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision, recall, F1-score, and support.
# target_names follow the sorted label ids: 0 -> Non-Functional, 1 -> Functional.
report = classification_report(
    y_test,
    y_pred,
    target_names=['Non-Functional', 'Functional'],
)
print("Classification Report:\n")
print(report)


Model Accuracy: 74.27%
Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.74      0.83      0.78      1402
    Functional       0.74      0.64      0.68      1097

      accuracy                           0.74      2499
     macro avg       0.74      0.73      0.73      2499
  weighted avg       0.74      0.74      0.74      2499



In [22]:
# Step 1: Train and Save the Doc2Vec Model
model.save('doc2vec_model')

# Step 2: Install huggingface_hub
!pip install huggingface_hub

# Step 3: Login to Hugging Face
from huggingface_hub import notebook_login
notebook_login()

# Step 4: Upload the model to Hugging Face
from huggingface_hub import HfApi

# Initialize Hugging Face API
api = HfApi()

# Upload the model to your Hugging Face repository
api.upload_folder(
    folder_path='./',  # Folder path where 'doc2vec_model' is located
    repo_id='RafidMehda/doc2vec_model',  # Your Hugging Face repository name
    repo_type='model'  # Specify that it's a model repository
)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


doc2vec_model:   0%|          | 0.00/10.6M [00:00<?, ?B/s]

mnist_test.csv:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

mnist_train_small.csv:   0%|          | 0.00/36.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RafidMehda/doc2vec_model/commit/5ff2f7c8b9bb77f44006627b023d55bd9f22879a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='5ff2f7c8b9bb77f44006627b023d55bd9f22879a', pr_url=None, pr_revision=None, pr_num=None)