In [1]:
# Step 1: Install required libraries
!pip install fasttext transformers torch scikit-learn

import pandas as pd
from google.colab import files

# Step 2: Upload the CSV file
uploaded = files.upload()

# Step 3: Load the dataset into a DataFrame
df = pd.read_csv(next(iter(uploaded)))  # Assumes the first uploaded file is your dataset

# Map 'RequirementType' to 'labels' (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Step 4: Clean text to remove newlines and extra spaces
def clean_text(text):
    # Remove any newline characters
    text = text.replace("\n", " ")
    # Optionally remove excess whitespace
    return ' '.join(text.split())

# Apply cleaning function to the 'content' column
df['cleaned_content'] = df['content'].apply(clean_text)

# Check if the 'labels' column and cleaned content are created correctly
print(df[['RequirementType', 'labels', 'cleaned_content']].head())

# Step 5: Download the FastText model from Hugging Face
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="RafidMehda/fasttext_model", filename="fasttext_model.bin")

# Step 6: Load the fine-tuned FastText model
import fasttext
fasttext_model = fasttext.load_model(model_path)

# Step 7: Generate FastText embeddings for the cleaned dataset
def get_fasttext_embeddings(text):
    return fasttext_model.get_sentence_vector(text)

fasttext_embeddings = [get_fasttext_embeddings(doc) for doc in df['cleaned_content']]

# Step 8: Now merging this with DistilBERT
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_distilbert_embeddings(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Get DistilBERT embeddings
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        pooled_embedding = torch.mean(last_hidden_state, dim=1)  # Average pooling of token embeddings
    return pooled_embedding.squeeze().numpy()

# Generate DistilBERT embeddings for the dataset
distilbert_embeddings = [get_distilbert_embeddings(doc) for doc in df['cleaned_content']]

# Step 9: Combine FastText and DistilBERT embeddings (concatenate)
import numpy as np
combined_embeddings = [np.concatenate((fasttext_emb, distilbert_emb)) for fasttext_emb, distilbert_emb in zip(fasttext_embeddings, distilbert_embeddings)]

# Step 10: Proceed with classification using the combined embeddings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 11: Split data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(combined_embeddings, df['labels'], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 50% of 40% = 20%

# Train a logistic regression classifier on the training set
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Step 12: Make predictions on the training, validation, and test sets
y_train_pred = classifier.predict(X_train)
y_val_pred = classifier.predict(X_val)
y_test_pred = classifier.predict(X_test)

# Step 13: Evaluate accuracy and classification report on all sets
print("Training Set Classification Report:\n")
print(classification_report(y_train, y_train_pred, target_names=['Non-Functional', 'Functional']))

print("Validation Set Classification Report:\n")
print(classification_report(y_val, y_val_pred, target_names=['Non-Functional', 'Functional']))

print("Test Set Classification Report:\n")
print(classification_report(y_test, y_test_pred, target_names=['Non-Functional', 'Functional']))

# Optional: Print accuracies for each set
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296186 sha256=c86c2f90fe49fd70063142e7d9c8feac0ed42c8671cd3463616970685c3584ca
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv
  RequirementType  labels                                    cleaned_content
0               F       1                      I cannot open the app anymore
1              NF       0  I have been begging for a refund from this app...
2               F       1  Very costly for the premium version (approx In...
3              NF       0  Used to keep me organized, but all the 2020 UP...
4              NF       0                                Dan Birthday Oct 28


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


fasttext_model.bin:   0%|          | 0.00/805M [00:00<?, ?B/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.96      0.98      0.97      4166
    Functional       0.97      0.94      0.96      3331

      accuracy                           0.96      7497
     macro avg       0.96      0.96      0.96      7497
  weighted avg       0.96      0.96      0.96      7497

Validation Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.93      0.94      0.94      1411
    Functional       0.93      0.91      0.92      1088

      accuracy                           0.93      2499
     macro avg       0.93      0.93      0.93      2499
  weighted avg       0.93      0.93      0.93      2499

Test Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.92      0.94      0.93      1366
    Functional       0.92      0.91      0.91      1133

      accuracy                           0.