In [1]:
# Step 1: Install FastText library
!pip install fasttext scikit-learn

# Step 2: Import necessary libraries
import pandas as pd
import nltk
import fasttext
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

nltk.download('punkt')

# Step 3: Upload the dataset
from google.colab import files
uploaded = files.upload()

# Step 4: Load the dataset
df = pd.read_csv(next(iter(uploaded)))

# Step 5: Preprocess the data (tokenize the text)
def preprocess(text):
    return ' '.join(nltk.word_tokenize(text.lower()))

df['content'] = df['content'].apply(preprocess)

# Step 6: Map labels and prepare data in FastText format
df['fasttext_label'] = '__label__' + df['RequirementType'].astype(str)

# Step 7: Split the dataset into training, validation, and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, valid_df = train_test_split(train_df, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Step 8: Save the splits to text files in FastText format
train_df[['fasttext_label', 'content']].to_csv('train_data.txt', index=False, sep=' ', header=False)
valid_df[['fasttext_label', 'content']].to_csv('valid_data.txt', index=False, sep=' ', header=False)
test_df[['fasttext_label', 'content']].to_csv('test_data.txt', index=False, sep=' ', header=False)

# Step 9: Train the FastText model using the training data
model = fasttext.train_supervised(input="train_data.txt", epoch=25, lr=1.0, wordNgrams=2)

# Step 10: Helper function to convert FastText predictions for sklearn
def get_fasttext_predictions(model, df, label_prefix='__label__'):
    true_labels = df['RequirementType'].values  # Get the true labels (e.g., 'F', 'NF')
    predictions = []
    for text in df['content']:
        pred_label = model.predict(text)[0][0]  # FastText returns a tuple, we get the first label
        predictions.append(pred_label.replace(label_prefix, ''))  # Remove FastText label prefix
    return true_labels, predictions

# Step 11: Evaluate the model on the training set
y_train_true, y_train_pred = get_fasttext_predictions(model, train_df)
print("Classification Report for Training Set:\n")
print(classification_report(y_train_true, y_train_pred, target_names=['NF', 'F'], digits=4))

# Step 12: Evaluate the model on the validation set
y_val_true, y_val_pred = get_fasttext_predictions(model, valid_df)
print("Classification Report for Validation Set:\n")
print(classification_report(y_val_true, y_val_pred, target_names=['NF', 'F'], digits=4))

# Step 13: Evaluate the model on the test set
y_test_true, y_test_pred = get_fasttext_predictions(model, test_df)
print("Classification Report for Test Set:\n")
print(classification_report(y_test_true, y_test_pred, target_names=['NF', 'F'], digits=4))


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m51.2/73.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m977.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296185 sha256=233

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv
Classification Report for Training Set:

              precision    recall  f1-score   support

          NF     0.9985    0.9991    0.9988      3316
           F     0.9993    0.9988    0.9990      4181

    accuracy                         0.9989      7497
   macro avg     0.9989    0.9989    0.9989      7497
weighted avg     0.9989    0.9989    0.9989      7497

Classification Report for Validation Set:

              precision    recall  f1-score   support

          NF     0.9162    0.9693    0.9420      1139
           F     0.9730    0.9257    0.9488      1360

    accuracy                         0.9456      2499
   macro avg     0.9446    0.9475    0.9454      2499
weighted avg     0.9471    0.9456    0.9457      2499

Classification Report for Test Set:

              precision    recall  f1-score   support

          NF     0.8978    0.9772    0.9358      1097
           F     0.9808 