In [1]:
# Install the libraries this notebook depends on: Hugging Face `datasets` and
# `transformers`, plus scikit-learn. No versions are pinned here, so pip
# installs whatever it resolves at run time.
!pip install datasets transformers scikit-learn

import inspect

import numpy as np
import pandas as pd
from datasets import Dataset
from google.colab import files
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig
from transformers import set_seed

# Upload the CSV file through the Colab file picker; `uploaded` maps each
# uploaded filename to its contents.
uploaded = files.upload()

# Load the first uploaded file into a DataFrame (pandas reads it back from the
# working directory by name).
df = pd.read_csv(next(iter(uploaded)))

# Map labels to integers (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# `Series.map` silently yields NaN for any value missing from the mapping,
# which turns the column into floats and only fails much later, inside
# training. Fail fast here with the offending values instead.
unmapped_rows = df['labels'].isna()
if unmapped_rows.any():
    unexpected_values = sorted(df.loc[unmapped_rows, 'RequirementType'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a RequirementType outside "
        f"{sorted(label_mapping)}: {unexpected_values}"
    )

# Every row is mapped, so the labels can be stored as plain integer class ids.
df['labels'] = df['labels'].astype('int64')

# Split the dataset into train and test (70/30, fixed seed so the split is repeatable)
train_df, test_df = train_test_split(df[['content', 'labels']], test_size=0.3, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizer matching the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'content' field of a batch of examples.

    Args:
        examples: a batch from `Dataset.map(..., batched=True)`; only its
            'content' entry is read.

    Returns:
        The tokenizer's encoding for the batch. Sequences are truncated and
        padded to the tokenizer's maximum length, so every row ends up the
        same size.
    """
    texts = examples['content']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Tokenize both splits in batches (train first, then test)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Seed the Python, NumPy and PyTorch RNGs *before* the model is built. The
# classification head is newly initialised (it is not part of the checkpoint),
# so without this its starting weights differ on every run.
SEED = 42
set_seed(SEED)

# Modify dropout in the DistilBERT configuration (two output classes)
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=2, dropout=0.3, attention_dropout=0.3)

# Load DistilBERT model with modified configuration for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name was later removed. The install above is unpinned,
# so pick whichever keyword the installed version actually accepts.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

# Define training arguments (few epochs and a low learning rate)
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-5,  # Reduced learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,  # Reducing epochs to avoid overfitting
    weight_decay=0.01,
    logging_dir='./logs',
    seed=SEED,
    **{eval_strategy_arg: "epoch"},  # evaluate at the end of every epoch
)

# Initialize the Trainer; the test split doubles as the per-epoch evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Run inference over the held-out split; the result carries the raw logits
predictions = trainer.predict(test_dataset)

# The predicted class for each row is the index of its largest logit
preds = np.argmax(predictions.predictions, axis=-1)

# Ground truth comes straight from the test DataFrame. This relies on
# test_dataset keeping the same row order as test_df.
true_labels = test_df['labels']

# Display names indexed by class id (0 = Non-Functional, 1 = Functional)
class_names = ['Non-Functional', 'Functional']

# Overall accuracy, reported as a percentage
accuracy = accuracy_score(true_labels, preds)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Support-weighted precision, recall and F1, computed in that order
precision, recall, f1 = (
    score_fn(true_labels, preds, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Per-class breakdown for a more detailed view
print("Classification Report:\n")
print(classification_report(true_labels, preds, target_names=class_names, zero_division=1))


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

Saving final_corrected_fine_labeled_reviews.csv to final_corrected_fine_labeled_reviews.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/8746 [00:00<?, ? examples/s]

Map:   0%|          | 0/3749 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2955,0.272736
2,0.1589,0.277887


Model Accuracy: 93.17%
Precision: 0.94
Recall: 0.93
F1-Score: 0.93
Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.99      0.88      0.93      2081
    Functional       0.87      0.99      0.93      1668

      accuracy                           0.93      3749
     macro avg       0.93      0.94      0.93      3749
  weighted avg       0.94      0.93      0.93      3749



In [3]:
# Save the fine-tuned model
trainer.save_model('./fined-distilBERT')

# Save the tokenizer files
tokenizer.save_pretrained('./fined-distilBERT')

!pip install huggingface_hub

from huggingface_hub import notebook_login

notebook_login()

from huggingface_hub import HfApi

# Upload the entire directory to Hugging Face
api = HfApi()
api.upload_folder(
    folder_path='./fined-distilBERT',  # Path to the folder with the model and tokenizer files
    repo_id='RafidMehda/fined-distilBERT',  # Your model repository on Hugging Face
    repo_type='model'
)



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RafidMehda/fined-distilBERT/commit/adde1e2af4bf4ec5af50b5423964587beca48d6e', commit_message='Upload folder using huggingface_hub', commit_description='', oid='adde1e2af4bf4ec5af50b5423964587beca48d6e', pr_url=None, pr_revision=None, pr_num=None)