In [1]:
# Install necessary libraries
!pip install datasets transformers scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from google.colab import files
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Upload the CSV file
uploaded = files.upload()

# Load the uploaded file into a DataFrame
df = pd.read_csv(next(iter(uploaded)))

# Map labels to integers (Functional: 1, Non-Functional: 0)
label_mapping = {'F': 1, 'NF': 0}
df['labels'] = df['RequirementType'].map(label_mapping)

# Split the dataset into training, validation, and testing sets (60-20-20 split)
train_df, temp_df = train_test_split(df[['content', 'labels']], test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Load pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Modify dropout in the DistilBERT configuration
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=2, dropout=0.3, attention_dropout=0.3)

# Load DistilBERT model with modified configuration for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()

# Evaluation function for custom outputs
def evaluate_and_print_results(dataset, dataset_name):
    predictions = trainer.predict(dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = dataset['labels']

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    print(f"{dataset_name} Set Classification Report:\n")
    print(classification_report(labels, preds, target_names=['Non-Functional', 'Functional'], zero_division=1))
    print(f"{dataset_name} Set Accuracy: {accuracy * 100:.2f}%\n")

# Evaluate on Training Set
evaluate_and_print_results(train_dataset, "Training")

# Evaluate on Validation Set
evaluate_and_print_results(val_dataset, "Validation")

# Evaluate on Test Set
evaluate_and_print_results(test_dataset, "Test")


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m471.6/471.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

Saving balanced_large_dataset.csv to balanced_large_dataset.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/1827 [00:00<?, ? examples/s]

Map:   0%|          | 0/609 [00:00<?, ? examples/s]

Map:   0%|          | 0/609 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.106645
2,No log,0.077957


Training Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.99      0.95      0.97       897
    Functional       0.95      0.99      0.97       930

      accuracy                           0.97      1827
     macro avg       0.97      0.97      0.97      1827
  weighted avg       0.97      0.97      0.97      1827

Training Set Accuracy: 96.77%



Validation Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       1.00      0.96      0.98       293
    Functional       0.96      1.00      0.98       316

      accuracy                           0.98       609
     macro avg       0.98      0.98      0.98       609
  weighted avg       0.98      0.98      0.98       609

Validation Set Accuracy: 97.87%



Test Set Classification Report:

                precision    recall  f1-score   support

Non-Functional       0.99      0.93      0.96       305
    Functional       0.94      0.99      0.96       304

      accuracy                           0.96       609
     macro avg       0.97      0.96      0.96       609
  weighted avg       0.97      0.96      0.96       609

Test Set Accuracy: 96.39%



In [3]:
# Save the fine-tuned model
trainer.save_model('./paper_fined-distilBERT')

# Save the tokenizer files
tokenizer.save_pretrained('./paper_fined-distilBERT')

!pip install huggingface_hub

from huggingface_hub import notebook_login

notebook_login()

from huggingface_hub import HfApi

# Upload the entire directory to Hugging Face
api = HfApi()
api.upload_folder(
    folder_path='./paper_fined-distilBERT',  # Path to the folder with the model and tokenizer files
    repo_id='RafidMehda/paper_fined-distilBERT',  # Your model repository on Hugging Face
    repo_type='model'
)



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/RafidMehda/paper_fined-distilBERT/commit/0b4c7bca094e201dd0dec5ba0a5532aafcc8e85a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0b4c7bca094e201dd0dec5ba0a5532aafcc8e85a', pr_url=None, pr_revision=None, pr_num=None)