In [None]:
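# Custom single-layer attention classifier trained from scratch
# (only the tokenizer/vocabulary comes from bert-base-uncased; no pretrained BERT weights are loaded)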

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import time

# Aspect names; each one is used below as a case-insensitive substring
# filter on the 'codes' column, yielding one experiment per aspect.
aspects = [
    'Usability',
    'Performance',
    'Bug',
    'Security',
    'Community',
    'Compatibility',
    'Documentation',
    'Legal',
    'Portability',
    'OnlySentiment',
    'Others',
]

# Define the attention module using PyTorch's scaled_dot_product_attention
class ScaledDotProductAttentionModule(nn.Module):
    """Multi-head self-attention built on ``F.scaled_dot_product_attention``.

    A single linear layer produces queries, keys and values, which are split
    into ``num_heads`` heads of size ``embed_dim // num_heads``. The attended
    heads are concatenated and passed through an output projection.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        dropout_p: Dropout probability applied to the attention weights
            while the module is in training mode. Defaults to 0.1.
    """

    def __init__(self, embed_dim, num_heads, dropout_p=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
        self.dropout_p = dropout_p
        # One fused projection yields q, k and v in a single matmul.
        self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, seq, embed_dim).

        Returns:
            A tensor with the same shape as ``x``.
        """
        batch_size, seq_length, _ = x.size()
        qkv = self.qkv_proj(x)
        # (batch, seq, 3 * embed) -> (3, batch, heads, seq, head_dim)
        qkv = qkv.view(batch_size, seq_length, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)

        # The functional API applies dropout whenever dropout_p > 0, regardless
        # of train/eval mode, so it must be switched off explicitly for
        # inference; otherwise evaluation results are stochastic.
        dropout_p = self.dropout_p if self.training else 0.0
        attn_output = F.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=dropout_p, is_causal=False
        )
        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embed_dim)
        output = self.out_proj(attn_output)
        return output

# Define the model
class CustomSequenceClassifier(nn.Module):
    """Embedding -> self-attention -> mean pooling -> linear classifier.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Embedding size, also the attention width.
        num_heads: Number of attention heads passed to the attention module.
        num_labels: Number of output classes. Defaults to 2.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, num_labels=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.attention = ScaledDotProductAttentionModule(embedding_dim, num_heads)
        self.classifier = nn.Linear(embedding_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits of shape (batch, num_labels).

        Args:
            input_ids: Integer token ids of shape (batch, seq).
            attention_mask: Optional (batch, seq) tensor with 1 for real tokens
                and 0 for padding. When given, padding positions are excluded
                from the mean pooling. When omitted, every position is averaged,
                which matches the previous behaviour.
        """
        x = self.embedding(input_ids)
        attention_output = self.attention(x)
        if attention_mask is None:
            pooled_output = attention_output.mean(dim=1)
        else:
            mask = attention_mask.unsqueeze(-1).to(attention_output.dtype)
            # Clamp so an all-padding row divides by 1 instead of 0.
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled_output = (attention_output * mask).sum(dim=1) / token_counts
        logits = self.classifier(pooled_output)
        return logits

# Tokenizer: only the bert-base-uncased vocabulary/tokenization is used here
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Location of the benchmark spreadsheet (adjust for your environment)
file_path = '/content/BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'

# Read the spreadsheet and keep the sentence text, the manual sentiment
# label and the aspect codes
data = pd.read_excel(file_path)
df = data[
    ['sent', 'ManualLabel', 'codes']
]

# One metrics dict per processed aspect is appended to this list
results = []

# Tokenization does not depend on the aspect, so the helper is defined once
# instead of being re-created on every loop iteration.
def tokenize_function(texts):
    """Tokenize a list of sentences, padded/truncated to 128 tokens, as PyTorch tensors."""
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer output tensors with integer labels."""

    def __init__(self, encodings, labels):
        self.encodings = {key: val.clone().detach() for key, val in encodings.items()}
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


# The device is the same for every aspect
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over each aspect: train a fresh model on that aspect's sentences and
# evaluate it on a held-out 40% split
for aspect in aspects:
    print(f"\nProcessing aspect: {aspect}")
    # Filter the dataset for the current aspect (case-insensitive substring match)
    df_filtered = df[df['codes'].str.contains(aspect, case=False, na=False)].copy()

    # Check if the filtered dataset is empty
    if df_filtered.empty:
        print(f"No data found for aspect: {aspect}")
        continue

    # Map labels: 'p' to 1 and others to 0
    df_filtered['ManualLabel'] = (df_filtered['ManualLabel'] == 'p').astype(int)

    # Split the dataset
    train_df, test_df = train_test_split(df_filtered, test_size=0.4, random_state=42)

    train_texts = train_df['sent'].tolist()
    train_labels = train_df['ManualLabel'].tolist()
    test_texts = test_df['sent'].tolist()
    test_labels = test_df['ManualLabel'].tolist()

    train_encodings = tokenize_function(train_texts)
    test_encodings = tokenize_function(test_texts)

    # Create data loaders
    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Initialize the model and optimizer (a new, untrained model per aspect)
    vocab_size = tokenizer.vocab_size
    embedding_dim = 768
    num_heads = 12
    num_labels = 2

    model = CustomSequenceClassifier(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        num_heads=num_heads,
        num_labels=num_labels
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    model.to(device)

    # Training loop
    model.train()
    num_epochs = 5

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids)
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation loop
    model.eval()
    predictions, true_labels = [], []
    total_inference_time = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # CUDA kernels are launched asynchronously. Without synchronizing
            # before and after the forward pass the timer would mostly measure
            # kernel launch overhead instead of the actual computation.
            if device.type == 'cuda':
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            logits = model(input_ids)
            if device.type == 'cuda':
                torch.cuda.synchronize()
            inference_time = time.perf_counter() - start_time

            preds = logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

            total_inference_time += inference_time
            total_samples += input_ids.size(0)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    average_inference_time = total_inference_time / total_samples if total_samples > 0 else 0

    # Print results
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score (Micro): {f1_micro:.4f}')
    print(f'F1 Score (Macro): {f1_macro:.4f}')
    print(f'F1 Score (Weighted): {f1_weighted:.4f}')
    print(f'Total Inference Time: {total_inference_time:.6f} seconds')
    print(f'Total Samples: {total_samples}')
    print(f'Average Inference Time per Sample: {average_inference_time:.6f} seconds')

    # Collect results
    results.append({
        'aspect': aspect,
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'total_inference_time': total_inference_time,
        'total_samples': total_samples,
        'average_inference_time': average_inference_time
    })

# After processing all aspects, display the results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# --- Model Score Calculation --- #
# score = 0.75 * avg_f1 + 0.25 * (max_avg_runtime - measured_avg_runtime) / max_avg_runtime

if results_df.empty:
    # Every aspect was skipped, so there is nothing to average.
    model_score = float('nan')
else:
    # Compute Average F1 Score (avg. F1)
    avg_f1 = results_df['f1_micro'].mean()

    # Compute Measured Average Runtime (mean of the per-sample averages)
    measured_avg_runtime = results_df['average_inference_time'].mean()

    # Compute Maximum Average Runtime. This must come from the per-sample
    # average column so both runtime terms share the same unit (seconds per
    # sample); using the total time would pin the runtime term at ~1.
    max_avg_runtime = results_df['average_inference_time'].max()

    # Guard the normalization against a zero denominator
    if max_avg_runtime > 0:
        runtime_term = (max_avg_runtime - measured_avg_runtime) / max_avg_runtime
    else:
        runtime_term = 0.0

    # Compute the Model Score
    model_score = (avg_f1) * 0.75 + runtime_term * 0.25

print(f"\nModel Score: {model_score:.4f}")





Processing aspect: Usability
Epoch 1/5, Loss: 0.6255
Epoch 2/5, Loss: 0.6136
Epoch 3/5, Loss: 0.5630
Epoch 4/5, Loss: 0.4950
Epoch 5/5, Loss: 0.3917
Aspect: Usability
Accuracy: 0.6730
F1 Score (Micro): 0.6730
F1 Score (Macro): 0.5797
F1 Score (Weighted): 0.6489
Total Inference Time: 0.040590 seconds
Total Samples: 575
Average Inference Time per Sample: 0.000071 seconds

Processing aspect: Performance
Epoch 1/5, Loss: 0.6809
Epoch 2/5, Loss: 0.6677
Epoch 3/5, Loss: 0.6543
Epoch 4/5, Loss: 0.5946
Epoch 5/5, Loss: 0.5176
Aspect: Performance
Accuracy: 0.6286
F1 Score (Micro): 0.6286
F1 Score (Macro): 0.5304
F1 Score (Weighted): 0.5918
Total Inference Time: 0.007365 seconds
Total Samples: 140
Average Inference Time per Sample: 0.000053 seconds

Processing aspect: Bug
Epoch 1/5, Loss: 0.4758
Epoch 2/5, Loss: 0.4239
Epoch 3/5, Loss: 0.4182
Epoch 4/5, Loss: 0.4146
Epoch 5/5, Loss: 0.4049
Aspect: Bug
Accuracy: 0.8289
F1 Score (Micro): 0.8289
F1 Score (Macro): 0.4532
F1 Score (Weighted): 0.7514

In [None]:
# Tiny Bert

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import time

# Aspect names; each one is used below as a case-insensitive substring
# filter on the 'codes' column, yielding one experiment per aspect.
aspects = [
    'Usability',
    'Performance',
    'Bug',
    'Security',
    'Community',
    'Compatibility',
    'Documentation',
    'Legal',
    'Portability',
    'OnlySentiment',
    'Others',
]

# Tokenizer matching the TinyBERT checkpoint loaded later in this cell
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Location of the benchmark spreadsheet (adjust for your environment)
file_path = '/content/BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'

# Read the spreadsheet and keep the sentence text, the manual sentiment
# label and the aspect codes
data = pd.read_excel(file_path)
df = data[
    ['sent', 'ManualLabel', 'codes']
]

# One metrics dict per processed aspect is appended to this list
results = []

# Tokenization does not depend on the aspect, so the helper is defined once
# instead of being re-created on every loop iteration.
def tokenize_function(texts):
    """Tokenize a list of sentences, padded/truncated to 128 tokens, as PyTorch tensors."""
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer output tensors with integer labels."""

    def __init__(self, encodings, labels):
        self.encodings = {key: val.clone().detach() for key, val in encodings.items()}
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


# The device is the same for every aspect
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over each aspect: fine-tune a fresh TinyBERT on that aspect's
# sentences and evaluate it on a held-out 40% split
for aspect in aspects:
    print(f"\nProcessing aspect: {aspect}")
    # Filter the dataset for the current aspect (case-insensitive substring match)
    df_filtered = df[df['codes'].str.contains(aspect, case=False, na=False)].copy()

    # Check if the filtered dataset is empty
    if df_filtered.empty:
        print(f"No data found for aspect: {aspect}")
        continue

    # Map labels: 'p' to 1 and others to 0
    df_filtered['ManualLabel'] = (df_filtered['ManualLabel'] == 'p').astype(int)

    # Split the dataset
    train_df, test_df = train_test_split(df_filtered, test_size=0.4, random_state=42)

    train_texts = train_df['sent'].tolist()
    train_labels = train_df['ManualLabel'].tolist()
    test_texts = test_df['sent'].tolist()
    test_labels = test_df['ManualLabel'].tolist()

    train_encodings = tokenize_function(train_texts)
    test_encodings = tokenize_function(test_texts)

    # Create data loaders
    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Initialize the TinyBERT model for sequence classification (reloaded per aspect)
    model = BertForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    model.to(device)

    # Training loop
    model.train()
    num_epochs = 5

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation loop
    model.eval()
    predictions, true_labels = [], []
    total_inference_time = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # CUDA kernels are launched asynchronously. Without synchronizing
            # before and after the forward pass the timer would mostly measure
            # kernel launch overhead instead of the actual computation.
            if device.type == 'cuda':
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(input_ids, attention_mask=attention_mask)
            if device.type == 'cuda':
                torch.cuda.synchronize()
            inference_time = time.perf_counter() - start_time

            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

            total_inference_time += inference_time
            total_samples += input_ids.size(0)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    average_inference_time = total_inference_time / total_samples if total_samples > 0 else 0

    # Print results
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score (Micro): {f1_micro:.4f}')
    print(f'F1 Score (Macro): {f1_macro:.4f}')
    print(f'F1 Score (Weighted): {f1_weighted:.4f}')
    print(f'Total Inference Time: {total_inference_time:.6f} seconds')
    print(f'Total Samples: {total_samples}')
    print(f'Average Inference Time per Sample: {average_inference_time:.6f} seconds')

    # Collect results
    results.append({
        'aspect': aspect,
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'total_inference_time': total_inference_time,
        'total_samples': total_samples,
        'average_inference_time': average_inference_time
    })

# After processing all aspects, display the results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# --- Model Score Calculation --- #
# score = 0.75 * avg_f1 + 0.25 * (max_avg_runtime - measured_avg_runtime) / max_avg_runtime

if results_df.empty:
    # Every aspect was skipped, so there is nothing to average.
    model_score = float('nan')
else:
    # Compute Average F1 Score (avg. F1)
    avg_f1 = results_df['f1_micro'].mean()

    # Compute Measured Average Runtime (mean of the per-sample averages)
    measured_avg_runtime = results_df['average_inference_time'].mean()

    # Compute Maximum Average Runtime. This must come from the per-sample
    # average column so both runtime terms share the same unit (seconds per
    # sample); using the total time would pin the runtime term at ~1.
    max_avg_runtime = results_df['average_inference_time'].max()

    # Guard the normalization against a zero denominator
    if max_avg_runtime > 0:
        runtime_term = (max_avg_runtime - measured_avg_runtime) / max_avg_runtime
    else:
        runtime_term = 0.0

    # Compute the Model Score
    model_score = (avg_f1) * 0.75 + runtime_term * 0.25

print(f"\nModel Score: {model_score:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]




Processing aspect: Usability


pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6243
Epoch 2/5, Loss: 0.5729
Epoch 3/5, Loss: 0.5205
Epoch 4/5, Loss: 0.4701
Epoch 5/5, Loss: 0.4122
Aspect: Usability
Accuracy: 0.6017
F1 Score (Micro): 0.6017
F1 Score (Macro): 0.5947
F1 Score (Weighted): 0.6134
Total Inference Time: 0.343604 seconds
Total Samples: 575
Average Inference Time per Sample: 0.000598 seconds

Processing aspect: Performance


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6801
Epoch 2/5, Loss: 0.6692
Epoch 3/5, Loss: 0.6508
Epoch 4/5, Loss: 0.5481
Epoch 5/5, Loss: 0.5618
Aspect: Performance
Accuracy: 0.5143
F1 Score (Micro): 0.5143
F1 Score (Macro): 0.5134
F1 Score (Weighted): 0.5194
Total Inference Time: 0.082561 seconds
Total Samples: 140
Average Inference Time per Sample: 0.000590 seconds

Processing aspect: Bug


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.5981
Epoch 2/5, Loss: 0.4572
Epoch 3/5, Loss: 0.4232
Epoch 4/5, Loss: 0.4258
Epoch 5/5, Loss: 0.5301
Aspect: Bug
Accuracy: 0.8289
F1 Score (Micro): 0.8289
F1 Score (Macro): 0.4532
F1 Score (Weighted): 0.7514
Total Inference Time: 0.046852 seconds
Total Samples: 76
Average Inference Time per Sample: 0.000616 seconds

Processing aspect: Security


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6009
Epoch 2/5, Loss: 0.5610
Epoch 3/5, Loss: 0.4562
Epoch 4/5, Loss: 0.4483
Epoch 5/5, Loss: 0.4472
Aspect: Security
Accuracy: 0.8182
F1 Score (Micro): 0.8182
F1 Score (Macro): 0.4500
F1 Score (Weighted): 0.7364
Total Inference Time: 0.043205 seconds
Total Samples: 66
Average Inference Time per Sample: 0.000655 seconds

Processing aspect: Community


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6620
Epoch 2/5, Loss: 0.6162
Epoch 3/5, Loss: 0.5731
Epoch 4/5, Loss: 0.5460
Epoch 5/5, Loss: 0.5125
Aspect: Community
Accuracy: 0.6842
F1 Score (Micro): 0.6842
F1 Score (Macro): 0.4747
F1 Score (Weighted): 0.5794
Total Inference Time: 0.022970 seconds
Total Samples: 38
Average Inference Time per Sample: 0.000604 seconds

Processing aspect: Compatibility


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6907
Epoch 2/5, Loss: 0.6842
Epoch 3/5, Loss: 0.6683
Epoch 4/5, Loss: 0.6433
Epoch 5/5, Loss: 0.5664
Aspect: Compatibility
Accuracy: 0.7105
F1 Score (Micro): 0.7105
F1 Score (Macro): 0.6140
F1 Score (Weighted): 0.6851
Total Inference Time: 0.044570 seconds
Total Samples: 38
Average Inference Time per Sample: 0.001173 seconds

Processing aspect: Documentation


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6293
Epoch 2/5, Loss: 0.5943
Epoch 3/5, Loss: 0.5747
Epoch 4/5, Loss: 0.5172
Epoch 5/5, Loss: 0.4585
Aspect: Documentation
Accuracy: 0.6373
F1 Score (Micro): 0.6373
F1 Score (Macro): 0.5163
F1 Score (Weighted): 0.6112
Total Inference Time: 0.038737 seconds
Total Samples: 102
Average Inference Time per Sample: 0.000380 seconds

Processing aspect: Legal


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6841
Epoch 2/5, Loss: 0.6476
Epoch 3/5, Loss: 0.6304
Epoch 4/5, Loss: 0.6122
Epoch 5/5, Loss: 0.6051
Aspect: Legal
Accuracy: 0.5500
F1 Score (Micro): 0.5500
F1 Score (Macro): 0.3548
F1 Score (Weighted): 0.3903
Total Inference Time: 0.008961 seconds
Total Samples: 20
Average Inference Time per Sample: 0.000448 seconds

Processing aspect: Portability


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6797
Epoch 2/5, Loss: 0.6598
Epoch 3/5, Loss: 0.5976
Epoch 4/5, Loss: 0.6162
Epoch 5/5, Loss: 0.6088
Aspect: Portability
Accuracy: 0.7143
F1 Score (Micro): 0.7143
F1 Score (Macro): 0.4167
F1 Score (Weighted): 0.5952
Total Inference Time: 0.014820 seconds
Total Samples: 28
Average Inference Time per Sample: 0.000529 seconds

Processing aspect: OnlySentiment


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6759
Epoch 2/5, Loss: 0.6527
Epoch 3/5, Loss: 0.6204
Epoch 4/5, Loss: 0.5656
Epoch 5/5, Loss: 0.4348
Aspect: OnlySentiment
Accuracy: 0.6929
F1 Score (Micro): 0.6929
F1 Score (Macro): 0.6500
F1 Score (Weighted): 0.6762
Total Inference Time: 0.058072 seconds
Total Samples: 140
Average Inference Time per Sample: 0.000415 seconds

Processing aspect: Others


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.3665
Epoch 2/5, Loss: 0.3306
Epoch 3/5, Loss: 0.3285
Epoch 4/5, Loss: 0.3308
Epoch 5/5, Loss: 0.3342
Aspect: Others
Accuracy: 0.8868
F1 Score (Micro): 0.8868
F1 Score (Macro): 0.4700
F1 Score (Weighted): 0.8335
Total Inference Time: 0.263631 seconds
Total Samples: 680
Average Inference Time per Sample: 0.000388 seconds

Final Results:
           aspect  accuracy  f1_micro  f1_macro  f1_weighted  \
0       Usability  0.601739  0.601739  0.594658     0.613386   
1     Performance  0.514286  0.514286  0.513392     0.519350   
2             Bug  0.828947  0.828947  0.453237     0.751420   
3        Security  0.818182  0.818182  0.450000     0.736364   
4       Community  0.684211  0.684211  0.474654     0.579432   
5   Compatibility  0.710526  0.710526  0.614035     0.685134   
6   Documentation  0.637255  0.637255  0.516340     0.611175   
7           Legal  0.550000  0.550000  0.354839     0.390323   
8     Portability  0.714286  0.714286  0.416667     0.595238   
9   

In [None]:
# Roberta Base
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import time

# Aspect names; each one is used below as a case-insensitive substring
# filter on the 'codes' column, yielding one experiment per aspect.
aspects = [
    'Usability',
    'Performance',
    'Bug',
    'Security',
    'Community',
    'Compatibility',
    'Documentation',
    'Legal',
    'Portability',
    'OnlySentiment',
    'Others',
]

# Tokenizer matching the roberta-base checkpoint loaded later in this cell
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Location of the benchmark spreadsheet (adjust for your environment)
file_path = '/content/BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'

# Read the spreadsheet and keep the sentence text, the manual sentiment
# label and the aspect codes
data = pd.read_excel(file_path)
df = data[
    ['sent', 'ManualLabel', 'codes']
]

# One metrics dict per processed aspect is appended to this list
results = []

# Tokenization does not depend on the aspect, so the helper is defined once
# instead of being re-created on every loop iteration.
def tokenize_function(texts):
    """Tokenize a list of sentences, padded/truncated to 128 tokens, as PyTorch tensors."""
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer output tensors with integer labels."""

    def __init__(self, encodings, labels):
        self.encodings = {key: val.clone().detach() for key, val in encodings.items()}
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


# The device is the same for every aspect
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over each aspect: fine-tune a fresh RoBERTa on that aspect's
# sentences and evaluate it on a held-out 40% split
for aspect in aspects:
    print(f"\nProcessing aspect: {aspect}")
    # Filter the dataset for the current aspect (case-insensitive substring match)
    df_filtered = df[df['codes'].str.contains(aspect, case=False, na=False)].copy()

    # Check if the filtered dataset is empty
    if df_filtered.empty:
        print(f"No data found for aspect: {aspect}")
        continue

    # Map labels: 'p' to 1 and others to 0
    df_filtered['ManualLabel'] = (df_filtered['ManualLabel'] == 'p').astype(int)

    # Split the dataset
    train_df, test_df = train_test_split(df_filtered, test_size=0.4, random_state=42)

    train_texts = train_df['sent'].tolist()
    train_labels = train_df['ManualLabel'].tolist()
    test_texts = test_df['sent'].tolist()
    test_labels = test_df['ManualLabel'].tolist()

    train_encodings = tokenize_function(train_texts)
    test_encodings = tokenize_function(test_texts)

    # Create data loaders
    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Initialize the RoBERTa model for sequence classification (reloaded per aspect)
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    model.to(device)

    # Training loop
    model.train()
    num_epochs = 5

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation loop
    model.eval()
    predictions, true_labels = [], []
    total_inference_time = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # CUDA kernels are launched asynchronously. Without synchronizing
            # before and after the forward pass the timer would mostly measure
            # kernel launch overhead instead of the actual computation.
            if device.type == 'cuda':
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(input_ids, attention_mask=attention_mask)
            if device.type == 'cuda':
                torch.cuda.synchronize()
            inference_time = time.perf_counter() - start_time

            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

            total_inference_time += inference_time
            total_samples += input_ids.size(0)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    average_inference_time = total_inference_time / total_samples if total_samples > 0 else 0

    # Print results
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score (Micro): {f1_micro:.4f}')
    print(f'F1 Score (Macro): {f1_macro:.4f}')
    print(f'F1 Score (Weighted): {f1_weighted:.4f}')
    print(f'Total Inference Time: {total_inference_time:.6f} seconds')
    print(f'Total Samples: {total_samples}')
    print(f'Average Inference Time per Sample: {average_inference_time:.6f} seconds')

    # Collect results
    results.append({
        'aspect': aspect,
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'total_inference_time': total_inference_time,
        'total_samples': total_samples,
        'average_inference_time': average_inference_time
    })

# After processing all aspects, display the results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# --- Model Score Calculation --- #
# score = 0.75 * avg_f1 + 0.25 * (max_avg_runtime - measured_avg_runtime) / max_avg_runtime

if results_df.empty:
    # Every aspect was skipped, so there is nothing to average.
    model_score = float('nan')
else:
    # Compute Average F1 Score (avg. F1)
    avg_f1 = results_df['f1_micro'].mean()

    # Compute Measured Average Runtime (mean of the per-sample averages)
    measured_avg_runtime = results_df['average_inference_time'].mean()

    # Compute Maximum Average Runtime. This must come from the per-sample
    # average column so both runtime terms share the same unit (seconds per
    # sample); using the total time would pin the runtime term at ~1.
    max_avg_runtime = results_df['average_inference_time'].max()

    # Guard the normalization against a zero denominator
    if max_avg_runtime > 0:
        runtime_term = (max_avg_runtime - measured_avg_runtime) / max_avg_runtime
    else:
        runtime_term = 0.0

    # Compute the Model Score
    model_score = (avg_f1) * 0.75 + runtime_term * 0.25

print(f"\nModel Score: {model_score:.4f}")





Processing aspect: Usability


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6419
Epoch 2/5, Loss: 0.6237
Epoch 3/5, Loss: 0.6329
Epoch 4/5, Loss: 0.6291
Epoch 5/5, Loss: 0.6265
Aspect: Usability
Accuracy: 0.6748
F1 Score (Micro): 0.6748
F1 Score (Macro): 0.4029
F1 Score (Weighted): 0.5438
Total Inference Time: 0.755263 seconds
Total Samples: 575
Average Inference Time per Sample: 0.001314 seconds

Processing aspect: Performance


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6888
Epoch 2/5, Loss: 0.6752
Epoch 3/5, Loss: 0.6795
Epoch 4/5, Loss: 0.6827
Epoch 5/5, Loss: 0.6520
Aspect: Performance
Accuracy: 0.6429
F1 Score (Micro): 0.6429
F1 Score (Macro): 0.3913
F1 Score (Weighted): 0.5031
Total Inference Time: 0.195895 seconds
Total Samples: 140
Average Inference Time per Sample: 0.001399 seconds

Processing aspect: Bug


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.5177
Epoch 2/5, Loss: 0.5736
Epoch 3/5, Loss: 0.4417
Epoch 4/5, Loss: 0.4487
Epoch 5/5, Loss: 0.4094
Aspect: Bug
Accuracy: 0.8289
F1 Score (Micro): 0.8289
F1 Score (Macro): 0.4532
F1 Score (Weighted): 0.7514
Total Inference Time: 0.154245 seconds
Total Samples: 76
Average Inference Time per Sample: 0.002030 seconds

Processing aspect: Security


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.5378
Epoch 2/5, Loss: 0.4698
Epoch 3/5, Loss: 0.4457
Epoch 4/5, Loss: 0.4220
Epoch 5/5, Loss: 0.3392
Aspect: Security
Accuracy: 0.8182
F1 Score (Micro): 0.8182
F1 Score (Macro): 0.4500
F1 Score (Weighted): 0.7364
Total Inference Time: 0.123399 seconds
Total Samples: 66
Average Inference Time per Sample: 0.001870 seconds

Processing aspect: Community


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6534
Epoch 2/5, Loss: 0.5565
Epoch 3/5, Loss: 0.5312
Epoch 4/5, Loss: 0.5110
Epoch 5/5, Loss: 0.2851
Aspect: Community
Accuracy: 0.6579
F1 Score (Micro): 0.6579
F1 Score (Macro): 0.4601
F1 Score (Weighted): 0.5633
Total Inference Time: 0.055398 seconds
Total Samples: 38
Average Inference Time per Sample: 0.001458 seconds

Processing aspect: Compatibility


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.7164
Epoch 2/5, Loss: 0.6856
Epoch 3/5, Loss: 0.6771
Epoch 4/5, Loss: 0.6454
Epoch 5/5, Loss: 0.5506
Aspect: Compatibility
Accuracy: 0.5526
F1 Score (Micro): 0.5526
F1 Score (Macro): 0.5369
F1 Score (Weighted): 0.5683
Total Inference Time: 0.053740 seconds
Total Samples: 38
Average Inference Time per Sample: 0.001414 seconds

Processing aspect: Documentation


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6305
Epoch 2/5, Loss: 0.5953
Epoch 3/5, Loss: 0.5996
Epoch 4/5, Loss: 0.5892
Epoch 5/5, Loss: 0.5520
Aspect: Documentation
Accuracy: 0.5686
F1 Score (Micro): 0.5686
F1 Score (Macro): 0.5476
F1 Score (Weighted): 0.5858
Total Inference Time: 0.138121 seconds
Total Samples: 102
Average Inference Time per Sample: 0.001354 seconds

Processing aspect: Legal


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6139
Epoch 2/5, Loss: 0.6061
Epoch 3/5, Loss: 0.5800
Epoch 4/5, Loss: 0.5268
Epoch 5/5, Loss: 0.3499
Aspect: Legal
Accuracy: 0.5000
F1 Score (Micro): 0.5000
F1 Score (Macro): 0.3333
F1 Score (Weighted): 0.3667
Total Inference Time: 0.031758 seconds
Total Samples: 20
Average Inference Time per Sample: 0.001588 seconds

Processing aspect: Portability


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6425
Epoch 2/5, Loss: 0.6594
Epoch 3/5, Loss: 0.6114
Epoch 4/5, Loss: 0.5812
Epoch 5/5, Loss: 0.5553
Aspect: Portability
Accuracy: 0.7143
F1 Score (Micro): 0.7143
F1 Score (Macro): 0.4167
F1 Score (Weighted): 0.5952
Total Inference Time: 0.044506 seconds
Total Samples: 28
Average Inference Time per Sample: 0.001589 seconds

Processing aspect: OnlySentiment


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6822
Epoch 2/5, Loss: 0.6838
Epoch 3/5, Loss: 0.6812
Epoch 4/5, Loss: 0.6715
Epoch 5/5, Loss: 0.6858
Aspect: OnlySentiment
Accuracy: 0.6071
F1 Score (Micro): 0.6071
F1 Score (Macro): 0.3778
F1 Score (Weighted): 0.4587
Total Inference Time: 0.209421 seconds
Total Samples: 140
Average Inference Time per Sample: 0.001496 seconds

Processing aspect: Others


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.3687
Epoch 2/5, Loss: 0.3450
Epoch 3/5, Loss: 0.3379
Epoch 4/5, Loss: 0.3346
Epoch 5/5, Loss: 0.3346
Aspect: Others
Accuracy: 0.8868
F1 Score (Micro): 0.8868
F1 Score (Macro): 0.4700
F1 Score (Weighted): 0.8335
Total Inference Time: 2.349885 seconds
Total Samples: 680
Average Inference Time per Sample: 0.003456 seconds

Final Results:
           aspect  accuracy  f1_micro  f1_macro  f1_weighted  \
0       Usability  0.674783  0.674783  0.402908     0.543750   
1     Performance  0.642857  0.642857  0.391304     0.503106   
2             Bug  0.828947  0.828947  0.453237     0.751420   
3        Security  0.818182  0.818182  0.450000     0.736364   
4       Community  0.657895  0.657895  0.460109     0.563302   
5   Compatibility  0.552632  0.552632  0.536918     0.568346   
6   Documentation  0.568627  0.568627  0.547581     0.585848   
7           Legal  0.500000  0.500000  0.333333     0.366667   
8     Portability  0.714286  0.714286  0.416667     0.595238   
9   

In [None]:
# DistilBERT (revised)

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import time

# Aspect names; the loop below filters the dataset by each one in turn
aspects = [
    'Usability',
    'Performance',
    'Bug',
    'Security',
    'Community',
    'Compatibility',
    'Documentation',
    'Legal',
    'Portability',
    'OnlySentiment',
    'Others',
]

# Tokenizer for the same checkpoint name the model is loaded from below
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Location of the dataset spreadsheet (update this path for your environment)
file_path = '/content/BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'

# Read the spreadsheet and keep only the three columns used downstream
columns_of_interest = ['sent', 'ManualLabel', 'codes']
data = pd.read_excel(file_path)
df = data[columns_of_interest]

# One metrics dictionary per processed aspect is appended here
results = []

# The helpers and the device do not depend on the aspect, so they are set up
# once here instead of being rebuilt on every loop iteration.
def tokenize_function(texts):
    """Tokenize a list of texts into PyTorch tensors of fixed length.

    Each sequence is padded or truncated to 128 tokens, using the
    module-level ``tokenizer``.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


class SentimentDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer class labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        # Select row ``idx`` from every encoding tensor and attach its label
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over each aspect: filter, split, fine-tune, evaluate, record metrics
for aspect in aspects:
    print(f"\nProcessing aspect: {aspect}")
    # Filter the dataset for the current aspect. The aspect is a plain name,
    # so it is matched literally (regex=False) rather than as a pattern.
    aspect_mask = df['codes'].str.contains(aspect, case=False, na=False, regex=False)
    df_filtered = df[aspect_mask].copy()

    # Check if the filtered dataset is empty
    if df_filtered.empty:
        print(f"No data found for aspect: {aspect}")
        continue

    # Map labels: 'p' to 1 and others to 0
    df_filtered['ManualLabel'] = df_filtered['ManualLabel'].apply(lambda x: 1 if x == 'p' else 0)

    # Split the dataset (60% train / 40% test, fixed seed)
    train_df, test_df = train_test_split(df_filtered, test_size=0.4, random_state=42)

    train_texts = train_df['sent'].tolist()
    train_labels = train_df['ManualLabel'].tolist()
    test_texts = test_df['sent'].tolist()
    test_labels = test_df['ManualLabel'].tolist()

    # Tokenize the text data
    train_encodings = tokenize_function(train_texts)
    test_encodings = tokenize_function(test_texts)

    # Create data loaders
    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Initialize a fresh model and optimizer for this aspect
    num_labels = 2
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    model.to(device)

    # Training loop
    model.train()
    num_epochs = 5

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation loop
    model.eval()
    predictions, true_labels = [], []
    total_inference_time = 0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # CUDA kernels are launched asynchronously: without synchronizing,
            # the timer would stop before the forward pass has finished (and
            # could include pending work from the host-to-device copies).
            if device.type == 'cuda':
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            if device.type == 'cuda':
                torch.cuda.synchronize()
            inference_time = time.perf_counter() - start_time

            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            # Labels are only needed for scoring, so they stay on the CPU
            true_labels.extend(batch['labels'].tolist())

            total_inference_time += inference_time
            total_samples += input_ids.size(0)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    average_inference_time = total_inference_time / total_samples if total_samples > 0 else 0

    # Print results
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score (Micro): {f1_micro:.4f}')
    print(f'F1 Score (Macro): {f1_macro:.4f}')
    print(f'F1 Score (Weighted): {f1_weighted:.4f}')
    print(f'Total Inference Time: {total_inference_time:.6f} seconds')
    print(f'Total Samples: {total_samples}')
    print(f'Average Inference Time per Sample: {average_inference_time:.6f} seconds')

    # Collect results
    results.append({
        'aspect': aspect,
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'total_inference_time': total_inference_time,
        'total_samples': total_samples,
        'average_inference_time': average_inference_time
    })

# After processing all aspects, display the per-aspect metrics as one table
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# --- Model Score Calculation --- #
#
# model_score = 0.75 * avg_f1
#             + 0.25 * (max_avg_runtime - measured_avg_runtime) / max_avg_runtime
#
# Both runtime quantities must be per-sample averages. The previous version
# took the maximum from the 'total_inference_time' column (seconds for a whole
# test split), which is orders of magnitude larger than a per-sample average,
# so the runtime term was always ~1 and contributed a constant ~0.25.
# NOTE(review): if the benchmark defines "max avg runtime" as a fixed budget
# shared across models, substitute that constant for max_avg_runtime here.

if results_df.empty:
    # No aspect produced results, so there is nothing to average.
    model_score = float('nan')
else:
    # Compute Average F1 Score (avg. F1)
    avg_f1 = results_df['f1_micro'].mean()

    # Compute Measured Average Runtime (measured avg runtime)
    measured_avg_runtime = results_df['average_inference_time'].mean()

    # Compute Maximum Average Runtime (max avg runtime): the slowest
    # per-sample average observed across aspects
    max_avg_runtime = results_df['average_inference_time'].max()

    # Guard against a zero denominator (e.g. every measured time is 0)
    if max_avg_runtime > 0:
        runtime_term = (max_avg_runtime - measured_avg_runtime) / max_avg_runtime
    else:
        runtime_term = 0.0

    # Compute the Model Score
    model_score = avg_f1 * 0.75 + runtime_term * 0.25

print(f"\nModel Score: {model_score:.4f}")



Processing aspect: Usability


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6074
Epoch 2/5, Loss: 0.4859
Epoch 3/5, Loss: 0.2657
Epoch 4/5, Loss: 0.1184
Epoch 5/5, Loss: 0.0752
Aspect: Usability
Accuracy: 0.6817
F1 Score (Micro): 0.6817
F1 Score (Macro): 0.6445
F1 Score (Weighted): 0.6847
Total Inference Time: 0.438476 seconds
Total Samples: 575
Average Inference Time per Sample: 0.000763 seconds

Processing aspect: Performance


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6868
Epoch 2/5, Loss: 0.6685
Epoch 3/5, Loss: 0.6272
Epoch 4/5, Loss: 0.4051
Epoch 5/5, Loss: 0.1681
Aspect: Performance
Accuracy: 0.6357
F1 Score (Micro): 0.6357
F1 Score (Macro): 0.6015
F1 Score (Weighted): 0.6349
Total Inference Time: 0.106564 seconds
Total Samples: 140
Average Inference Time per Sample: 0.000761 seconds

Processing aspect: Bug


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.4883
Epoch 2/5, Loss: 0.4041
Epoch 3/5, Loss: 0.3638
Epoch 4/5, Loss: 0.1812
Epoch 5/5, Loss: 0.0915
Aspect: Bug
Accuracy: 0.7500
F1 Score (Micro): 0.7500
F1 Score (Macro): 0.5952
F1 Score (Weighted): 0.7599
Total Inference Time: 0.066012 seconds
Total Samples: 76
Average Inference Time per Sample: 0.000869 seconds

Processing aspect: Security


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.5391
Epoch 2/5, Loss: 0.5687
Epoch 3/5, Loss: 0.4639
Epoch 4/5, Loss: 0.4250
Epoch 5/5, Loss: 0.4299
Aspect: Security
Accuracy: 0.8182
F1 Score (Micro): 0.8182
F1 Score (Macro): 0.4500
F1 Score (Weighted): 0.7364
Total Inference Time: 0.087425 seconds
Total Samples: 66
Average Inference Time per Sample: 0.001325 seconds

Processing aspect: Community


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6131
Epoch 2/5, Loss: 0.5328
Epoch 3/5, Loss: 0.3902
Epoch 4/5, Loss: 0.2240
Epoch 5/5, Loss: 0.0637
Aspect: Community
Accuracy: 0.6316
F1 Score (Micro): 0.6316
F1 Score (Macro): 0.3871
F1 Score (Weighted): 0.5093
Total Inference Time: 0.033148 seconds
Total Samples: 38
Average Inference Time per Sample: 0.000872 seconds

Processing aspect: Compatibility


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.7169
Epoch 2/5, Loss: 0.6667
Epoch 3/5, Loss: 0.6253
Epoch 4/5, Loss: 0.4948
Epoch 5/5, Loss: 0.2248
Aspect: Compatibility
Accuracy: 0.7105
F1 Score (Micro): 0.7105
F1 Score (Macro): 0.6140
F1 Score (Weighted): 0.6851
Total Inference Time: 0.028947 seconds
Total Samples: 38
Average Inference Time per Sample: 0.000762 seconds

Processing aspect: Documentation


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6138
Epoch 2/5, Loss: 0.4977
Epoch 3/5, Loss: 0.3266
Epoch 4/5, Loss: 0.1918
Epoch 5/5, Loss: 0.0497
Aspect: Documentation
Accuracy: 0.6961
F1 Score (Micro): 0.6961
F1 Score (Macro): 0.5837
F1 Score (Weighted): 0.6685
Total Inference Time: 0.079418 seconds
Total Samples: 102
Average Inference Time per Sample: 0.000779 seconds

Processing aspect: Legal


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6536
Epoch 2/5, Loss: 0.5440
Epoch 3/5, Loss: 0.4912
Epoch 4/5, Loss: 0.3582
Epoch 5/5, Loss: 0.2608
Aspect: Legal
Accuracy: 0.7500
F1 Score (Micro): 0.7500
F1 Score (Macro): 0.7333
F1 Score (Weighted): 0.7400
Total Inference Time: 0.017603 seconds
Total Samples: 20
Average Inference Time per Sample: 0.000880 seconds

Processing aspect: Portability


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6690
Epoch 2/5, Loss: 0.5870
Epoch 3/5, Loss: 0.4963
Epoch 4/5, Loss: 0.4135
Epoch 5/5, Loss: 0.2499
Aspect: Portability
Accuracy: 0.7143
F1 Score (Micro): 0.7143
F1 Score (Macro): 0.5758
F1 Score (Weighted): 0.6797
Total Inference Time: 0.038813 seconds
Total Samples: 28
Average Inference Time per Sample: 0.001386 seconds

Processing aspect: OnlySentiment


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.6853
Epoch 2/5, Loss: 0.6203
Epoch 3/5, Loss: 0.4609
Epoch 4/5, Loss: 0.2601
Epoch 5/5, Loss: 0.1276
Aspect: OnlySentiment
Accuracy: 0.6714
F1 Score (Micro): 0.6714
F1 Score (Macro): 0.6453
F1 Score (Weighted): 0.6659
Total Inference Time: 0.132648 seconds
Total Samples: 140
Average Inference Time per Sample: 0.000947 seconds

Processing aspect: Others


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized:

Epoch 1/5, Loss: 0.3461
Epoch 2/5, Loss: 0.3281
Epoch 3/5, Loss: 0.3164
Epoch 4/5, Loss: 0.2846
Epoch 5/5, Loss: 0.1792
Aspect: Others
Accuracy: 0.8647
F1 Score (Micro): 0.8647
F1 Score (Macro): 0.5125
F1 Score (Weighted): 0.8330
Total Inference Time: 0.561603 seconds
Total Samples: 680
Average Inference Time per Sample: 0.000826 seconds

Final Results:
           aspect  accuracy  f1_micro  f1_macro  f1_weighted  \
0       Usability  0.681739  0.681739  0.644545     0.684739   
1     Performance  0.635714  0.635714  0.601540     0.634881   
2             Bug  0.750000  0.750000  0.595178     0.759882   
3        Security  0.818182  0.818182  0.450000     0.736364   
4       Community  0.631579  0.631579  0.387097     0.509338   
5   Compatibility  0.710526  0.710526  0.614035     0.685134   
6   Documentation  0.696078  0.696078  0.583673     0.668507   
7           Legal  0.750000  0.750000  0.733333     0.740000   
8     Portability  0.714286  0.714286  0.575758     0.679654   
9   

In [None]:
# Roberta Large
# !pip install pandas numpy scikit-learn torch torchvision torchaudio transformers

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import time

# Aspect names used to slice the dataset; one classifier is trained per entry.
aspects = [
    'Usability',
    'Performance',
    'Bug',
    'Security',
    'Community',
    'Compatibility',
    'Documentation',
    'Legal',
    'Portability',
    'OnlySentiment',
    'Others',
]

# Tokenizer matching the 'roberta-large' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Location of the benchmark spreadsheet (adjust for your environment).
file_path = '/content/BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'

# Read the spreadsheet and keep only the columns used below:
# the sentence text, its manual label, and its aspect codes.
data = pd.read_excel(file_path)
df = data[['sent', 'ManualLabel', 'codes']]

# One metrics dict per processed aspect is appended here.
results = []

# Helpers and settings shared by every per-aspect run. They are defined once
# here instead of being re-created on each loop iteration.


def tokenize_function(texts):
    """Tokenize a list of sentences into fixed-length PyTorch tensors.

    Every sentence is padded or truncated to 128 tokens using the
    module-level ``tokenizer``.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )


class SentimentDataset(Dataset):
    """Pair tokenizer encodings with integer labels for use in a DataLoader."""

    def __init__(self, encodings, labels):
        # Detached copies so the dataset owns its tensors.
        self.encodings = {key: val.clone().detach() for key, val in encodings.items()}
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 5
# RoBERTa-large is normally fine-tuned with a learning rate around 1e-5 to
# 3e-5. With 5e-5 the training loss in earlier runs of this cell stayed flat
# and the classifier predicted a single class for every aspect.
learning_rate = 1e-5

# Loop over each aspect: fine-tune a fresh model on the sentences tagged with
# that aspect, evaluate it on a held-out split, and record the metrics.
for aspect in aspects:
    print(f"\nProcessing aspect: {aspect}")
    # Filter the dataset for the current aspect (case-insensitive match on 'codes')
    df_filtered = df[df['codes'].str.contains(aspect, case=False, na=False)].copy()

    # Check if the filtered dataset is empty
    if df_filtered.empty:
        print(f"No data found for aspect: {aspect}")
        continue

    # Map labels: 'p' to 1 and others to 0
    df_filtered['ManualLabel'] = df_filtered['ManualLabel'].apply(lambda x: 1 if x == 'p' else 0)

    # Split the dataset (60% train / 40% test, fixed seed for reproducibility)
    train_df, test_df = train_test_split(df_filtered, test_size=0.4, random_state=42)

    train_texts = train_df['sent'].tolist()
    train_labels = train_df['ManualLabel'].tolist()
    test_texts = test_df['sent'].tolist()
    test_labels = test_df['ManualLabel'].tolist()

    train_encodings = tokenize_function(train_texts)
    test_encodings = tokenize_function(test_texts)

    # Create data loaders
    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Initialize a fresh RoBERTa-large model for sequence classification
    model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    model.to(device)

    # Training loop
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation loop
    model.eval()
    predictions, true_labels = [], []
    total_inference_time = 0
    total_samples = 0
    on_cuda = device.type == "cuda"

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # CUDA kernels are launched asynchronously, so wait for pending
            # work before starting the clock and again before stopping it;
            # otherwise only the launch overhead would be measured.
            if on_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            outputs = model(input_ids, attention_mask=attention_mask)
            if on_cuda:
                torch.cuda.synchronize()
            inference_time = time.perf_counter() - start_time

            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            # Labels are only needed for scoring, so they stay on the CPU.
            true_labels.extend(batch['labels'].tolist())

            total_inference_time += inference_time
            total_samples += input_ids.size(0)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    average_inference_time = total_inference_time / total_samples if total_samples > 0 else 0

    # Print results
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score (Micro): {f1_micro:.4f}')
    print(f'F1 Score (Macro): {f1_macro:.4f}')
    print(f'F1 Score (Weighted): {f1_weighted:.4f}')
    print(f'Total Inference Time: {total_inference_time:.6f} seconds')
    print(f'Total Samples: {total_samples}')
    print(f'Average Inference Time per Sample: {average_inference_time:.6f} seconds')

    # Collect results
    results.append({
        'aspect': aspect,
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'total_inference_time': total_inference_time,
        'total_samples': total_samples,
        'average_inference_time': average_inference_time
    })

# After processing all aspects, display the results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# --- Model Score Calculation --- #
# Score = 0.75 * (mean micro-F1 over aspects)
#       + 0.25 * (max avg runtime - measured avg runtime) / max avg runtime
# Both runtime quantities are per-sample averages so they share the same unit.

if results_df.empty:
    # No aspect produced any rows, so the metric columns do not exist.
    model_score = float('nan')
    print("\nModel Score: not available (no results were collected)")
else:
    # Compute Average F1 Score (avg. F1)
    avg_f1 = results_df['f1_micro'].mean()

    # Compute Measured Average Runtime (measured avg runtime)
    measured_avg_runtime = results_df['average_inference_time'].mean()

    # Compute Maximum Average Runtime (max avg runtime): the slowest
    # per-sample average across aspects, not the largest total time.
    max_avg_runtime = results_df['average_inference_time'].max()

    # Guard against a zero denominator when every runtime is zero.
    if max_avg_runtime > 0:
        runtime_term = (max_avg_runtime - measured_avg_runtime) / max_avg_runtime
    else:
        runtime_term = 0.0

    # Compute the Model Score
    model_score = (avg_f1) * 0.75 + runtime_term * 0.25

    print(f"\nModel Score: {model_score:.4f}")




vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]


Processing aspect: Usability




pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.6652
Epoch 2/5, Loss: 0.6526
Epoch 3/5, Loss: 0.6383
Epoch 4/5, Loss: 0.6391
Epoch 5/5, Loss: 0.6460
Aspect: Usability
Accuracy: 0.6748
F1 Score (Micro): 0.6748
F1 Score (Macro): 0.4029
F1 Score (Weighted): 0.5438
Total Inference Time: 1.722734 seconds
Total Samples: 575
Average Inference Time per Sample: 0.002996 seconds

Processing aspect: Performance


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.6804
Epoch 2/5, Loss: 0.6604
Epoch 3/5, Loss: 0.6798
Epoch 4/5, Loss: 0.6842
Epoch 5/5, Loss: 0.6789
Aspect: Performance
Accuracy: 0.6429
F1 Score (Micro): 0.6429
F1 Score (Macro): 0.3913
F1 Score (Weighted): 0.5031
Total Inference Time: 0.425252 seconds
Total Samples: 140
Average Inference Time per Sample: 0.003038 seconds

Processing aspect: Bug


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.6781
Epoch 2/5, Loss: 0.4800
Epoch 3/5, Loss: 0.4208
Epoch 4/5, Loss: 0.4460
Epoch 5/5, Loss: 0.4628
Aspect: Bug
Accuracy: 0.8289
F1 Score (Micro): 0.8289
F1 Score (Macro): 0.4532
F1 Score (Weighted): 0.7514
Total Inference Time: 0.322428 seconds
Total Samples: 76
Average Inference Time per Sample: 0.004242 seconds

Processing aspect: Security


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.4638
Epoch 2/5, Loss: 0.5183
Epoch 3/5, Loss: 0.4536
Epoch 4/5, Loss: 0.4697
Epoch 5/5, Loss: 0.4517
Aspect: Security
Accuracy: 0.8182
F1 Score (Micro): 0.8182
F1 Score (Macro): 0.4500
F1 Score (Weighted): 0.7364
Total Inference Time: 0.203732 seconds
Total Samples: 66
Average Inference Time per Sample: 0.003087 seconds

Processing aspect: Community


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.7262
Epoch 2/5, Loss: 0.5663
Epoch 3/5, Loss: 0.5696
Epoch 4/5, Loss: 0.6128
Epoch 5/5, Loss: 0.5631
Aspect: Community
Accuracy: 0.6579
F1 Score (Micro): 0.6579
F1 Score (Macro): 0.3968
F1 Score (Weighted): 0.5221
Total Inference Time: 0.157927 seconds
Total Samples: 38
Average Inference Time per Sample: 0.004156 seconds

Processing aspect: Compatibility


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.7882
Epoch 2/5, Loss: 0.7111
Epoch 3/5, Loss: 0.7100
Epoch 4/5, Loss: 0.6963
Epoch 5/5, Loss: 0.6895
Aspect: Compatibility
Accuracy: 0.6842
F1 Score (Micro): 0.6842
F1 Score (Macro): 0.4062
F1 Score (Weighted): 0.5559
Total Inference Time: 0.108717 seconds
Total Samples: 38
Average Inference Time per Sample: 0.002861 seconds

Processing aspect: Documentation


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.6379
Epoch 2/5, Loss: 0.6252
Epoch 3/5, Loss: 0.5980
Epoch 4/5, Loss: 0.6200
Epoch 5/5, Loss: 0.6287
Aspect: Documentation
Accuracy: 0.6961
F1 Score (Micro): 0.6961
F1 Score (Macro): 0.4104
F1 Score (Weighted): 0.5713
Total Inference Time: 0.292930 seconds
Total Samples: 102
Average Inference Time per Sample: 0.002872 seconds

Processing aspect: Legal


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.7337
Epoch 2/5, Loss: 0.7447
Epoch 3/5, Loss: 0.6899
Epoch 4/5, Loss: 0.5719
Epoch 5/5, Loss: 0.5307
Aspect: Legal
Accuracy: 0.5500
F1 Score (Micro): 0.5500
F1 Score (Macro): 0.3548
F1 Score (Weighted): 0.3903
Total Inference Time: 0.067950 seconds
Total Samples: 20
Average Inference Time per Sample: 0.003397 seconds

Processing aspect: Portability


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.6269
Epoch 2/5, Loss: 0.6225
Epoch 3/5, Loss: 0.7693
Epoch 4/5, Loss: 0.6974
Epoch 5/5, Loss: 0.6253
Aspect: Portability
Accuracy: 0.7143
F1 Score (Micro): 0.7143
F1 Score (Macro): 0.4167
F1 Score (Weighted): 0.5952
Total Inference Time: 0.087677 seconds
Total Samples: 28
Average Inference Time per Sample: 0.003131 seconds

Processing aspect: OnlySentiment


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.7132
Epoch 2/5, Loss: 0.6961
Epoch 3/5, Loss: 0.6837
Epoch 4/5, Loss: 0.6866
Epoch 5/5, Loss: 0.6664
Aspect: OnlySentiment
Accuracy: 0.6071
F1 Score (Micro): 0.6071
F1 Score (Macro): 0.3778
F1 Score (Weighted): 0.4587
Total Inference Time: 0.410434 seconds
Total Samples: 140
Average Inference Time per Sample: 0.002932 seconds

Processing aspect: Others


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and ar

Epoch 1/5, Loss: 0.3701
Epoch 2/5, Loss: 0.3375
Epoch 3/5, Loss: 0.3397
Epoch 4/5, Loss: 0.3482
Epoch 5/5, Loss: 0.3411
Aspect: Others
Accuracy: 0.8868
F1 Score (Micro): 0.8868
F1 Score (Macro): 0.4700
F1 Score (Weighted): 0.8335
Total Inference Time: 2.025745 seconds
Total Samples: 680
Average Inference Time per Sample: 0.002979 seconds

Final Results:
           aspect  accuracy  f1_micro  f1_macro  f1_weighted  \
0       Usability  0.674783  0.674783  0.402908     0.543750   
1     Performance  0.642857  0.642857  0.391304     0.503106   
2             Bug  0.828947  0.828947  0.453237     0.751420   
3        Security  0.818182  0.818182  0.450000     0.736364   
4       Community  0.657895  0.657895  0.396825     0.522139   
5   Compatibility  0.684211  0.684211  0.406250     0.555921   
6   Documentation  0.696078  0.696078  0.410405     0.571348   
7           Legal  0.550000  0.550000  0.354839     0.390323   
8     Portability  0.714286  0.714286  0.416667     0.595238   
9   