In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import time

# Define the list of aspects
aspects = [
    'Usability', 'Performance', 'Bug', 'Security', 'Community',
    'Compatibility', 'Documentation', 'Legal', 'Portability',
    'OnlySentiment', 'Others'
]

# Initialize the tokenizer and model for sentiment analysis
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your dataset
file_path = '/content/BenchmarkUddinSO-ConsoliatedAspectSentiment.xls'  # Update the file path
data = pd.read_excel(file_path)

# Filter the dataset to retain only the necessary columns
df = data[['sent', 'ManualLabel', 'codes']]

# Initialize a list to collect results
results = []

# Function to tokenize text
def tokenize_function(texts):
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

# Loop over each aspect
for aspect in aspects:
    print(f"\nProcessing aspect: {aspect}")

    # Filter the dataset for the current aspect
    df_filtered = df[df['codes'].str.contains(aspect, case=False, na=False)].copy()

    if df_filtered.empty:
        print(f"No data found for aspect: {aspect}")
        continue

    # Map labels: 'p' to 1 (positive) and others to 0 (negative)
    df_filtered['ManualLabel'] = df_filtered['ManualLabel'].apply(lambda x: 1 if x == 'p' else 0)

    # Split the dataset into training and testing sets
    train_df, test_df = train_test_split(df_filtered, test_size=0.4, random_state=42)

    # Tokenize the text data
    train_encodings = tokenize_function(train_df['sent'].tolist())
    test_encodings = tokenize_function(test_df['sent'].tolist())

    train_labels = train_df['ManualLabel'].tolist()
    test_labels = test_df['ManualLabel'].tolist()

    # Create a custom dataset class
    class SentimentDataset(Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = torch.tensor(labels, dtype=torch.long)

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.labels)

    # Create data loaders
    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Training loop
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    num_epochs = 5
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Evaluation loop
    model.eval()
    predictions, true_labels = [], []
    total_inference_time = 0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            start_time = time.time()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            inference_time = time.time() - start_time

            logits = outputs.logits
            preds = logits.argmax(dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            total_inference_time += inference_time
            total_samples += input_ids.size(0)

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)
    f1_weighted = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    average_inference_time = total_inference_time / total_samples if total_samples > 0 else 0

    # Print results
    print(f'Aspect: {aspect}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1 Score (Micro): {f1_micro:.4f}')
    print(f'F1 Score (Macro): {f1_macro:.4f}')
    print(f'F1 Score (Weighted): {f1_weighted:.4f}')
    print(f'Total Inference Time: {total_inference_time:.6f} seconds')
    print(f'Total Samples: {total_samples}')
    print(f'Average Inference Time per Sample: {average_inference_time:.6f} seconds')

    # Collect results
    results.append({
        'aspect': aspect,
        'accuracy': accuracy,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'total_inference_time': total_inference_time,
        'total_samples': total_samples,
        'average_inference_time': average_inference_time
    })

# After processing all aspects, display the results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)

# --- Model Score Calculation --- #

# Compute Average F1 Score (avg. F1)
avg_f1 = results_df['f1_micro'].mean()

# Compute Measured Average Runtime (measured avg runtime)
measured_avg_runtime = results_df['average_inference_time'].mean()

# Compute Maximum Average Runtime (max avg runtime)
max_avg_runtime = results_df['total_inference_time'].max()

# Compute the Model Score
model_score = (avg_f1) * 0.75 + ((max_avg_runtime - measured_avg_runtime) / max_avg_runtime) * 0.25

print(f"\nModel Score: {model_score:.4f}")





Processing aspect: Usability
Epoch 1/5, Loss: 0.6611
Epoch 2/5, Loss: 0.4158
Epoch 3/5, Loss: 0.2008
Epoch 4/5, Loss: 0.1013
Epoch 5/5, Loss: 0.0978
Aspect: Usability
Accuracy: 0.7270
F1 Score (Micro): 0.7270
F1 Score (Macro): 0.6670
F1 Score (Weighted): 0.7164
Total Inference Time: 0.410288 seconds
Total Samples: 575
Average Inference Time per Sample: 0.000714 seconds

Processing aspect: Performance
Epoch 1/5, Loss: 0.8976
Epoch 2/5, Loss: 0.4377
Epoch 3/5, Loss: 0.2137
Epoch 4/5, Loss: 0.0990
Epoch 5/5, Loss: 0.0711
Aspect: Performance
Accuracy: 0.7000
F1 Score (Micro): 0.7000
F1 Score (Macro): 0.6950
F1 Score (Weighted): 0.7062
Total Inference Time: 0.110245 seconds
Total Samples: 140
Average Inference Time per Sample: 0.000787 seconds

Processing aspect: Bug
Epoch 1/5, Loss: 0.7017
Epoch 2/5, Loss: 0.2348
Epoch 3/5, Loss: 0.1027
Epoch 4/5, Loss: 0.0323
Epoch 5/5, Loss: 0.0094
Aspect: Bug
Accuracy: 0.8553
F1 Score (Micro): 0.8553
F1 Score (Macro): 0.7367
F1 Score (Weighted): 0.8529