In [11]:
import numpy as np
import pandas as pd

# Function to calculate confusion matrix from true and predicted labels
def calculate_confusion_matrix(true_labels, predicted_labels, class_labels):
    """
    Build a multiclass confusion matrix by tallying (true, predicted) label pairs.

    Rows correspond to the true class and columns to the predicted class, both
    ordered as in ``class_labels``.

    Args:
        true_labels (iterable): Ground-truth label of each sample.
        predicted_labels (iterable): Predicted label of each sample, paired
            position-by-position with ``true_labels``.
        class_labels (list): Every label that may occur; its order fixes the
            row/column order of the result.
    Returns:
        np.ndarray: Integer array of shape (num_classes, num_classes) where
        entry [i, j] counts the samples of true class i predicted as class j.
    Raises:
        KeyError: If a label is encountered that is not in ``class_labels``.
    """
    size = len(class_labels)
    position = dict(zip(class_labels, range(size)))
    counts = np.zeros((size, size), dtype=int)

    # Translate every label pair into its (row, col) cell of the matrix.
    cells = [
        (position[actual], position[guess])
        for actual, guess in zip(true_labels, predicted_labels)
    ]
    if cells:
        coords = np.array(cells)
        # np.add.at accumulates repeated coordinates, which a plain
        # fancy-indexed "+=" would collapse into a single increment.
        np.add.at(counts, (coords[:, 0], coords[:, 1]), 1)

    return counts

# Function to calculate precision, recall, and F1-score for each class
def calculate_class_metrics(conf_matrix):
    """
    Derive per-class precision, recall, F1-score and support from a confusion matrix.

    The matrix is read with rows as true classes and columns as predicted
    classes, so for class i the row sum is its support and the column sum is
    the number of times it was predicted.

    Args:
        conf_matrix (np.ndarray): Square confusion matrix.
    Returns:
        dict: Maps each class index to a dict with the keys "precision",
        "recall", "f1_score" and "support". Any ratio whose denominator is
        zero is reported as 0.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0 instead of nan/inf.
        return numerator / denominator if denominator > 0 else 0

    per_class = {}
    for cls in range(len(conf_matrix)):
        hits = conf_matrix[cls, cls]
        times_predicted = conf_matrix[:, cls].sum()
        times_actual = conf_matrix[cls, :].sum()

        p = _ratio(hits, times_predicted)
        r = _ratio(hits, times_actual)

        per_class[cls] = {
            "precision": p,
            "recall": r,
            # Harmonic mean of precision and recall.
            "f1_score": _ratio(2 * p * r, p + r),
            "support": times_actual,
        }

    return per_class

# Function to calculate overall accuracy
def calculate_accuracy(conf_matrix):
    """
    Computes the overall accuracy from the confusion matrix.

    Accuracy is the sum of the diagonal (correct predictions) divided by the
    sum of all entries (total predictions).

    Args:
        conf_matrix (np.ndarray): Confusion matrix.
    Returns:
        float: Overall accuracy, or 0.0 when the matrix contains no
        predictions at all.
    """
    total_predictions = conf_matrix.sum()
    # An empty matrix would otherwise evaluate 0/0 and return nan with a
    # RuntimeWarning; report 0.0, matching how the per-class metrics treat
    # an empty denominator.
    if total_predictions == 0:
        return 0.0
    return np.trace(conf_matrix) / total_predictions

# Function to calculate macro and weighted averages
def calculate_macro_weighted_averages(metrics, total_samples):
    """
    Aggregate per-class precision, recall and F1-score into macro and weighted averages.

    The macro average is the plain mean over classes; the weighted average
    scales each class's value by its "support" and divides by
    ``total_samples``.

    Args:
        metrics (dict): Per-class metric dicts, each holding "precision",
            "recall", "f1_score" and "support".
        total_samples (int): Total number of samples (the weighted-average
            denominator).
    Returns:
        dict: {"macro": {...}, "weighted": {...}}, each inner dict keyed by
        "precision", "recall" and "f1_score".
    """
    metric_names = ("precision", "recall", "f1_score")
    per_class = list(metrics.values())

    macro = {
        name: np.mean([entry[name] for entry in per_class])
        for name in metric_names
    }
    weighted = {
        name: sum(entry[name] * entry["support"] for entry in per_class) / total_samples
        for name in metric_names
    }

    return {"macro": macro, "weighted": weighted}

# Example usage
# Simulate true and predicted labels for a 3-class classification task.
# Both are drawn independently from the same class prior, so the predictions
# carry no information about the truth (a chance-level baseline).
class_labels = ['Class A', 'Class B', 'Class C']
class_priors = [0.4, 0.35, 0.25]

# Seeded generator so the results are reproducible on Restart & Run All.
rng = np.random.default_rng(42)
true_labels = rng.choice(class_labels, size=100, p=class_priors)
predicted_labels = rng.choice(class_labels, size=100, p=class_priors)

conf_matrix_custom = calculate_confusion_matrix(true_labels, predicted_labels, class_labels)
metrics_custom = calculate_class_metrics(conf_matrix_custom)
accuracy_custom = calculate_accuracy(conf_matrix_custom)
averages_custom = calculate_macro_weighted_averages(metrics_custom, len(true_labels))

In [10]:
# Tabulate the averages: one column per averaging scheme ("macro", "weighted"),
# one row per metric.
pd.DataFrame(averages_custom)

Unnamed: 0,macro,weighted
precision,0.325116,0.344628
recall,0.337703,0.33
f1_score,0.324712,0.33237


In [8]:
# Overall accuracy: confusion-matrix diagonal (correct predictions) over all predictions.
accuracy_custom

0.33

In [7]:
# Per-class metrics: one column per class index (in class_labels order),
# one row per metric.
pd.DataFrame(metrics_custom)

Unnamed: 0,0,1,2
precision,0.363636,0.378378,0.233333
recall,0.285714,0.358974,0.368421
f1_score,0.32,0.368421,0.285714
support,42.0,39.0,19.0


In [5]:
# Raw confusion matrix: rows are true classes, columns are predicted classes,
# both in class_labels order.
conf_matrix_custom

array([[12, 17, 13],
       [15, 14, 10],
       [ 6,  6,  7]])

In [4]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd 

# Calculate confusion matrix (rows/columns ordered as in class_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

# Generate classification report. Passing labels= pins each target name to its
# own class; without it the names are applied to the sorted labels actually
# observed, which misaligns (or errors) if a class is missing from the sample.
class_report = classification_report(
    true_labels,
    predicted_labels,
    labels=class_labels,
    target_names=class_labels,
    output_dict=True,
)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)

# Convert confusion matrix and report to DataFrame for better readability
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=[f'Actual {label}' for label in class_labels],
    columns=[f'Predicted {label}' for label in class_labels],
)

class_report_df = pd.DataFrame(class_report).transpose()

conf_matrix_df, class_report_df, accuracy

(                Predicted Class A  Predicted Class B  Predicted Class C
 Actual Class A                 12                 17                 13
 Actual Class B                 15                 14                 10
 Actual Class C                  6                  6                  7,
               precision    recall  f1-score  support
 Class A        0.363636  0.285714  0.320000    42.00
 Class B        0.378378  0.358974  0.368421    39.00
 Class C        0.233333  0.368421  0.285714    19.00
 accuracy       0.330000  0.330000  0.330000     0.33
 macro avg      0.325116  0.337703  0.324712   100.00
 weighted avg   0.344628  0.330000  0.332370   100.00,
 0.33)

In [12]:
# Function to calculate Character Error Rate (CER)
def calculate_character_error_rate(true_strings, predicted_strings):
    """
    Character Error Rate: total character-level edit distance divided by the
    total number of reference characters.

    Args:
        true_strings (list of str): Reference (ground truth) strings.
        predicted_strings (list of str): Predicted strings, paired
            position-by-position with ``true_strings``.
    Returns:
        float: Summed Levenshtein distance over all pairs divided by the
        summed length of the reference strings; 0 when the references
        contain no characters.
    """
    # One (reference length, edit distance) record per string pair.
    scored = [
        (len(reference), levenshtein_distance(reference, hypothesis))
        for reference, hypothesis in zip(true_strings, predicted_strings)
    ]
    char_count = sum(length for length, _ in scored)
    error_count = sum(distance for _, distance in scored)

    if char_count > 0:
        return error_count / char_count
    return 0

# Function to calculate Word Error Rate (WER)
def calculate_word_error_rate(true_strings, predicted_strings):
    """
    Word Error Rate: total word-level edit distance divided by the total
    number of reference words.

    Strings are tokenised with ``str.split()`` (whitespace-separated words).

    Args:
        true_strings (list of str): Reference (ground truth) strings.
        predicted_strings (list of str): Predicted strings, paired
            position-by-position with ``true_strings``.
    Returns:
        float: Summed Levenshtein distance between the word lists of each
        pair divided by the summed reference word count; 0 when the
        references contain no words.
    """
    token_pairs = (
        (reference.split(), hypothesis.split())
        for reference, hypothesis in zip(true_strings, predicted_strings)
    )
    # One (reference word count, edit distance) record per string pair.
    scored = [
        (len(ref_tokens), levenshtein_distance(ref_tokens, hyp_tokens))
        for ref_tokens, hyp_tokens in token_pairs
    ]
    word_count = sum(length for length, _ in scored)
    error_count = sum(distance for _, distance in scored)

    if word_count > 0:
        return error_count / word_count
    return 0

# Helper function to calculate Levenshtein distance
def levenshtein_distance(seq1, seq2):
    """
    Edit distance between two sequences, counting insertions, deletions and
    substitutions as one step each.

    Works on any pair of sequences whose elements can be compared with ``==``
    (characters of a string, words of a list, ...).

    Args:
        seq1 (str or list): First sequence.
        seq2 (str or list): Second sequence.
    Returns:
        int: Minimum number of single-element edits turning seq1 into seq2.
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    # table[r, c] holds the distance between seq1[:r] and seq2[:c].
    table = np.zeros((rows, cols), dtype=int)
    # Borders: reaching or leaving the empty prefix costs one edit per element.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            if left == right:
                # Matching elements extend the diagonal at no cost.
                table[r, c] = table[r - 1, c - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                table[r, c] = 1 + min(
                    table[r - 1, c], table[r, c - 1], table[r - 1, c - 1]
                )

    return table[-1, -1]

# Function to calculate Perplexity
def calculate_perplexity(probabilities):
    """
    Computes the Perplexity score.

    Perplexity is exp of the mean negative log-probability:
    exp(-(1/n) * sum(log(p_i))).

    Args:
        probabilities (list of float): List of predicted probabilities for the true labels.
    Returns:
        float: Perplexity score. ``inf`` when the list is empty or when any
        probability is non-positive.
    """
    probs = np.asarray(probabilities, dtype=float)
    if probs.size == 0:
        return float('inf')
    # A zero-probability event has log(p) = -inf, so the perplexity is
    # unbounded. Dropping such entries from the sum while still dividing by
    # the full count would understate the score, so report inf instead.
    # Negative values are not valid probabilities and are treated the same way.
    if np.any(probs <= 0):
        return float('inf')
    return float(np.exp(-np.mean(np.log(probs))))

# Demo inputs: three reference/prediction string pairs, plus the probability
# a model assigned to each true label.
true_strings = ["hello world", "machine learning", "openai"]
predicted_strings = ["helo world", "machine learn", "openia"]
probabilities = [0.8, 0.7, 0.9]

# Score the demo inputs with each metric.
cer, wer, perplexity = (
    calculate_character_error_rate(true_strings, predicted_strings),
    calculate_word_error_rate(true_strings, predicted_strings),
    calculate_perplexity(probabilities),
)

cer, wer, perplexity


(0.18181818181818182, 0.6, 1.2565790685485896)