In [4]:
import pandas as pd

# Synthetic evaluation set: each column is a 12-entry pattern tiled out to
# 192 rows. 'Truth' and 'Prediction' hold pipe-delimited label strings;
# 'Subtypes' assigns rows to groups A/B (first 96 rows) and C/D (last 96).
data = {
    'Truth': 16 * [
        "String1_1",
        "String2_1|String2_2",
        "String1_1|String1_2|String1_3",
        "String1_1|String1_2",
        "String2_1|String2_2",
        "String1_1|String1_2",
        "String2_1|String2_2|String2_3",
        "String2_1|String2_2",
        "String1_1|String1_2|String1_3",
        "String2_1|String2_2",
        "String1_1|String1_2",
        "String2_1|String2_2",
    ],
    'Prediction': 16 * [
        "String1_1|String1_2|String1_3",
        "String2_1|String2_2",
        "String1_1|String1_2",
        "String2_1|String2_2",
        "String2_1|String2_2",
        "String1_1|String1_2",
        "String1_1|String1_2|String1_3",
        "String2_1|String2_2",
        "String1_1|String1_2",
        "String2_1|String2_2",
        "String1_1|String1_2",
        "String1_1|String1_2|String1_3",
    ],
    'Subtypes': 8 * (6 * ['A'] + 6 * ['B']) + 8 * (6 * ['C'] + 6 * ['D']),
}

# Build the frame and number the rows 1..N in an 'ID' column.
df = pd.DataFrame(data).assign(ID=lambda frame: range(1, len(frame) + 1))


In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer


# Split the pipe-delimited values and convert them into lists of strings.
# The columns are overwritten in place, so a second run of this cell sees
# lists rather than strings; those are passed through untouched instead of
# failing with "'list' object has no attribute 'split'".
for column in ('Truth', 'Prediction'):
    df[column] = df[column].apply(
        lambda labels: labels.split('|') if isinstance(labels, str) else labels
    )

# Function to compute confusion matrix for multi-label data
def multi_label_confusion_matrix(y_true, y_pred):
    """Compute one 2x2 confusion matrix per label column.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Indicator matrices; a non-zero entry means the label is present.

    Returns
    -------
    numpy.ndarray of shape (n_labels, 2, 2), dtype int
        ``cm[i]`` is ``[[TN, FP], [FN, TP]]`` for label column ``i``.

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their shapes differ.

    Notes
    -----
    The counts are taken directly with numpy rather than by calling
    ``confusion_matrix`` once per column. Without an explicit ``labels=``
    argument that function sizes its result by the classes actually present,
    so a column that is constant in both inputs yields a 1x1 matrix which
    would be broadcast into all four cells of ``cm[i]``.
    """
    truth = np.asarray(y_true).astype(bool)
    pred = np.asarray(y_pred).astype(bool)
    if truth.ndim != 2 or truth.shape != pred.shape:
        raise ValueError(
            "y_true and y_pred must be 2-D indicator matrices of identical shape; "
            f"got {truth.shape} and {pred.shape}"
        )

    # Column-wise counts of each outcome.
    tn = (~truth & ~pred).sum(axis=0)
    fp = (~truth & pred).sum(axis=0)
    fn = (truth & ~pred).sum(axis=0)
    tp = (truth & pred).sum(axis=0)

    # (n_labels, 4) -> (n_labels, 2, 2) in row-major order [[TN, FP], [FN, TP]].
    n_labels = truth.shape[1]
    return np.stack([tn, fp, fn, tp], axis=1).reshape(n_labels, 2, 2).astype(int)

# Binarize the labels.
# The vocabulary is learned from BOTH columns: transform() ignores (with a
# warning) any label that was not seen during fit, so fitting on 'Truth'
# alone would silently drop predicted labels that never occur in the truth
# and leave them out of mlb.classes_.
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([df['Truth'], df['Prediction']], ignore_index=True))
binarized_truth = mlb.transform(df['Truth'])
binarized_prediction = mlb.transform(df['Prediction'])

# Group the data by subtypes
subtypes = df['Subtypes'].unique()
subtype_summaries = {}

# for subtype in subtypes:
#     mask = df['Subtypes'] == subtype
#     subtype_data = df[mask]
#     truth = binarized_truth[mask]
#     prediction = binarized_prediction[mask]
#     subtype_summaries[subtype] = {
#         "Data": subtype_data,
#         "Truth": truth,
#         "Prediction": prediction




def calculate_confusion(row, classes=None):
    """Classify every label as TP, FP, FN or TN for a single row.

    Parameters
    ----------
    row : mapping
        Must provide ``row['Truth']`` and ``row['Prediction']``, each an
        iterable of labels (e.g. a DataFrame row passed by
        ``df.apply(..., axis=1)``).
    classes : iterable, optional
        The full label vocabulary used to derive the true negatives.
        Defaults to ``mlb.classes_`` from the module-level binarizer, which
        is what the function used before this parameter existed.

    Returns
    -------
    dict
        Keys ``'TP'``, ``'FP'``, ``'FN'``, ``'TN'``, each mapping to a set of
        labels: in both / predicted only / truth only / in neither.
    """
    truth = set(row['Truth'])
    predicted = set(row['Prediction'])
    if classes is None:
        classes = mlb.classes_
    return {
        'TP': truth & predicted,
        'FP': predicted - truth,
        'FN': truth - predicted,
        'TN': set(classes) - truth - predicted,
    }

# Partition the frame by subtype: one sub-frame per distinct 'Subtypes'
# value, keyed by that value, in order of first appearance.
subtypes = df['Subtypes'].unique()
subtype_summaries = {
    label: df[df['Subtypes'] == label]
    for label in subtypes
}

# Folder that receives one "<subtype>_metrics.xlsx" workbook per subtype.
# The default is the original location; set the METRICS_OUTPUT_DIR
# environment variable to write somewhere else without editing this cell.
output_dir = os.environ.get('METRICS_OUTPUT_DIR', r'C:\Users\sigar\Documents\Test')
os.makedirs(output_dir, exist_ok=True)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Calculate metrics and confusion matrix for each subtype
for subtype in subtypes:
    subtype_data = subtype_summaries[subtype]

    # One {'TP', 'FP', 'FN', 'TN'} dict of label sets per row
    confusion_data = subtype_data.apply(calculate_confusion, axis=1)

    # Pool the label counts over all rows of the subtype
    tp = sum(len(x['TP']) for x in confusion_data)
    fp = sum(len(x['FP']) for x in confusion_data)
    fn = sum(len(x['FN']) for x in confusion_data)
    tn = sum(len(x['TN']) for x in confusion_data)  # not written to the workbook

    # Precision, recall and F1 from the pooled counts. A zero denominator
    # (no predicted labels, no true labels, or no true positives) yields 0.0
    # instead of raising ZeroDivisionError.
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

#     print(f'Precision for {subtype}: {precision}')
#     print(f'Recall for {subtype}: {recall}')
#     print(f'F1-score for {subtype}: {f1_score}')

    # Save the scores and, per outcome, the original DataFrame rows that have
    # at least one label with that outcome, in a separate Excel file
    with pd.ExcelWriter(os.path.join(output_dir, f'{subtype}_metrics.xlsx')) as writer:
        pd.DataFrame(
            {'Precision': [precision], 'Recall': [recall], 'F1-score': [f1_score]}
        ).to_excel(writer, sheet_name='accuracy_score', index=False)
        for outcome in ('FN', 'FP', 'TP', 'TN'):
            # Bind the sheet name as a default so each lambda tests its own key.
            has_outcome = confusion_data.apply(lambda x, key=outcome: bool(x[key]))
            subtype_data[has_outcome].to_excel(writer, sheet_name=outcome, index=False)

        
