In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_multilabel_classification

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Configuration of the synthetic multilabel dataset.
# n_labels is the average number of labels per instance.
n_samples = 10_000
n_features = 20
n_classes = 5
n_labels = 3

X, y = make_multilabel_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_classes=n_classes,
    n_labels=n_labels,
    random_state=0,  # fixed seed so the generated data is reproducible
)
X.shape, y.shape

((10000, 20), (10000, 5))

In [3]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((8000, 20), (2000, 20), (8000, 5), (2000, 5))

In [4]:
# Fit a random forest on the training split (fixed seed), then predict the
# label matrix for the held-out samples.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

### Multilabel classification metrics

In [5]:
def get_exact_match(y_test, y_pred):
    """Return the fraction of samples whose whole label vector is predicted correctly.

    Parameters
    ----------
    y_test : 2-D array-like of ground-truth labels, one row per sample.
    y_pred : 2-D array-like of predicted labels, same shape as ``y_test``.

    Returns
    -------
    float
        Share of rows in which every predicted label equals the true label.
        Returns 0.0 when there are no rows.
    """
    # A row counts only if *all* of its labels match, whatever the number of
    # label columns is (the count is not tied to a fixed number of classes).
    row_matches = np.all(np.asarray(y_pred) == np.asarray(y_test), axis=1)
    if row_matches.size == 0:
        return 0.0
    return np.mean(row_matches)

# Share of test samples whose full label vector was predicted correctly.
exact_match = get_exact_match(y_test=y_test, y_pred=y_pred)

print('Percentage of exactly correct predictions:', exact_match * 100, '%')

Percentage of exactly correct predictions: 28.95 %


Since it is difficult to get an exactly correct prediction, i.e. one where all predicted labels for an example match the ground-truth labels (measured by the exact match ratio), we look for metrics which reward the model for partially correct results.

1. **Macro and micro measures of Precision and Recall**
2. **Hamming Loss** = 1 - Accuracy

In [6]:
def get_hamming_loss(y_test, y_pred):
    """Return the fraction of individual label cells that are predicted incorrectly.

    Parameters
    ----------
    y_test : array-like of ground-truth labels.
    y_pred : array-like of predicted labels, same shape as ``y_test``.

    Returns
    -------
    float
        Number of mismatching cells divided by the total number of cells.
        Returns 0.0 when there are no cells to compare.
    """
    mismatches = np.asarray(y_test) != np.asarray(y_pred)
    # The denominator comes from the data itself, so the result does not
    # depend on any notebook-level variable holding the number of classes.
    if mismatches.size == 0:
        return 0.0
    return np.sum(mismatches) / mismatches.size

# Label-wise accuracy is taken as the complement of the Hamming loss.
hamming_loss = get_hamming_loss(y_test=y_test, y_pred=y_pred)
accuracy = 1 - hamming_loss

(hamming_loss, accuracy)

(0.2457, 0.7543)

We can note that the exact match is roughly **29%** whereas accuracy of identifying labels correctly is **75%**.

In [7]:
def calculate_precision(y_true, y_pred):
    """Precision for one label: true positives divided by predicted positives.

    Returns 0 when nothing is predicted positive.
    """
    predicted_positive = (y_pred == 1)
    hits = np.sum((y_true == y_pred) & predicted_positive)
    n_predicted = np.sum(predicted_positive)

    # Without any positive prediction the ratio is undefined; report 0.
    return hits / n_predicted if n_predicted != 0 else 0
    
def calculate_recall(y_true, y_pred):
    """Recall for one label: true positives divided by actual positives.

    Returns 0 when there are no actual positives.
    """
    hits = np.sum((y_true == y_pred) & (y_pred == 1))
    n_actual = np.sum(y_true == 1)

    # Without any actual positive the ratio is undefined; report 0.
    return hits / n_actual if n_actual != 0 else 0

In [8]:
def get_macro_metrics(y_true, y_pred, n_classes):
    """Macro-averaged precision and recall over the first ``n_classes`` label columns.

    Precision and recall are computed separately for each label column and
    then averaged with equal weight per label.

    Returns
    -------
    (precision, recall) : tuple of the two unweighted means.
    """
    per_label = [
        (
            calculate_precision(y_true[:, label_idx], y_pred[:, label_idx]),
            calculate_recall(y_true[:, label_idx], y_pred[:, label_idx]),
        )
        for label_idx in range(n_classes)
    ]
    precisions = [precision for precision, _ in per_label]
    recalls = [recall for _, recall in per_label]

    return np.mean(precisions), np.mean(recalls)

In [9]:
macro_prec, macro_rec = get_macro_metrics(y_true=y_test, y_pred=y_pred, n_classes=n_classes)

# Harmonic mean of the macro-averaged precision and recall.
# NOTE: scikit-learn's f1_score(average='macro') instead averages the
# per-label F1 scores, so its value can differ from this one.
macro_f1 = (2 * macro_prec * macro_rec) / (macro_prec + macro_rec)

(macro_prec, macro_rec, macro_f1)

(0.7448646326986749, 0.8154168705329793, 0.7785456489843088)

In [10]:
def get_confusion_matrix(y_true, y_pred):
    """Count true positives, true negatives, false positives and false negatives.

    A value of 1 is treated as positive; any other value is treated as negative.

    Returns
    -------
    (tp, tn, fp, fn) : tuple of counts.
    """
    agree = (y_true == y_pred)
    predicted_positive = (y_pred == 1)
    predicted_negative = (y_pred != 1)

    tp = np.sum(agree & predicted_positive)
    tn = np.sum(agree & predicted_negative)
    fp = np.sum((y_true != 1) & predicted_positive)
    fn = np.sum((y_true == 1) & predicted_negative)

    return tp, tn, fp, fn

However, in the case of multiple classes, there is no single positive or negative class. So, to calculate the micro measures, we look only at the correct and incorrect predictions.

In [11]:
def correct_incorrect_predictions(y_true, y_pred):
    """Return ``(n_correct, n_incorrect)``: how many entries match and how many differ."""
    n_correct = np.sum(y_true == y_pred)
    n_incorrect = np.sum(y_true != y_pred)
    return n_correct, n_incorrect

In [12]:
# Tally (correct, incorrect) predictions for each label column. The loop index
# has its own name so it is not overwritten by the returned correct count.
outcomes = np.array([
    correct_incorrect_predictions(y_test[:, label_idx], y_pred[:, label_idx])
    for label_idx in range(n_classes)
])

# Pool the per-label tallies into overall correct / incorrect totals.
C, INC = np.sum(outcomes, axis=0)
C, INC

(7543, 2457)

In [13]:
def get_micro_metrics(y_true, y_pred, n_classes):
    """Pooled ("micro") precision and recall over the first ``n_classes`` label columns.

    Parameters
    ----------
    y_true : 2-D array of ground-truth labels, one row per sample.
    y_pred : 2-D array of predicted labels, one row per sample.
    n_classes : number of label columns to pool.

    Returns
    -------
    (precision, recall)
        ``precision`` is the pooled count of matching cells divided by
        ``len(y_pred) * n_classes``; ``recall`` is the same count divided by
        ``len(y_true) * n_classes``. Either value is 0 when its denominator
        is 0.

    Notes
    -----
    Both values share the same numerator, so they are equal to each other
    whenever ``y_true`` and ``y_pred`` have the same number of rows.
    NOTE(review): the conventional micro-average pools true positives, false
    positives and false negatives instead -- confirm which definition is
    intended here.
    """
    # Count matching cells per label column from the *arguments* (not from any
    # notebook-level variable), then pool the counts across labels.
    correct = np.sum([
        np.sum(y_true[:, label_idx] == y_pred[:, label_idx])
        for label_idx in range(n_classes)
    ])

    actuals = len(y_true) * n_classes
    predictions = len(y_pred) * n_classes

    precision = correct / predictions if predictions != 0 else 0
    recall = correct / actuals if actuals != 0 else 0

    return precision, recall

In [14]:
micro_prec, micro_rec = get_micro_metrics(y_true=y_test, y_pred=y_pred, n_classes=n_classes)

# Harmonic mean of the micro precision and recall.
micro_f1 = (2 * micro_prec * micro_rec) / (micro_prec + micro_rec)

(micro_prec, micro_rec, micro_f1, accuracy)

(0.7543, 0.7543, 0.7543, 0.7543)

**We notice that micro measures (precision, recall and F1) are all equal to each other and to accuracy.**