# Competition Metric - Kendall Tau Correlation

https://www.kaggle.com/code/ryanholbrook/competition-metric-kendall-tau-correlation/notebook



In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk

First, we need to count the inversions in our solution: the number of pairs of cells whose predicted order disagrees with their true order.

In [1]:
from bisect import bisect


def count_inversions(a):
    """Count the pairs of elements in ``a`` that are out of ascending order.

    Positions ``p < q`` form an inversion when ``a[p] > a[q]``; equal values
    do not count as inverted.

    The elements seen so far are kept in a sorted list. Finding the insertion
    point is a binary search, but ``list.insert`` shifts elements, so the
    worst case is quadratic in the length of ``a``.

    Args:
        a: Iterable of mutually comparable values.

    Returns:
        The number of inversions as an ``int``.
    """
    seen_sorted = []
    total = 0
    for value in a:
        # ``bisect`` returns the slot just past any equal values, so everything
        # at or after that slot is strictly greater than ``value`` and appeared
        # earlier in ``a``: each such element is one inversion.
        slot = bisect(seen_sorted, value)
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total

Here is the formula, where $S_i$ is the number of inversions in the predicted ranks and $n_i$ is the number of cells for notebook $i$:

$K = 1 - 4 \frac{\sum_i S_i}{\sum_i n_i (n_i - 1)}$

In [2]:
def _ranks_in_ground_truth(gt, pred):
    """Return, for each item of ``pred``, the index of its first occurrence in ``gt``."""
    try:
        first_position = {}
        for position, item in enumerate(gt):
            # setdefault keeps the first occurrence, matching list.index.
            first_position.setdefault(item, position)
        return [first_position[item] for item in pred]
    except KeyError as missing:
        # Preserve the exception type that list.index raises for unknown items.
        raise ValueError("{!r} is not in list".format(missing.args[0])) from None
    except TypeError:
        # Unhashable items cannot be dict keys; fall back to a linear search.
        return [gt.index(item) for item in pred]


def kendall_tau(ground_truth, predictions):
    """Compute the Kendall tau correlation pooled over all instances.

    Implements ``1 - 4 * sum(S_i) / sum(n_i * (n_i - 1))`` where ``S_i`` is the
    number of inversions in the predicted order of instance ``i`` and ``n_i``
    is the length of its ground-truth order. Inversions are counted with
    ``count_inversions``, defined elsewhere in this notebook.

    Args:
        ground_truth: Iterable of sequences, each giving the true order of
            one instance's items.
        predictions: Iterable of sequences, paired positionally with
            ``ground_truth``, each giving the predicted order of the same items.

    Returns:
        A float. When every prediction is a permutation of its ground truth
        the value lies in ``[-1, 1]``: 1.0 for identical orderings and -1.0
        for fully reversed ones. Returns NaN when no instance has at least
        two items, because no pair exists to compare and the statistic is
        undefined.

    Raises:
        ValueError: If a predicted item does not occur in its ground truth.
    """
    total_inversions = 0  # total inversions in predicted ranks across all instances
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Express the predicted order as positions in the ground-truth order.
        ranks = _ranks_in_ground_truth(gt, pred)
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    if total_2max == 0:
        # No comparable pairs anywhere: avoid dividing by zero.
        return float("nan")
    return 1 - 4 * total_inversions / total_2max