# Evaluation #
### O(nlogn) solution, based on the [notebook provided](https://www.kaggle.com/code/ryanholbrook/competition-metric-kendall-tau-correlation)

In [None]:
def count_inversions_slowly(ranks):
    """Count the inversions in *ranks* by comparing every pair.

    An inversion is a pair of positions ``i < j`` with
    ``ranks[i] > ranks[j]``; equal items never form an inversion.
    All pairs are examined, so the running time is quadratic in
    ``len(ranks)``. Useful as a reference to check faster versions.

    Args:
        ranks: An indexable sequence of mutually comparable items.

    Returns:
        The number of inverted pairs (0 for empty or one-item input).
    """
    inversions = 0
    size = len(ranks)
    for i in range(size):
        for j in range(i + 1, size):
            if ranks[i] > ranks[j]:
                # The accumulator must be the same name that is
                # initialised above and returned below.
                inversions += 1
    return inversions

The next implementation is much faster in practice, though its worst case is still \\(O(n^2)\\). (You might enjoy reviewing other inversion counting algorithms from [this StackOverflow post](https://stackoverflow.com/a/47845960).)

In [None]:
from bisect import bisect
# Actually O(N^2), but fast in practice for our data
def count_inversions2(a):
    """Count inversions in *a* using a sorted prefix and binary search.

    Items are consumed left to right. For each one, the items already
    seen that are strictly greater than it are exactly those to the right
    of its bisect (right-most) insertion point in the sorted prefix.

    Args:
        a: An iterable of mutually comparable items.

    Returns:
        The number of pairs ``i < j`` with ``a[i] > a[j]``.
    """
    prefix = []  # every item seen so far, kept in ascending order
    found = 0
    for value in a:
        # Binary search is O(log N) ...
        slot = bisect(prefix, value)
        # Everything at or after `slot` is greater than `value`.
        found += len(prefix) - slot
        # ... but the list insert shifts elements, which is O(N).
        prefix.insert(slot, value)
    return found

An \\(O(n \log n)\\) algorithm based on a Fenwick (binary indexed) tree, taken from this [SO answer](https://stackoverflow.com/a/23201616/14301931):

In [None]:
def count_inversions(a):
    """Count inversions in *a* with a Fenwick (binary indexed) tree.

    The sequence is scanned from right to left. For each item, the tree
    is queried for how many already-visited items (those to its right)
    have a strictly smaller rank, then the item's own rank is recorded.

    Args:
        a: A sequence (needs ``len`` and ``reversed``) of hashable,
            mutually comparable items.

    Returns:
        The number of pairs ``i < j`` with ``a[i] > a[j]``.
    """
    size = len(a)
    tree = [0] * (size + 1)  # 1-based Fenwick array; slot 0 is unused

    # 1-based rank of each value in sorted order. For repeated values the
    # last assignment wins, so equal items share one rank and are never
    # counted as an inversion against each other.
    rank_of = {}
    for position, value in enumerate(sorted(a), start=1):
        rank_of[value] = position

    inversions = 0
    for value in reversed(a):
        own_rank = rank_of[value]

        # Prefix sum over ranks 1 .. own_rank - 1: smaller items seen so far.
        node = own_rank - 1
        while node > 0:
            inversions += tree[node]
            node -= node & -node  # drop the lowest set bit

        # Record this item at its rank.
        node = own_rank
        while node <= size:
            tree[node] += 1
            node += node & -node  # move to the next covering node
    return inversions

In [None]:
from numpy.random import randint
# Benchmark input: 10**6 random integers produced by randint(0, 100, ...)
# (NumPy's upper bound is exclusive), so the data contains many duplicates.
l = randint(0,100,10**6)
l  # bare expression so the notebook displays the generated array

In [None]:
%%time
print(count_inversions(list(l)))

In [None]:
%%time
print(count_inversions2(list(l)))

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
# import timeit
# import math
# import random
# ns = np.linspace(10, 100000, 50, dtype=int)
# ts = [timeit.timeit('count_inversions(list(randint(0,100,{})))'.format(n),'from __main__ import count_inversions\nfrom numpy.random import randint', number=100)
#       for n in ns]
# ts2 = [timeit.timeit('count_inversions2(list(randint(0,100,{})))'.format(n),'from __main__ import count_inversions2\nfrom numpy.random import randint', number=100)
#       for n in ns]
# plt.plot(ns, ts, 'or')
# plt.plot(ns, ts2, 'ob')

To compute the Kendall tau correlation, we sum the inversions across all predictions, sum the worst-case number of inversions across all predictions, and apply the following formula:
\\[K = 1 - 4 \frac{\sum_i S_{i}}{\sum_i n_i(n_i - 1)}\\]
where \\(S_i\\) is the number of inversions in the predicted ranks and \\(n_i\\) is the number of cells for notebook \\(i\\).

In [None]:
def kendall_tau(ground_truth, predictions):
    """Compute the pooled Kendall tau correlation over many orderings.

    For every (ground truth, prediction) pair, the predicted order is
    translated into ground-truth positions and its inversions are counted.
    The result is ``1 - 4 * sum(S_i) / sum(n_i * (n_i - 1))`` where ``S_i``
    is the inversion count and ``n_i`` the ground-truth length of pair
    ``i``.

    Args:
        ground_truth: Iterable of sequences giving the correct order.
        predictions: Iterable of sequences giving the predicted order,
            aligned with ``ground_truth``. Items must be hashable.

    Returns:
        The correlation as a float.

    Raises:
        ValueError: If a predicted item does not occur in its ground truth.
        ZeroDivisionError: If no ground-truth sequence has two or more
            items, since the denominator is then zero.
    """
    total_inversions = 0  # inversions in predicted ranks, summed over pairs
    total_2max = 0  # twice the maximum possible inversions, summed over pairs
    for gt, pred in zip(ground_truth, predictions):
        # Build the position lookup once per pair. Calling gt.index(x) for
        # every predicted item is a linear scan each time, i.e. quadratic
        # per pair. setdefault keeps the first occurrence, as list.index does.
        position = {}
        for idx, item in enumerate(gt):
            position.setdefault(item, idx)
        try:
            # Rank the predicted order in terms of the ground truth.
            ranks = [position[x] for x in pred]
        except KeyError as err:
            # Preserve the exception type list.index raised for unknown items.
            raise ValueError("{!r} is not in list".format(err.args[0])) from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max