In [None]:
import math
import numpy as np
import random
import rbo
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import dataframe_image as dfi

In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2| of two sets.

    Parameters
    ----------
    set1, set2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets are treated as identical and
        yield 1.0 instead of raising ZeroDivisionError.
    """
    union = set1.union(set2)

    # An empty union means both sets are empty; avoid dividing by zero.
    if not union:
        return 1.0

    return len(set1.intersection(set2)) / len(union)

In [None]:
def generate_domains(a, b, jac_value):
    """Build two label sets of sizes ``a`` and ``b`` with a target Jaccard similarity.

    Shared items are labelled ``'i0', 'i1', ...``; items unique to the first
    set are ``'a0', 'a1', ...`` and items unique to the second are
    ``'b0', 'b1', ...``.

    Parameters
    ----------
    a, b : int
        Desired sizes of the first and second domain.
    jac_value : float
        Target Jaccard similarity between the two domains.

    Returns
    -------
    tuple[set, set]
        ``(set_a, set_b)`` with ``len(set_a) == a`` and ``len(set_b) == b``.
        The achieved similarity is the closest one reachable from below,
        because the intersection size is floored and cannot exceed the
        smaller domain.
    """
    # J = i / (a + b - i)  =>  i = J * (a + b) / (1 + J)
    intersection_size = math.floor((jac_value * (a + b)) / (1 + jac_value))

    # The intersection can never be larger than the smaller domain (nor
    # negative); without this clamp a domain could end up bigger than requested.
    intersection_size = max(0, min(intersection_size, a, b))

    shared = {'i' + str(i) for i in range(intersection_size)}

    set_a = shared | {'a' + str(i) for i in range(a - intersection_size)}
    set_b = shared | {'b' + str(i) for i in range(b - intersection_size)}

    return set_a, set_b

In [None]:
def agreement_probability(theta):
    """Build a depth-dependent agreement-probability function.

    The returned function mixes an exponentially decaying term with a
    uniform ``1 / n`` term; ``theta`` is the weight of the decaying term,
    so it controls the level of top-weightedness.

    Returns a callable ``f(depth, n)`` where ``depth`` is 1-indexed.
    """
    def f(depth, n):
        # Decay scale of the exponential term.
        scale = (0.3 * n) / np.log(n)

        # depth is 1-indexed while the decay is 0-indexed, hence depth - 1.
        decay = np.exp(-1 * ((depth - 1) / scale))

        return (theta * decay) + ((1 - theta) * (1 / n))

    return f

    

In [None]:
def simulate_rankings(n, len_x, len_y, overlap_probability_function, conjointness=1, return_truncated=True):
    '''
    Simulate two rankings S and L of length n, one rank at a time.

    - n: number of ranks to generate; both item domains are built with n items.
    - len_x, len_y: lengths S and L are truncated to when return_truncated is True.
    - overlap_probability_function: called as f(depth=depth, n=n) with the
      1-indexed depth; returns the probability of attempting an agreement
      at that depth.
    - conjointness: passed to generate_domains as the Jaccard value of the
      two item domains.
    - return_truncated: when True, S is cut to len_x, L to len_y, and the
      probability / case logs to len_x; when False everything is returned
      in full.

    Returns (S, L, agree_probs, case). The case log holds one code per depth:
      0 - both items drawn independently from their remaining domains
      1 - S receives an item that already appears in L
      2 - L receives an item that already appears in S
      3 - S and L receive the same not-yet-used item
    '''

    def draw(pool):
        # Sorting gives the sampler a sequence in a fixed order.
        return random.sample(sorted(pool), 1)[0]

    # Remaining (unused) items for S and for L respectively.
    remaining_s, remaining_l = generate_domains(n, n, conjointness)

    S, L = [], []
    case, agree_probs = [], []

    for depth in range(1, n + 1):
        p_agree = overlap_probability_function(depth = depth, n = n)
        agree_probs.append(p_agree)

        # Code 0 (independent draws) unless an agreement is produced below.
        code = 0
        pick_s = pick_l = None

        if np.random.uniform() < p_agree:
            # Each agreement type is tried one third of the time.
            choice = random.sample([1, 2, 3], 1)[0]

            if choice == 1:
                candidates = remaining_s.intersection(set(L))
            elif choice == 2:
                candidates = remaining_l.intersection(set(S))
            else:
                candidates = remaining_s.intersection(remaining_l)

            if len(candidates) > 0:
                shared = draw(candidates)
                code = choice

                if choice == 1:
                    pick_s, pick_l = shared, draw(remaining_l)
                elif choice == 2:
                    pick_s, pick_l = draw(remaining_s), shared
                else:
                    pick_s = pick_l = shared

        if code == 0:
            # Either no agreement was attempted or no candidate was available.
            pick_s = draw(remaining_s)
            pick_l = draw(remaining_l)

        S.append(pick_s)
        L.append(pick_l)
        case.append(code)

        # Sampling is without replacement within each ranking.
        remaining_s.remove(pick_s)
        remaining_l.remove(pick_l)

    if not return_truncated:
        return S, L, agree_probs, case

    return S[:len_x], L[:len_y], agree_probs[:len_x], case[:len_x]

In [None]:
# Demo run: full (untruncated) rankings over 500 items with theta = 0.
S, L, agree_probs, case_list = simulate_rankings(
    n=500,
    len_x=40,
    len_y=40,
    overlap_probability_function=agreement_probability(theta=0),
    conjointness=1,
    return_truncated=False,
)

# One table column per simulation output, in display order.
data = dict(
    zip(
        ('S', 'L', 'agreement_probability', 'case'),
        (S, L, agree_probs, case_list),
    )
)

def highlight_equal_values(x):
    """Row-wise styler: colour the ranking cells according to the row's case code.

    Parameters
    ----------
    x : pandas.Series
        One table row; must contain a ``'case'`` entry.

    Returns
    -------
    list[str]
        One CSS string per entry of ``x``, in the row's own order:
        case 1 colours ``'S'`` green, case 2 colours ``'L'`` green,
        case 3 colours both ``'S'`` and ``'L'`` blue (#00A6D6); every other
        cell, and every cell for any other case, is white.
    """
    white = 'background-color: white'
    green = 'background-color: green'
    blue = 'background-color: #00A6D6'

    # Styles are keyed by column label rather than position, so the result
    # stays correct if the columns are reordered or extended.
    highlights_by_case = {
        1: {'S': green},
        2: {'L': green},
        3: {'S': blue, 'L': blue},
    }

    overrides = highlights_by_case.get(x['case'], {})
    return [overrides.get(column, white) for column in x.index]


# Index the table by 1-based rank.
df = pd.DataFrame(
    data,
    index=pd.Index(np.arange(1, np.max([len(S), len(L)]) + 1), name='Rank'),
)

styled_df = df.style.apply(highlight_equal_values, axis=1)

print("RBO", rbo.RankingSimilarity(S, L).rbo(p=0.95))
styled_df

Save image for the poster

In [None]:
# dfi.export(styled_df, 't0.5c1.png')

Find the average RBO scores

In [None]:
# Average extrapolated RBO over repeated simulations, one average per theta.
theta_values = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

average_rbo_ext = []

for theta in theta_values:
    # The probability function depends only on theta, so build it once per theta.
    overlap_fn = agreement_probability(theta=theta)

    trial_scores = []
    for _trial in range(1000):
        # Separate names keep the S / L of the demo table above intact.
        ranking_s, ranking_l, _probs, _cases = simulate_rankings(
            n=200,
            len_x=60,
            len_y=60,
            overlap_probability_function=overlap_fn,
            conjointness=1,
        )
        trial_scores.append(rbo.RankingSimilarity(ranking_s, ranking_l).rbo_ext(p=0.95))

    average_rbo_ext.append(np.mean(trial_scores))


In [None]:
# Table of the average extrapolated RBO per theta.
# The column label is a raw string: in a normal literal the "\t" of "\text"
# is read as a TAB character, which corrupts the LaTeX label.
rbo_ext_df = pd.DataFrame(
    {
        r'$RBO_\text{ext}$' : average_rbo_ext
    },
    index = theta_values
)

rbo_ext_df.index.name = "$\\theta$"
rbo_ext_df.T

In [None]:
# Export the averages table to 'rbo_ext_df.png' via dataframe_image.
# Note: this passes rbo_ext_df itself, not the transposed view (rbo_ext_df.T).
dfi.export(rbo_ext_df, 'rbo_ext_df.png')

Finding $Q_d$ for completely independent rankings which are fully conjoint

In [None]:
def get_num_agreements(x, y):
    """Count the positions at which the sequences ``x`` and ``y`` hold equal items.

    Every index of ``x`` is compared against the same index of ``y``.
    """
    return sum(1 for position in range(len(x)) if x[position] == y[position])

In [None]:
# Two fully conjoint domains of 1000 items each, as shuffleable lists.
a, b = generate_domains(1000, 1000, 1)
A, B = list(a), list(b)

# Per trial: independently shuffle both rankings, then count the positions
# at which they agree.
Qs = []
for i in range(1000):
    for ranking in (A, B):
        random.shuffle(ranking)

    Qs.append(get_num_agreements(A, B))

In [None]:
# How often each agreement count occurred across the trials.
frequency = Counter(Qs)

# Parallel lists for plotting: observed values and their frequencies.
categories = [*frequency]
counts = [frequency[value] for value in categories]


In [None]:
# Bar chart of how often each agreement count occurred; labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(categories, counts, color='green')
ax.set(
    xlabel='Number of agreeing positions',
    ylabel='Number of trials',
    title='Agreements between two independently shuffled rankings',
)
plt.show()

Example of a very top-weighted pair of rankings:

S: i1 i2 i3 i4 i5 i8 i9 i10
<br>
L: i1 i2 i3 i4 i5 i9 i10 i8


In [None]:
# Scratch check: random.sample(population, 1) returns a one-element list,
# which is why simulate_rankings indexes the result with [0].
random.sample([1, 2, 3], 1)