In [None]:
import math
import numpy as np
import random
import rbo
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import dataframe_image as dfi
from scipy.stats import binom


In [None]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 & set2| / |set1 | set2| of two sets.

    Parameters
    ----------
    set1, set2 : set
        The sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets are treated as identical and
        yield 1.0 instead of raising ``ZeroDivisionError``.
    """
    union = set1.union(set2)
    if not union:
        # Both sets are empty: the ratio 0/0 is undefined, use the usual convention.
        return 1.0

    return len(set1.intersection(set2)) / len(union)

In [None]:
def generate_domains(n, jac_value):
    """Build two item domains whose overlap approximates a target Jaccard value.

    For two sets of size ``n`` sharing ``I`` items, J = I / (2n - I), hence
    I = 2nJ / (1 + J); the result is floored to an integer count.

    Parameters
    ----------
    n : int
        Number of items in each domain.
    jac_value : float
        Target Jaccard similarity between the two domains.

    Returns
    -------
    tuple[set, set]
        ``(set_a, set_b)``. Shared items are labelled ``'i<k>'``; items unique
        to the first/second domain are labelled ``'a<k>'`` / ``'b<k>'``.
    """
    shared_count = math.floor((jac_value * (2 * n)) / (1 + jac_value))
    # Each domain is topped up with its own unique items.
    unique_count = n - shared_count

    shared_items = {'i' + str(k) for k in range(shared_count)}

    # The union operator builds two independent set objects.
    set_a = shared_items | {'a' + str(k) for k in range(unique_count)}
    set_b = shared_items | {'b' + str(k) for k in range(unique_count)}

    return set_a, set_b

In [None]:
def agreement_probability(theta):
    """Create a depth-dependent agreement probability function.

    The returned function mixes an exponentially decaying term (weight
    ``theta``) with a uniform ``1 / n`` term (weight ``1 - theta``), so
    ``theta`` controls how top-weighted the agreement is.

    Parameters
    ----------
    theta : float
        Mixing weight of the exponential term.

    Returns
    -------
    callable
        ``f(depth, n)`` giving the agreement probability at 1-indexed
        ``depth`` for a domain of ``n`` items.
    """
    def probability_at_depth(depth, n):
        # Decay constant of the exponential term; it depends only on n.
        decay_scale = (0.3 * n) / np.log(n)

        # The exponential is evaluated on a 0-indexed depth.
        zero_based_depth = depth - 1
        top_weighted_part = np.exp(-1 * (zero_based_depth / decay_scale))

        flat_part = 1 / n

        return (theta * top_weighted_part) + ((1 - theta) * flat_part)

    return probability_at_depth

    

In [None]:
def add_ties(X, frac_ties, num_groups):
    """Randomly merge consecutive items of a ranking into tie groups.

    The ranking is scanned from the top. At each position a tie group is
    started with probability ``num_groups / n``; its length is drawn as
    ``2 + Poisson(average_group_size - 2)`` and is cut off at the end of the
    ranking. Because the process is random, the realised number of groups and
    tied items only approximates ``num_groups`` and ``frac_ties * n``.

    Parameters
    ----------
    X : sequence
        The ranking without ties.
    frac_ties : float
        Target fraction of items that should be part of a tie group.
    num_groups : int
        Target number of tie groups.

    Returns
    -------
    list
        The ranking in which every tie group is a nested list of consecutive
        items and untied items appear as themselves. ``X`` itself is returned
        unchanged when ``frac_ties == 0`` or ``X`` is empty.

    Raises
    ------
    ValueError
        If ``num_groups`` is not positive, or if there are fewer than two
        tied items available per requested group.
    """
    if frac_ties == 0:
        return X

    n = len(X)
    if n == 0:
        return X

    num_tied_items = math.floor(frac_ties * n)
    if num_groups <= 0:
        raise ValueError("num_groups must be positive when frac_ties > 0")
    if (num_tied_items / 2) < num_groups:
        # Every tie group needs at least two items.
        raise ValueError("Not enough tied items for the requested number of groups")

    average_group_size = math.floor(num_tied_items / num_groups)
    group_start_probability = num_groups / n

    X_with_ties = []
    i = 0
    while i < n:
        if np.random.uniform() < group_start_probability:
            # A tie needs at least two items; the Poisson part supplies the rest.
            tie_group_length = int(np.random.poisson(average_group_size - 2)) + 2
            # Slicing clips the group at the end of the ranking.
            X_with_ties.append(list(X[i:i + tie_group_length]))
            i += tie_group_length
        else:
            X_with_ties.append(X[i])
            i += 1

    return X_with_ties
    
    

In [None]:
def _draw_one(pool):
    """Return one random element of ``pool``; sorting first gives ``random.sample`` a deterministic sequence."""
    return random.sample(sorted(pool), 1)[0]


def simulate_rankings(n, len_x, len_y, overlap_probability_function, frac_ties_x=0, n_groups_x=0, frac_ties_y=0, n_groups_y=0, conjointness=1, return_truncated=True):
    '''
    Simulate two rankings S and L of n items each whose agreement varies with depth.

    At every depth an agreement probability is obtained from
    ``overlap_probability_function``. With that probability one of four
    agreement cases is chosen uniformly at random; otherwise both items are
    drawn freely from their remaining domains (case 0):

    - case 1: S takes an item that already occurs in L.
    - case 2: L takes an item that already occurs in S.
    - case 3: S and L take the same, not yet used, item.
    - case 4: S takes an item already in L and L takes an item already in S.

    If the domain required by a case is empty, the item is drawn from the
    whole remaining domain instead and the step is recorded as case 0.

    Parameters
    - n: number of items in each domain, and the length of the full rankings.
    - len_x, len_y: truncation lengths for S and L; only used when
      ``return_truncated`` is True. Truncation happens after ties are added,
      so a tie group counts as a single entry.
    - overlap_probability_function: callable invoked as ``f(depth=depth, n=n)``
      with the 1-indexed depth; returns the agreement probability.
    - frac_ties_x, n_groups_x, frac_ties_y, n_groups_y: forwarded to
      ``add_ties`` for S and L respectively.
    - conjointness: target Jaccard similarity passed to ``generate_domains``.
    - return_truncated: whether to truncate the returned sequences.

    Returns a tuple ``(S, L, agree_probs, case)``; when truncating, both
    ``agree_probs`` and ``case`` are cut to ``len_x``.
    '''

    # generate the two domains depending on the degree of conjointness
    a, b = generate_domains(n, conjointness)

    S = []
    L = []
    # Items placed so far, kept as sets so the per-depth intersections stay cheap.
    placed_in_S = set()
    placed_in_L = set()

    case = []
    agree_probs = []

    cases = [1, 2, 3, 4]

    for depth in range(1, n + 1):
        agree_prob = overlap_probability_function(depth=depth, n=n)
        agree_probs.append(agree_prob)

        u = np.random.uniform()

        item_S = None
        item_L = None

        if u < agree_prob:
            decision = random.sample(cases, 1)[0]

            if decision == 1:
                item_S_domain = a.intersection(placed_in_L)
                item_L_domain = b

            elif decision == 2:
                item_S_domain = a
                item_L_domain = b.intersection(placed_in_S)

            elif decision == 3:
                item_S_domain = a.intersection(b)
                item_L_domain = item_S_domain

                # draw once and reuse the item so S and L are guaranteed to match
                if len(item_S_domain) > 0:
                    item_S = _draw_one(item_S_domain)
                    item_L = item_S

            elif decision == 4:
                item_S_domain = a.intersection(placed_in_L)
                item_L_domain = b.intersection(placed_in_S)

            else:
                raise ValueError("Invalid decision")

        else:
            item_S_domain = a
            item_L_domain = b
            decision = 0

        if item_S is None:
            if len(item_S_domain) > 0:
                item_S = _draw_one(item_S_domain)
            else:
                # requested case cannot be satisfied: fall back to a free draw
                item_S = _draw_one(a)
                decision = 0

        if item_L is None:
            if len(item_L_domain) > 0:
                item_L = _draw_one(item_L_domain)
            else:
                # requested case cannot be satisfied: fall back to a free draw
                item_L = _draw_one(b)
                decision = 0

        case.append(decision)

        S.append(item_S)
        L.append(item_L)
        placed_in_S.add(item_S)
        placed_in_L.add(item_L)

        # sampling is without replacement within each ranking
        a.remove(item_S)
        b.remove(item_L)

    S_with_ties = add_ties(S, frac_ties_x, n_groups_x)
    L_with_ties = add_ties(L, frac_ties_y, n_groups_y)

    # after rankings have been made, truncate
    if return_truncated:
        return S_with_ties[:len_x], L_with_ties[:len_y], agree_probs[:len_x], case[:len_x]

    return S_with_ties, L_with_ties, agree_probs, case

In [None]:
# Example run on a fully conjoint domain of 500 items with theta=1.
# return_truncated=False returns the full rankings, so len_x/len_y are not applied.
S, L, agree_probs, case_list = simulate_rankings(
    n=500,
    len_x=40,
    len_y=40,
    overlap_probability_function=agreement_probability(theta=1),
    conjointness=1,
    return_truncated=False,
)

def get_formatted_dataframe(S, L, agreement_probabilities, case_list):
    data = {
            'S' : S,
            'L' : L,
            'agreement_probability' : agreement_probabilities,
            'case' : case_list
    }
    
    def highlight_equal_values(x):
        color = 'background-color: green'
        default = ''
        # Comparison mask
        case1 = x['case'] == 1
        case2 = x['case'] == 2
        case3 = x['case'] == 3
        case4 = x['case'] == 4 
    
        if case1:
            return ['background-color: red','background-color: white', 'background-color: white', 'background-color: white']
        if case2:
            return ['background-color: white','background-color: purple', 'background-color: white', 'background-color: white']
        if case3:
            return ['background-color: #00A6D6','background-color: #00A6D6', 'background-color: white', 'background-color: white']
        if case4:
            return ['background-color: red','background-color: purple', 'background-color: white', 'background-color: white']
        else:
            return ['background-color: white'] * len(x)
    
    
    df = pd.DataFrame(data, index =  np.arange(1, np.max([len(S), len(L)]) + 1))
    df.index.name ='Rank'
    
    return df.style.apply(highlight_equal_values, axis=1)

In [None]:
# Last expression of the cell, so the styled table is rendered as its output.
get_formatted_dataframe(
    S=S,
    L=L,
    agreement_probabilities=agree_probs,
    case_list=case_list,
)

Save image for the poster

In [None]:
# dfi.export(styled_df, 't0.5c1.png')

Find the average RBO scores

In [None]:
# theta_values = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

# average_rbo_ext = []

# for theta in theta_values:
#     rbo_exts = []
#     for _ in range(100):
#         S, L, _, _ = simulate_rankings(n=200, len_x = 60, len_y = 60, overlap_probability_function=agreement_probability(theta=theta), conjointness=1)
#         # S, L, _ = simulate_rankings(n=200, len_x = 60, len_y = 60, conjointness=1, theta=theta)
#         rbo_ext = rbo.RankingSimilarity(S, L).rbo_ext(p=0.95)
#         rbo_exts.append(rbo_ext)
#     average_rbo_ext.append(np.mean(rbo_exts))


In [None]:
# rbo_ext_df = pd.DataFrame(
#     {
        
#         '$RBO_\\text{ext}$' : average_rbo_ext
#     },
#     index = theta_values
# )

# rbo_ext_df.index.name = "$\\theta$"
# rbo_ext_df.T

In [None]:
# dfi.export(rbo_ext_df, 'rbo_ext_df.png')

In [None]:
def scaled_binomial_probability_function(p):
    """Create an agreement probability function shaped like a binomial pmf.

    The returned function evaluates the Binomial(n, p) pmf at ``depth`` and
    divides it by the pmf at the distribution's mode, so the largest value
    it can return is 1.

    Parameters
    ----------
    p : float
        Success probability of the binomial distribution.

    Returns
    -------
    callable
        ``f(depth, n)`` returning the rescaled pmf value.
    """
    def f(depth, n):
        # The mode of Binomial(n, p) is floor((n + 1) * p), capped at n for p == 1.
        # Using n * p directly is wrong whenever it is not an integer: the pmf is
        # 0 there, which made the scale factor infinite.
        mode = min(n, math.floor((n + 1) * p))
        peak = binom.pmf(mode, n, p)
        return binom.pmf(depth, n, p) / peak

    return f

In [None]:
# Run on 1000 items with the rescaled binomial agreement profile (p=0.6).
# return_truncated=False returns the full rankings, so len_x/len_y are not applied.
S, L, agree_probs, case_list = simulate_rankings(
    n=1000,
    len_x=0,
    len_y=10,
    overlap_probability_function=scaled_binomial_probability_function(0.6),
    conjointness=1,
    return_truncated=False,
)

In [None]:
# Last expression of the cell, so the styled table is rendered as its output.
get_formatted_dataframe(
    S=S,
    L=L,
    agreement_probabilities=agree_probs,
    case_list=case_list,
)