In [1]:
import sys
import os

import numpy as np
import pandas as pd
import random
import math
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from simulate import simulate_rankings
from standard_functions import *
from scipy.stats import linregress

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'sigir2024-rbo', 'rbo', 'Python')))

from rbo import rbo

import matplotlib.pyplot as plt
import dataframe_image as dfi

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def get_formatted_dataframe(S, L, agreement_probabilities, case_list):
    data = {
            'S' : S,
            'L' : L,
            'overlap_probability' : agreement_probabilities,
            'case' : case_list
    }
    
    def highlight_equal_values(x):
        color = 'background-color: green'
        default = ''
        case1 = x['case'] == 1
        case2 = x['case'] == 2
        case3 = x['case'] == 3
        case4 = x['case'] == 4 
    
        if case1:
            return ['background-color: red','background-color: white', 'background-color: white', 'background-color: white']
        if case2:
            return ['background-color: white','background-color: purple', 'background-color: white', 'background-color: white']
        if case3:
            return ['background-color: #00A6D6','background-color: #00A6D6', 'background-color: white', 'background-color: white']
        if case4:
            return ['background-color: red','background-color: purple', 'background-color: white', 'background-color: white']
        else:
            return ['background-color: white'] * len(x)
    
    
    df = pd.DataFrame(data, index =  np.arange(1, np.max([len(S), len(L)]) + 1))
    df.index.name ='Depth'
    
    return df.style.apply(highlight_equal_values, axis=1)

In [14]:
def get_tied_dataframe(S, L, agreement_probability, case_list):

    S_flat = []
    L_flat = []
    S_colours = []
    L_colours = []

    def generate_random_colour():
        r = random.randint(0, 255)
        g = random.randint(0, 255)
        b = random.randint(0, 255)
        return f'#{r:02x}{g:02x}{b:02x}'
        

    for s in S:
        if isinstance(s, set) and len(s)>1:
            colour = generate_random_colour()
            for tied_item in s:
                S_flat.append(tied_item)
                S_colours.append(colour)
        else:
            S_flat.append(s)
            S_colours.append("white")

    for l in L:
        if isinstance(l, set) and len(l)>1:
            colour = generate_random_colour()
            for tied_item in l:
                L_flat.append(tied_item)
                L_colours.append(colour)
        else:
            L_flat.append(l)
            L_colours.append("white")
    
    data = {
            'S' : S_flat,
            'S_colour' : S_colours,
            'L' : L_flat,
            'L_colour' : L_colours,
            'overlap_probability' : agreement_probability,
            'case' : case_list
    }

    def colour_tied_items(x):
        return ['background-color: ' + x['S_colour'], 'background-color: white', 'background-color: ' + x['L_colour'], 'background-color: white', 'background-color: white', 'background-color: white']
        

    df = pd.DataFrame(data, index =  np.arange(1, np.max([len(S_flat), len(L_flat)]) + 1))
    df.index.name ='Depth'

    show = ['S', 'L', 'overlap_probability', 'case']


    styled_df = df.style.apply(colour_tied_items, axis=1)
    styled_df.hide([col for col in df.columns if col not in show], axis=1)
    return styled_df

## Examples

In [15]:
# Example: rankings simulated with an exponential-decay overlap function.
S, L, agree_probs, case_list = simulate_rankings(
    a=30,
    b=50,
    len_x=15,
    len_y=15,
    overlap_probability_function=exponential_decay(theta=1),
    conjointness=0.4,
    truncate_rankings=True,
)

# RBO of the simulated pair (computed here but not displayed in this cell).
rbo_stats = rbo(S, L, p=0.95)

# Last expression: the colour-coded table is rendered as the cell output.
df = get_formatted_dataframe(S, L, agree_probs, case_list)
df

Unnamed: 0_level_0,S,L,overlap_probability,case
Depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,i7,b14,1.0,0
2,i11,b13,0.895167,0
3,a3,i7,0.801325,0
4,i0,i0,0.71732,3
5,i4,i13,0.642121,0
6,a2,b2,0.574806,0
7,i13,i4,0.514548,4
8,i5,b5,0.460606,0
9,a4,i5,0.41232,0
10,i15,i1,0.369095,0


In [None]:
# Save the styled table `df` from the preceding cell as a PNG via dataframe_image.
dfi.export(df, 'exp_decay_rankings.png')

In [5]:
# Example: no overlap_probability_function is passed, so simulate_rankings
# falls back to its own default for that argument.
S, L, agree_probs, case_list = simulate_rankings(
    a=50,
    b=50,
    len_x=15,
    len_y=15,
    conjointness=1,
    truncate_rankings=True,
)

# Last expression: the colour-coded table is rendered as the cell output.
df = get_formatted_dataframe(S, L, agree_probs, case_list)
df

Unnamed: 0_level_0,S,L,overlap_probability,case
Depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,i47,i8,0,0
2,i25,i2,0,0
3,i43,i16,0,0
4,i48,i26,0,0
5,i13,i13,0,0
6,i2,i3,0,0
7,i27,i23,0,0
8,i41,i24,0,0
9,i15,i32,0,0
10,i33,i29,0,0


In [None]:
# Save the styled table `df` from the preceding cell as a PNG via dataframe_image.
dfi.export(df, 'no_overlap_function_rankings.png')

In [24]:
# Example: rankings of 10 items with ties; tie probabilities are uniform
# (1/10 at every position) for both rankings.
S, L, agree_probs, case_list = simulate_rankings(
    a=10,
    b=10,
    len_x=0,
    len_y=0,
    conjointness=1,
    frac_ties_x=0.6,
    n_groups_x=2,
    frac_ties_y=0.3,
    n_groups_y=1,
    tie_probabilities_x=np.ones(10)/10,
    tie_probabilities_y=np.ones(10)/10,
    truncate_rankings=False,
)

# Last expression: the tie-coloured table is rendered as the cell output.
df = get_tied_dataframe(S, L, agree_probs, case_list)
df

Unnamed: 0_level_0,S,L,overlap_probability,case
Depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,i6,i2,0,0
2,i9,i7,0,0
3,i8,i4,0,0
4,i3,i9,0,0
5,i5,i6,0,0
6,i0,i8,0,0
7,i1,i0,0,0
8,i7,i3,0,0
9,i4,i1,0,0
10,i2,i5,0,0


In [25]:
# Save the tie-coloured table `df` from the preceding cell as a PNG via dataframe_image.
dfi.export(df, 'uniform_ties_rankings.png')

## Exponential Decay, $\theta = 0.5$ vs $\theta = 1.0$

In [6]:
def agreements_empirical(f, N, print_progress=False, n_trials=100):
    """Simulate ranking pairs and record the agreement at every depth.

    For each trial a pair of rankings ``S`` and ``L`` is drawn with
    ``simulate_rankings`` and, for every depth ``d`` of ``S``, the agreement
    ``|set(S[:d]) & set(L[:d])| / d`` is stored.

    Parameters
    ----------
    f : callable
        Passed to ``simulate_rankings`` as ``overlap_probability_function``.
    N : int
        Passed as both ``a`` and ``b``; also the number of depth columns.
    print_progress : bool, optional
        Print the trial index before each simulation.
    n_trials : int, optional
        Number of simulated ranking pairs (default 100).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_trials, N)``; entry ``[t, d-1]`` is the agreement
        at depth ``d`` in trial ``t``. Columns beyond ``len(S)`` stay 0.
    """
    # Zero-initialised so no uninitialised memory can leak into the averages.
    agreement_average = np.zeros((n_trials, N))

    for trial in range(n_trials):
        if print_progress:
            print("trial", trial)

        S, L, _, _ = simulate_rankings(a=N, b=N, len_x = 0, len_y = 0, overlap_probability_function=f, conjointness=1, truncate_rankings=False)

        # Grow both prefix sets one item at a time and keep a running count
        # of the intersection instead of rebuilding the sets at every depth.
        seen_S = set()
        seen_L = set()
        overlap = 0
        n_L = len(L)
        for depth, s_item in enumerate(S, start=1):
            if s_item not in seen_S:
                seen_S.add(s_item)
                if s_item in seen_L:
                    overlap += 1
            if depth <= n_L:
                l_item = L[depth - 1]
                if l_item not in seen_L:
                    seen_L.add(l_item)
                    if l_item in seen_S:
                        overlap += 1
            agreement_average[trial][depth - 1] = overlap / depth

    return agreement_average

In [7]:
def agreements_analytical(f, N):
    """Analytical estimate of the agreement at each depth 1..N.

    A running expected-overlap total is accumulated depth by depth; at each
    step it grows by an "artificial" term weighted by ``f(d-1, N)`` and a
    "natural" term weighted by ``1 - f(d-1, N)``. The total at each depth is
    then divided by that depth.

    Parameters
    ----------
    f : callable
        Called as ``f(d - 1, N)``; its value is used as a probability.
    N : int
        Number of depths.

    Returns
    -------
    list
        ``N`` agreement estimates; the first entry is always 0.
    """
    cumulative = np.zeros(N)

    for depth in range(1, N):
        # Contribution weighted by the overlap-function probability.
        p_artificial = f(depth - 1, N)
        bracket = (
            ((0.5 * (depth / N)) + 0.5)
            + (0.25 * (1 - np.exp(-N * (1 - (depth / N)) ** 2)))
            + (0.5)
            + ((1 - p_artificial) * (2 * depth) / N)
        )
        artificial_gain = p_artificial * bracket

        # Contribution weighted by the complementary probability.
        p_natural = 1 - f(depth - 1, N)
        natural_gain = p_natural * ((2 * depth) / N)

        cumulative[depth] = cumulative[depth - 1] + artificial_gain + natural_gain

    return [total / depth for depth, total in enumerate(cumulative, start=1)]


In [None]:
# Empirical per-trial agreement curves for the zero() overlap function, N = 1000.
# NOTE(review): `zero` is not defined in this notebook — presumably it comes
# from the star import of standard_functions; confirm.
ae = agreements_empirical(zero(), 1000)

In [None]:
# Analytical agreement estimate for the same zero() overlap function, N = 1000.
aa = agreements_analytical(zero(), 1000)

In [None]:
# x-axis: depths 1..1000; reference curve: agreement d/n.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
ax.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
# ax.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
ax.plot(depths, ys, 'b-', label='Uniformly Random, $d/n$')
ax.legend()
ax.set_xlabel('Depth, $d$')
ax.set_ylabel('Agreement, $A_d$')
ax.set_title('No Overlap Probability Function, $n = 1000$')
fig.savefig('no_overlap_function.png')
plt.show()

In [None]:
# Empirical agreement curves for exponential decay with theta = 0.5 and theta = 1.0
# (N = 1000 each); both are plotted against the d/n reference in the next cell.
ae_half = agreements_empirical(exponential_decay(theta=0.5), 1000)
ae_one = agreements_empirical(exponential_decay(theta=1.0), 1000)

In [None]:
# x-axis: depths 1..1000; reference curve: agreement d/n.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
ax.plot(depths, ae_half.mean(axis=0), 'r*', label=r'Average Empirical Agreement, $\theta = 0.5$')
ax.plot(depths, ae_one.mean(axis=0), 'y>', label=r'Average Empirical Agreement, $\theta = 1.0$')
ax.plot(depths, ys, 'b-', label='Uniformly Random')
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Agreement')
ax.set_title('Exponential Decay Overlap Probability Function, $n = 1000$')
fig.savefig('exponential_decay_probability_function.png')
plt.show()

## RBO for varying theta

In [None]:
# Extrapolated RBO for rankings generated with different decay rates theta.
thetas = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

# One row per theta, one column per repetition. Zero-initialised and sized
# from `thetas` so the buffer cannot hold uninitialised memory or fall out of
# step with the list above.
raw_data = np.zeros((len(thetas), 100))

for i, theta in enumerate(thetas):
    print("On theta:", theta)
    for j in range(raw_data.shape[1]):
        S, L, _, _ = simulate_rankings(a=1000, b=1000, len_x = 1000, len_y = 1000, overlap_probability_function=exponential_decay(theta=theta), conjointness=1,
                                                    truncate_rankings=False)
        rbo_stats = rbo(S, L, p = 0.95)
        # 'ext' is the value plotted as RBO_ext in the following cell.
        rbo_ext = rbo_stats['ext']

        raw_data[i][j] = rbo_ext

In [None]:
# Per-theta mean and standard deviation over the repetitions (columns).
rbo_mean = raw_data.mean(axis=1)
rbo_sd = raw_data.std(axis=1)

data_dict = {
    "theta" : thetas,
    "mean": rbo_mean,
    "standard_deviation": rbo_sd
}
df = pd.DataFrame(data_dict)

fig, ax = plt.subplots()
ax.errorbar(thetas, rbo_mean, yerr=rbo_sd, fmt='o', ecolor='r', capsize=5, label=r'Mean $\text{RBO}_{\text{ext}}$ with SD')

ax.set_title(r'Extrapolated RBO of Rankings Generated from Varying $\theta$')
ax.set_xlabel(r'$\theta$')
ax.set_ylabel(r'Mean $\text{RBO}_{\text{ext}}$')

ax.legend()

fig.savefig('varying_theta_rbo_ext.png')

plt.show()


In [None]:
# Linear regression of the per-theta mean RBO_ext on theta (one point per theta).
slope, intercept, r_value, p_value, std_err = linregress(thetas, raw_data.mean(axis=1))

# Last expression: the p-value of the fit is shown as the cell output.
p_value

## Gaussian Empirical vs Analytical

In [None]:
# Empirical and analytical agreement for a Gaussian overlap function
# (mean=200, std_dev=30), N = 1000. Overwrites the earlier `ae` / `aa`.
ae = agreements_empirical(gaussian_distribution(mean=200, std_dev=30), 1000)
aa = agreements_analytical(gaussian_distribution(mean=200, std_dev=30), 1000)

In [None]:
# x-axis: depths 1..1000; reference curve: agreement d/n.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
ax.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
ax.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
ax.plot(depths, ys, 'b-', label='Uniformly Random')
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Agreement')
ax.set_title('N(200, 30) Overlap Probability Function')
fig.savefig('normal_overlap_probability_function.png')
plt.show()

Analytical solution to expected overlap/agreement

In [None]:
# Empirical and analytical agreement for exponential decay with theta = 0.6,
# N = 1000. Overwrites the earlier `ae` / `aa`.
ae = agreements_empirical(exponential_decay(theta=0.6), 1000)
aa = agreements_analytical(exponential_decay(theta=0.6), 1000)

In [None]:
# x-axis: depths 1..1000; reference curve: agreement d/n.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
ax.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
ax.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
ax.plot(depths, ys, 'b-', label='Uniformly Random')
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Agreement')
ax.set_title('Exponential Decay Overlap Probability Function')
plt.show()

In [None]:
# Analytical estimate only; the empirical run below is disabled.
# NOTE(review): the arguments here are (200, 9), while the disabled empirical
# call and the title of the following plot use (200, 30) — confirm which is intended.
aa = agreements_analytical(gaussian_distribution(200, 9), 1000)
#ae = agreements_empirical(gaussian_distribution(200, 30), 1000)

In [None]:
# x-axis: depths 1..1000; reference curve: agreement d/n.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
# ax.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
ax.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
ax.plot(depths, ys, 'b-', label='True')
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Agreement')
# NOTE(review): the title says N(200, 30), but `aa` in the preceding cell is
# computed with gaussian_distribution(200, 9) — confirm which is intended.
ax.set_title('N(200, 30) Overlap Probability Function')
plt.show()

In [None]:
# qualitative

## Ties Analysis

In [None]:
# Count how many tied items fall into each section of ~10 consecutive depths,
# over 100 simulated rankings, with uniform tie probabilities.

n = 2000

# uniform

probabilities_x = np.ones(n) / n
probabilities_y = np.ones(n) / n

# Rows: depth sections, columns: simulation runs.
# NOTE(review): floor(depth / 10) with depth starting at 1 makes section 0
# cover depths 1-9 and section 200 cover only depth 2000; the 201-row shape
# is kept because the plotting cell relies on it.
tie_frequency = np.zeros((201, 100))

for k in range(100):
    print("On iteration ", k)
    S, _, _, _ = simulate_rankings(a=n, b=n, len_x = 0, len_y = 0, overlap_probability_function=exponential_decay(theta=1),
                                                               tie_probabilities_x=probabilities_x, conjointness=1,
                                                     truncate_rankings=False, frac_ties_x=0.6, n_groups_x=40)

    depth = 1

    X = S

    for entry in X:
        # A tie group may be a list or a set (get_tied_dataframe in this
        # notebook checks for sets), so accept both representations.
        if isinstance(entry, (list, set, frozenset)):
            for _tied_item in entry:
                depth_group = math.floor(depth / 10)
                tie_frequency[depth_group][k] += 1
                depth += 1
        else:
            depth += 1

In [None]:
# One bar per ranking section (rows of tie_frequency), averaged over the runs.
bins = np.arange(1, 202)

fig, ax = plt.subplots()
ax.set_xlabel('Ranking Section')
ax.set_ylabel('Number of tied items')
ax.set_title('Number of Tied Items in Each Section - Uniform Probability')
ax.bar(bins, tie_frequency.mean(axis = 1))
fig.savefig('tied_items_bar_uniform.png')
plt.show()

In [None]:
# Count how many tied items fall into each section of ~10 consecutive depths,
# over 100 simulated rankings, with tie probabilities following a normal
# density centred at 0.75 * n with scale n / 4.

n = 2000

# NOTE(review): `norm` is not imported explicitly in this notebook —
# presumably it comes from the star import of standard_functions; confirm.
prob_dist_x = norm(loc=(0.75 * n), scale=n/4)

depths = np.arange(1, n+1)
probabilities_x = prob_dist_x.pdf(depths)

# Normalise the densities so they sum to 1 over the n depths.
probabilities_x = np.abs(probabilities_x)
probabilities_x /= probabilities_x.sum()


# Rows: depth sections, columns: simulation runs.
# NOTE(review): floor(depth / 10) with depth starting at 1 makes section 0
# cover depths 1-9 and section 200 cover only depth 2000; the 201-row shape
# is kept because the plotting cell relies on it.
tie_frequency = np.zeros((201, 100))

for k in range(100):
    print("On iteration ", k)
    S, _, _, _ = simulate_rankings(a=n, b=n, len_x = 0, len_y = 0, overlap_probability_function=exponential_decay(theta=1),
                                                               tie_probabilities_x=probabilities_x, conjointness=1,
                                                     truncate_rankings=False, frac_ties_x=0.6, n_groups_x=40)

    depth = 1

    X = S

    for entry in X:
        # A tie group may be a list or a set (get_tied_dataframe in this
        # notebook checks for sets), so accept both representations.
        if isinstance(entry, (list, set, frozenset)):
            for _tied_item in entry:
                depth_group = math.floor(depth / 10)
                tie_frequency[depth_group][k] += 1
                depth += 1
        else:
            depth += 1

In [None]:
# One bar per ranking section (rows of tie_frequency), averaged over the runs.
bins = np.arange(1, 202)

fig, ax = plt.subplots()
ax.set_xlabel('Ranking Section')
ax.set_ylabel('Number of tied items')
ax.set_title(r'Number of Tied Items in Each Section - N(1500, 500)')
ax.bar(bins, tie_frequency.mean(axis = 1))
fig.savefig('tied_items_bar_normal.png')
plt.show()

In [None]:
# Small tied example: 100 items, ties only configured for the first ranking
# (uniform tie probabilities, 10 groups, frac_ties_x = 0.6).
S, L, _, _ = simulate_rankings(
    a=100,
    b=100,
    len_x=0,
    len_y=0,
    overlap_probability_function=exponential_decay(theta=1),
    tie_probabilities_x=np.ones(100)/100,
    conjointness=1,
    truncate_rankings=False,
    frac_ties_x=0.6,
    n_groups_x=10,
)

In [None]:
# RBO of the tied pair with p = 0.95; the result is shown as the cell output.
# `ties='b'` selects a tie-handling variant of the imported rbo package —
# see that package for its exact definition.
rbo(S, L, ties='b',p=0.95)

In [None]:
# Display the simulated ranking S to inspect how tie groups are represented.
S