In [None]:
import sys
import os

import numpy as np
import pandas as pd
import random
import math
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from simulate import simulate_rankings
from standard_distributions import *

import matplotlib.pyplot as plt

In [None]:
def get_formatted_dataframe(S, L, agreement_probabilities, case_list):
    data = {
            'S' : S,
            'L' : L,
            'overlap_probability' : agreement_probabilities,
            'case' : case_list
    }
    
    def highlight_equal_values(x):
        color = 'background-color: green'
        default = ''
        case1 = x['case'] == 1
        case2 = x['case'] == 2
        case3 = x['case'] == 3
        case4 = x['case'] == 4 
    
        if case1:
            return ['background-color: red','background-color: white', 'background-color: white', 'background-color: white']
        if case2:
            return ['background-color: white','background-color: purple', 'background-color: white', 'background-color: white']
        if case3:
            return ['background-color: #00A6D6','background-color: #00A6D6', 'background-color: white', 'background-color: white']
        if case4:
            return ['background-color: red','background-color: purple', 'background-color: white', 'background-color: white']
        else:
            return ['background-color: white'] * len(x)
    
    
    df = pd.DataFrame(data, index =  np.arange(1, np.max([len(S), len(L)]) + 1))
    df.index.name ='Rank'
    
    return df.style.apply(highlight_equal_values, axis=1)

In [None]:
def get_tied_dataframe(S, L, agreement_probability, case_list):

    S_flat = []
    L_flat = []
    S_colours = []
    L_colours = []

    def generate_random_colour():
        r = random.randint(0, 255)
        g = random.randint(0, 255)
        b = random.randint(0, 255)
        return f'#{r:02x}{g:02x}{b:02x}'
        

    for s in S:
        if isinstance(s, list) and len(s)>1:
            colour = generate_random_colour()
            for tied_item in s:
                S_flat.append(tied_item)
                S_colours.append(colour)
        else:
            S_flat.append(s)
            S_colours.append("white")

    for l in L:
        if isinstance(l, list) and len(l)>1:
            colour = generate_random_colour()
            for tied_item in l:
                L_flat.append(tied_item)
                L_colours.append(colour)
        else:
            L_flat.append(l)
            L_colours.append("white")
    
    data = {
            'S' : S_flat,
            'S_colour' : S_colours,
            'L' : L_flat,
            'L_colour' : L_colours,
            'overlap_probability' : agreement_probability,
            'case' : case_list
    }

    def colour_tied_items(x):
        return ['background-color: ' + x['S_colour'], 'background-color: white', 'background-color: ' + x['L_colour'], 'background-color: white', 'background-color: white', 'background-color: white']
        

    df = pd.DataFrame(data, index =  np.arange(1, np.max([len(S_flat), len(L_flat)]) + 1))
    df.index.name ='$d$'

    show = ['S', 'L', 'overlap_probability', 'case']


    styled_df = df.style.apply(colour_tied_items, axis=1)
    styled_df.hide([col for col in df.columns if col not in show], axis=1)
    return styled_df

## Examples

In [None]:
# Simulate a pair of rankings with n = 500 and both lengths set to 500, using
# exponential_decay(theta=1) as the overlap probability function, then render
# them with the case-based colouring.
S, L, agree_probs, case_list = simulate_rankings(
    n=500,
    len_x=500,
    len_y=500,
    overlap_probability_function=exponential_decay(theta=1),
    conjointness=1,
    truncate_rankings=False,
)

get_formatted_dataframe(S, L, agree_probs, case_list)

In [None]:
n = 2000

# Distributions evaluated over the depths 1..n to obtain the tie probabilities
# handed to the simulator. `norm` is not imported by name in this notebook, so
# it presumably arrives via the star import of standard_distributions — TODO confirm.
prob_dist_x = norm(loc=(0.75 * n), scale=n/4)
prob_dist_y = norm(loc=(0.25 * n), scale=n/16)

depths = np.arange(1, n+1)

# Evaluate each density exactly once, then rescale so the weights sum to 1.
probabilities_x = np.abs(prob_dist_x.pdf(depths))
probabilities_x /= probabilities_x.sum()

probabilities_y = np.abs(prob_dist_y.pdf(depths))
probabilities_y /= probabilities_y.sum()

S, L, agreement_probability, case_list = simulate_rankings(
    n=n,
    len_x=0,
    len_y=0,
    overlap_probability_function=exponential_decay(theta=1),
    tie_probabilities_x=probabilities_x,
    tie_probabilities_y=probabilities_y,
    conjointness=1,
    truncate_rankings=False,
    frac_ties_x=0.6,
    n_groups_x=5,
    frac_ties_y=0.6,
    n_groups_y=5,
)
get_tied_dataframe(S, L, agreement_probability, case_list)

## Ties Analysis

In [None]:
# Count how many tied items of L fall into each group of 100 consecutive
# depths (group 0 = depths 1-100, group 1 = depths 101-200, ...).
GROUP_SIZE = 100
N_GROUPS = 20

tie_frequency = np.zeros(N_GROUPS)

depth = 1

for entry in L:
    if isinstance(entry, list):
        # A list entry is a group of tied items; each member occupies one depth.
        for _ in entry:
            # depth is 1-based, so shift it before dividing: depth 100 belongs
            # to group 0 and depth 2000 to group 19 (the last valid index).
            depth_group = (depth - 1) // GROUP_SIZE
            tie_frequency[depth_group] += 1
            depth += 1
    else:
        depth += 1

bins = np.arange(1, N_GROUPS + 1)

plt.xlabel('Depth Group')
plt.ylabel('Number of tied items')
plt.bar(bins,tie_frequency)

In [None]:
def zero():
    """Return an overlap probability function that is 0 for every input."""
    def constant_zero(depth, n):
        # Both arguments are accepted for interface compatibility and ignored.
        return 0

    return constant_zero

In [None]:
def agreements_empirical(f, N, print_progress=False, n_trials=100):
    """Estimate the agreement at every depth by repeated simulation.

    Each trial draws a pair of rankings ``S`` and ``L`` from
    ``simulate_rankings`` (called with ``n=N``, ``len_x=0``, ``len_y=0``,
    ``conjointness=1`` and ``truncate_rankings=False``) and records, for every
    depth ``d`` up to ``len(S)``, the fraction ``|set(S[:d]) & set(L[:d])| / d``.

    Parameters
    ----------
    f : callable
        Overlap probability function forwarded to ``simulate_rankings``.
    N : int
        Passed to the simulator as ``n``; also the number of columns of the result.
    print_progress : bool, optional
        If true, print the trial number before each simulation.
    n_trials : int, optional
        Number of independent simulations. Defaults to 100, the value that
        used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_trials, N)``. Entry ``[t, d - 1]`` is the agreement
        at depth ``d`` in trial ``t``. Depths beyond the length of a simulated
        ranking are NaN.
    """
    # Start from NaN instead of uninitialised memory, so that depths a trial
    # never reaches cannot leak arbitrary values into downstream averages.
    agreement_average = np.full((n_trials, N), np.nan)

    for trial in range(n_trials):
        if print_progress:
            print("trial", trial)

        S, L, _, _ = simulate_rankings(n=N, len_x = 0, len_y = 0, overlap_probability_function=f, conjointness=1, truncate_rankings=False)

        # Maintain the two prefix sets and the size of their intersection
        # incrementally rather than rebuilding both sets at every depth.
        seen_s, seen_l = set(), set()
        overlap = 0
        for depth in range(1, len(S) + 1):
            s_item = S[depth - 1]
            if s_item not in seen_s:
                seen_s.add(s_item)
                if s_item in seen_l:
                    overlap += 1
            # L may be shorter than S; its prefix then simply stops growing.
            if depth <= len(L):
                l_item = L[depth - 1]
                if l_item not in seen_l:
                    seen_l.add(l_item)
                    if l_item in seen_s:
                        overlap += 1
            agreement_average[trial][depth - 1] = overlap / depth

    return agreement_average

In [None]:
def agreements_analytical(f, N):
    """Evaluate the recursive analytical agreement estimate for N positions.

    A running total ``xs`` is built up with ``xs[0] = 0`` and, for
    ``d = 1 .. N-1``, ``xs[d] = xs[d-1] + artificial + natural``, where both
    increments depend on ``f(d - 1, N)`` and on ``d / N``. The value returned
    for position ``i`` is ``xs[i] / (i + 1)``, so the first entry is always 0.

    NOTE(review): the derivation of the individual terms is not visible in
    this notebook; the arithmetic is reproduced unchanged.

    Parameters
    ----------
    f : callable
        Overlap probability function, called as ``f(d - 1, N)``.
    N : int
        Number of positions to evaluate.

    Returns
    -------
    list
        ``N`` agreement estimates.
    """
    xs = np.zeros(N)

    for d in range(1, N):
        ratio = d / N

        # Contribution weighted by the overlap probability itself.
        p_artificial = f(d - 1, N)
        linear_term = (0.5 * ratio) + 0.5
        saturating_term = 0.25 * (1 - np.exp(-N * (1 - ratio) ** 2))
        residual_term = (1 - p_artificial) * (2 * d) / N
        artificial_gain = p_artificial * (
            linear_term + saturating_term + 0.5 + residual_term
        )

        # Contribution weighted by the complementary probability.
        p_natural = 1 - f(d - 1, N)
        natural_gain = p_natural * ((2 * d) / N)

        xs[d] = xs[d - 1] + artificial_gain + natural_gain

    return [total / (index + 1) for index, total in enumerate(xs)]


In [None]:
# Empirical agreement with the all-zero overlap probability function, n = 1000.
ae = agreements_empirical(f=zero(), N=1000)

In [None]:
# Analytical estimate for the same all-zero overlap probability function.
aa = agreements_analytical(f=zero(), N=1000)

In [None]:
# Trial-averaged empirical agreement against the d/n reference line.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
ax.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
# ax.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
ax.plot(depths, ys, 'b-', label='Uniformly Random, $d/n$')
ax.legend()
ax.set_xlabel('Depth, $d$')
ax.set_ylabel('Agreement, $A_d$')
ax.set_title('No Overlap Probability Function, $n = 1000$')
fig.savefig('no_overlap_function.png')
plt.show()

In [None]:
# Empirical agreement for exponential decay with theta = 1 and theta = 0.5.
ae_1 = agreements_empirical(f=exponential_decay(theta=1), N=1000)
ae_half = agreements_empirical(f=exponential_decay(theta=0.5), N=1000)
# aa = agreements_analytical(exponential_decay(theta=1), 1000)

In [None]:
# Trial-averaged empirical agreement for both theta values against the d/n
# reference line.
depths = np.arange(1, 1001)
ys = depths / 1000

plt.plot(depths, ae_1.mean(axis=0), 'y>', label=r'Average Empirical Agreement, $\theta = 1$')

plt.plot(depths, ae_half.mean(axis=0), 'r*', label=r'Average Empirical Agreement, $\theta = 0.5$')


# plt.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
plt.plot(depths, ys, 'b-', label='Uniformly Random, $d/n$')
plt.legend()
plt.xlabel('Depth, $d$')
plt.ylabel('Agreement, $A_d$')
# The math span needs both an opening and a closing dollar sign; with only one
# of them the title is rendered as plain text with a literal '$'.
plt.title(r'Exponential Decay Overlap Probability Function, $n = 1000$')
plt.savefig('exponential_decay_overlap_function.png')
plt.show()

## Analytical Solution to Expected Overlap/Agreement

In [None]:
# Empirical and analytical agreement for exponential decay with theta = 0.6.
ae = agreements_empirical(f=exponential_decay(theta=0.6), N=1000)
aa = agreements_analytical(f=exponential_decay(theta=0.6), N=1000)

In [None]:
# Empirical average, analytical estimate and the depths / 1000 reference line
# on one set of axes.
depths = np.arange(1, 1001)
ys = depths / 1000

fig, ax = plt.subplots()
ax.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
ax.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
ax.plot(depths, ys, 'b-', label='Uniformly Random')
ax.legend()
ax.set_xlabel('Depth')
ax.set_ylabel('Agreement')
ax.set_title('Exponential Decay Overlap Probability Function')
plt.show()

In [None]:
# Analytical estimate for gaussian_distribution(200, 9). The disabled empirical
# run below uses a different second argument (30).
aa = agreements_analytical(f=gaussian_distribution(200, 9), N=1000)
#ae = agreements_empirical(gaussian_distribution(200, 30), 1000)

In [None]:
# Analytical estimate for the Gaussian overlap probability function against
# the depths / 1000 reference line.
depths = np.arange(1, 1001)
ys = depths / 1000

# plt.plot(depths, ae.mean(axis=0), 'y>', label='Average Empirical Agreement')
plt.plot(depths, aa, 'r*', label='Analytical Agreement Estimate')
plt.plot(depths, ys, 'b-', label='True')
plt.legend()
plt.xlabel('Depth')
plt.ylabel('Agreement')
# The plotted estimate is computed with gaussian_distribution(200, 9), so the
# title names those parameters rather than the (200, 30) of the disabled run.
plt.title('N(200, 9) Overlap Probability Function')
plt.show()

In [None]:
# qualitative