## Mallows on Debain 2002

In [1]:
import numpy as np
from tqdm import tqdm_notebook
import math
import itertools
import readPreflib
import metropolis

The probability of a given ranking under the mallows model given a reference ranking sigma and a dispersion parameter phi is as follows:

$$ P(r | \sigma , \phi) = \frac{1}{Z} e^{-\phi \Delta}$$

where

$$ Z = 1 \cdot (1 + e^{-\phi}) \cdot (1 + e^{-\phi} + e^{-\phi^2})  \dots (1 + \dots + e^{\phi^{m-1}})       $$

where

$$ m = |r| $$

In [2]:
def mallowsProb(r, sigma, phi):
    return (1.0 / Z(phi, len(r)-1) * math.exp(-1.0 * phi * kt(r, sigma)))

# Normalization
def Z(phi, m):
    product = 1
    for i in range(1, m):
        part = 0
        for j in range(0,i):
            part += math.exp(-1 * (phi ** j))
        product *= part
    return product   

# Kendall-Tau Distance
def kt(a, b):
    unique_set =  np.unique(np.concatenate([a,b]))
    pairs = itertools.combinations(unique_set, 2)
    count = 0.0
    for i, j in pairs:
        unknown = False
        try:
            first = np_index(a, i) - np_index(a, j)
            secnd = np_index(b, i) - np_index(b, j)
        except:
            unknown = True
            count += 0.5
        if not unknown and (first * secnd < 0):
            count += 1
    return count

In [3]:
a = np.asarray([1,2,3,4])
b = np.asarray([2,3,4,1])
mallowsProb(a, b, 0.7)

0.38506033431401987

There are votes in the data that are incomplete. We store a vector with the probabily of each length:

In [5]:
candidates, length_counts, votes = readPreflib.soiInputwithWeights('data_input/ED-debian-2002.soi')
candidates

{1: 'Branden Robinson',
 2: 'Raphael Hertzog',
 3: 'Bdale Garbee',
 4: 'None Of The Above'}

In [6]:
length_probs = []
total_votes = 1.0 * sum(length_counts.values())
for i in range(1,len(length_counts.values())+1):
    length_probs.append(length_counts[i] / total_votes)
    
def probLength(n):
    return length_probs[n-1]

length_probs

[0.04, 0.04631578947368421, 0.26526315789473687, 0.6484210526315789]

Read in the data

The votes come in as tuples that look like
- (5, [1,2,3,4,5])
- (2, [4,2,1,3])

The second term in the tuple is a vote, and the first term is the number of terms that vote occurs.
Therefore, the sum of the probabilities of all votes in a dataset given a mallows model is the following:

In [7]:
def mallowsCost(params, dataset):
    central_ranking, phi = params
    cost = 0
    for tup in dataset:
        num_occurances, r = tup
        cost += probLength(len(r)) * num_occurances * mallowsProb(r, central_ranking, phi)
    return cost

mallowsCost(([1,2,3,4],0.5),votes)

144.98275312432824

We need functions to generate new candidates for the Metropolis algorithm

In [99]:
def generateMallowsCandidate(params):
    sigma, phi = params
    new_sigma = generateSigma(sigma)
    new_phi = generatePhi(phi)
    return [new_sigma, new_phi]

def generateSigma(order):
    tuning_parameter = 0.25
    a = np.random.randint(len(order))
    b = np.random.randint(len(order))
    order[a], order[b] = order[b], order[a]
    while (np.random.uniform(0.0, 1.0) >= tuning_parameter):
        a = np.random.randint(len(order))
        b = np.random.randint(len(order))
        order[a], order[b] = order[b], order[a]
    return order

def generatePhi(phi):
    alpha = 0.2 # tuning param
    delta = np.random.uniform(-1 * alpha,alpha)
    new = phi + delta
    while(new <= 0):
        delta = np.random.uniform(-1 * alpha,alpha)
        new = phi + delta
    return new

We are ready to find the parameters of the Mallow's model on the data

In [100]:
def run():
    initial_sigma = np.asarray([1,2,3,4])
    initial_params = [initial_sigma, 1.0]
    params, cost = metropolis.maximize(mallowsCost, initial_params, generateMallowsCandidate, votes, 100000)
    sigma, phi = params
    print('Central Ranking \t', sigma)
    print('Dispersion Parameter \t', '%2.7f' % phi)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




Save these to disk

In [102]:
def save():
    import pickle

    pickle.dump(params, open('pickle/mallows2002_100k.p','wb'))