## Mallows on Debain 2002

In [2]:
import numpy as np
from tqdm import tqdm_notebook
import math
import itertools
import readPreflib
import metropolis

The probability of a given ranking under the mallows model given a reference ranking sigma and a dispersion parameter phi is as follows:

$$ P(r | \sigma , \phi) = \frac{1}{Z} e^{-\phi \Delta}$$

where

$$ Z = 1 \cdot (1 + e^{-\phi}) \cdot (1 + e^{-\phi} + e^{-\phi^2})  \dots (1 + \dots + e^{\phi^{m-1}})       $$

where

$$ m = |r| $$

In [3]:
def mallowsProb(r, sigma, phi):
    return (1.0 / Z(phi, len(r)-1) * math.exp(-1.0 * phi * kt(r, sigma)))

# Normalization
def Z(phi, m):
    product = 1
    for i in range(1, m):
        part = 0
        for j in range(0,i):
            part += math.exp(-1 * (phi ** j))
        product *= part
    return product   

# Kendall-Tau Distance
def kt(a, b):
    unique_set =  np.unique(np.concatenate([a,b]))
    pairs = itertools.combinations(unique_set, 2)
    count = 0.0
    for i, j in pairs:
        unknown = False
        try:
            first = np_index(a, i) - np_index(a, j)
            secnd = np_index(b, i) - np_index(b, j)
        except:
            unknown = True
            count += 0.5
        if not unknown and (first * secnd < 0):
            count += 1
    return count

In [4]:
a = np.asarray([1,2,3,4])
b = np.asarray([2,3,4,1])
mallowsProb(a, b, 0.7)

0.38506033431401987

There are votes in the data that are incomplete. We store a vector with the probabily of each length:

In [6]:
candidates, lengths, votes = readPreflib.soiInputwithWeights('netflix_soc/ED-00004-00000001.soc')
candidates, lengths, votes

({1: 'Shrek (Full-screen)', 2: 'The X-Files: Season 2', 3: 'The Punisher'},
 defaultdict(int, {3: 664}),
 [(263, [2, 1, 3]),
  (249, [1, 2, 3]),
  (78, [1, 3, 2]),
  (46, [2, 3, 1]),
  (17, [3, 1, 2]),
  (11, [3, 2, 1])])

In [21]:
def getLengthProbs(length_counts):
    length_probs = []
    total_votes = 1.0 * sum(length_counts.values())
    for i in range(1,len(length_counts.values())+1):
        length_probs.append(length_counts[i] / total_votes)
    return length_probs
    
def probLength(lengths, n):
    return lengths[n-1]


Read in the data

The votes come in as tuples that look like
- (5, [1,2,3,4,5])
- (2, [4,2,1,3])

The second term in the tuple is a vote, and the first term is the number of terms that vote occurs.
Therefore, the sum of the probabilities of all votes in a dataset given a mallows model is the following:

In [13]:
def mallowsCost(params, dataset, lengths):
    central_ranking, phi = params
    cost = 0
    for tup in dataset:
        num_occurances, r = tup
        cost += probLength(lengths, len(r)) * num_occurances * mallowsProb(r, central_ranking, phi)
    return cost

def mallowsCostComplete(params, dataset, lengths):
    central_ranking, phi = params
    cost = 0
    for tup in dataset:
        num_occurances, r = tup
        cost += num_occurances * mallowsProb(r, central_ranking, phi)
    return cost

# lens = getLengthProbs(length_counts)
# mallowsCost(([1,2,3,4],0.5),votes, lens)

We need functions to generate new candidates for the Metropolis algorithm

In [8]:
def generateMallowsCandidate(params):
    sigma, phi = params
    new_sigma = generateSigma(sigma)
    new_phi = generatePhi(phi)
    return [new_sigma, new_phi]

def generateSigma(order):
    tuning_parameter = 0.25
    a = np.random.randint(len(order))
    b = np.random.randint(len(order))
    order[a], order[b] = order[b], order[a]
    while (np.random.uniform(0.0, 1.0) >= tuning_parameter):
        a = np.random.randint(len(order))
        b = np.random.randint(len(order))
        order[a], order[b] = order[b], order[a]
    return order

def generatePhi(phi):
    alpha = 0.2 # tuning param
    delta = np.random.uniform(-1 * alpha,alpha)
    new = phi + delta
    while(new <= 0):
        delta = np.random.uniform(-1 * alpha,alpha)
        new = phi + delta
    return new

We are ready to find the parameters of the Mallow's model on the data

In [9]:
def runMallows(rankings, n_runs, lengths_vector, complete=True):
    # print(lengths_vector)
    lengths = getLengthProbs(lengths_vector)
    num_alternatives = len(lengths)
    if complete:
        num_alternatives = list(lengths_vector.keys())[0]
    initial_sigma = np.arange(num_alternatives) + 1
    initial_params = [initial_sigma, 1.0]
    costfunc = mallowsCost
    if complete:
        costfunc = mallowsCostComplete
    params, cost = metropolis.maximize(costfunc, lengths, initial_params, generateMallowsCandidate, rankings, n_runs)
    sigma, phi = params
    return params

In [10]:
# params = runMallows(votes, 1000, lens)

Save these to disk

In [11]:
def save():
    import pickle

    pickle.dump(params, open('pickle/mallows2002_1mil_2.p','wb'))

# save()

In [12]:
params = [np.array([3, 1, 4, 2]), 0.09938076871319058]