In [53]:
import numpy as np
import random
import re
files = ['100k.tsv']

In [117]:
urls, judgements, p = {}, {}, []
for file in files:
    with open(file,'r') as f:
        for l in f:
            l = l.strip().split('\t')
            if l[14] not in urls:
                urls[l[14]] = 0
            urls[l[14]] += 1
for file in files:
    with open(file,'r') as f:
        for l in f:
            l = l.strip().split('\t')
            if urls[l[14]] > 1 and len(l) > 16:
                if l[14] not in judgements and 'error' not in l[16].strip().lower():
                    judgements[l[14]] = []
                if 'error' not in l[16].strip().lower(): 
                    currentJudgements = []
                    for phrase in l[16:]:
                        currentJudgements.append(set(re.sub(' +', ' ',phrase.strip()).split(' ')))
                    judgements[l[14]].append(currentJudgements)

In [110]:
def getScore(candidate, gold):
    #Unigram Levenshtein distance
    #First we produce all possible pairs and greedily select 
    scoring, bestMatch = {}, {}
    maxScore = 0
    maxLabel = ''
    #Generate all possible combinations
    for goldLabel in gold:
        goldKey = str(goldLabel)
        scoring[goldKey] = {}
        for candidateLabel in candidate:
            candidateKey = str(candidateLabel)
            scoring[goldKey][candidateKey] = (len(goldLabel) - len(goldLabel-candidateLabel))/len(goldLabel)
    #Greedily select best combination and then remove all related combinations.
    while len(scoring) > 0:
        maxScore = 0
        maxLabel = ''
        for goldLabel in scoring:
            goldKey = str(goldLabel)
            for candidateLabel in scoring[goldKey]:
                candidateKey = str(candidateLabel)
                score = scoring[goldKey][candidateKey]
                if score >= maxScore:
                    maxScore = score
                    maxLabel = (goldKey, candidateKey)
        bestMatch[maxLabel] = scoring[maxLabel[0]][maxLabel[1]]
        scoring.pop(maxLabel[0])#remove all pairs that could
    return sum(bestMatch.values())/len(gold)      

In [83]:
#Boostrapping 
randomDependent = []
for i in range(10000):
    scores = []
    for url in judgements:
        goldIndex = random.randrange(0,len(judgements[url]))
        for i in range(len(judgements[url])):
            if goldIndex != i:
                scores.append(getScore(judgements[url][goldIndex],judgements[url][i]))
    randomDependent.append(np.mean(scores))
print(np.max(randomDependent))
print(np.mean(randomDependent))
print(np.min(randomDependent))

0.6470502645502646
0.5764470687830688
0.49829365079365073


In [118]:
#First Judge as Gold
scores = []
for url in judgements:
    for i in range(len(judgements[url][1:])):
            scores.append(getScore(judgements[url][0],judgements[url][i]))
print(np.max(scores))
print(np.mean(scores))
print(np.min(scores))

1.0
0.6900396825396826
0.0


In [119]:
#Highest Agreement
#This form tends to skew towards a perfect score not because judges always get it right but because the judge with the highest agreement tended to only submit a single keyphrase. Since we are selecting the best match greedily judges tend to have agreement with this single keyphrase in at least one of their other keyphrases
scores = []
for url in judgements:
    goldIndex = random.randrange(0,len(judgements[url]))
    maxScore = 0
    maxSequence = []
    for i in range(len(judgements[url])):
        currentRuns = []
        for j in range(len(judgements[url])):
            if j != i:
                currentRuns.append(getScore(judgements[url][j],judgements[url][i]))
        currentScore = np.mean(currentRuns)
        if maxScore < currentScore:
            maxScore = currentScore
            maxSequence = currentRuns
    scores += maxSequence
print(np.max(scores))
print(np.mean(scores))
print(np.min(scores))

1.0
0.7731746031746032
0.0


In [123]:
#Each Judgment vs All other judgments
scores = []
for url in judgements:
    for i in range(len(judgements[url])):
        currentRuns = []
        for j in range(len(judgements[url])):
            if j != i:
                currentRuns.append(getScore(judgements[url][i],judgements[url][j]))
        if len(currentRuns) > 0:
            scores.append(np.sum(currentRuns)/(len(judgements[url])-1))
print(np.max(scores))
print(np.mean(scores))
print(np.min(scores))

1.0
0.5766466750841751
0.0


In [126]:
len(scores)

264