In [274]:
import pandas as pd
from itertools import combinations
from nltk.metrics.agreement import AnnotationTask

In [237]:
def to_int(x):
    """Cast ``x`` to ``int`` via ``x.astype(int)``; hand ``x`` back untouched on failure.

    A ``ValueError``, ``TypeError`` or ``AttributeError`` raised by the cast
    (for instance because ``x`` has no ``astype`` method) is swallowed and the
    original object is returned as is.
    """
    try:
        converted = x.astype(int)
    except (ValueError, TypeError, AttributeError):
        # Best effort only: anything that cannot be cast stays as it was.
        return x
    return converted

In [238]:
# Load the answers with explicit column names, discard the first row of the
# file and the timestamp column, collapse to one row per email with
# groupby(...).last(), then try to cast every column to int.
answers = (
    pd.read_csv(
        "answers.csv",
        names=["date_time", "email", "ling", "coref", *range(1, 131), "comments"],
    )
    .drop(index=0)
    .drop(columns="date_time")
    .groupby("email")
    .last()
    .apply(to_int)
)

In [240]:
# Recoding table: answers 2..7 are merged onto the coarser values 1..4.
# Any cell that is not a key of the table is passed through unchanged.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map; kept here because the pandas version is not pinned.
smaller = {
    2: 1,
    3: 2,
    4: 2,
    5: 3,
    6: 3,
    7: 4,
}
small = answers.applymap(lambda value: smaller.get(value, value))

In [244]:
def alpha_df(df, items=range(1, 131)):
    """Return ``AnnotationTask(...).alpha()`` for a table of annotator answers.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotator; the index labels are used as coder ids.
        Must contain one column for every label in ``items``.
    items : iterable, optional
        Column labels to treat as annotated items. Defaults to
        ``range(1, 131)``, the item columns used in this notebook.

    Returns
    -------
    Whatever ``AnnotationTask(data=...).alpha()`` returns for the
    ``(coder, item, label)`` triples built from ``df``.

    Notes
    -----
    Cells holding a missing value (``None`` or NaN) are left out of the
    triples. pandas stores missing answers as NaN, which an ``is not None``
    check lets through, so they would otherwise be fed to the agreement
    computation as if they were real labels.
    """
    items = list(items)  # allow one-shot iterables; reused for every annotator
    data = []
    for annotator in df.index:
        for item in items:
            cell = df[item][annotator]
            if pd.notna(cell):
                data.append((annotator, item, cell))
    return AnnotationTask(data=data).alpha()

In [245]:
# Agreement coefficient (AnnotationTask.alpha) over every row of `answers`.
alpha_df(answers)

0.16277009793305897

In [246]:
# Same agreement coefficient on `small`, i.e. the answers recoded through `smaller`.
alpha_df(small)

0.1778868482161705

In [374]:
def trust_coefs(df):
    """Score each annotator by the agreement of the groups they belong to.

    For every subset of at least two annotators (rows of ``df``) the
    agreement ``alpha_df`` is computed. The subsets are then ranked by
    ascending alpha with ties sharing a rank (1 for the lowest alpha, +1 for
    each strictly larger value). An annotator's raw score is the sum of the
    ranks of all subsets containing them.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotator, in the layout expected by ``alpha_df``.

    Returns
    -------
    dict
        ``{annotator: raw_score / (highest_rank + highest_raw_score)}``.
        Every value is therefore strictly below 1. An empty dict is returned
        when ``df`` has fewer than two rows, since no subset can be formed.

    Notes
    -----
    The number of subsets is ``2**n - n - 1`` for ``n`` annotators and each
    one triggers a call to ``alpha_df``, so this is only practical for a
    small number of annotators.
    """
    annotators = list(df.index)
    alphas = {
        subset: alpha_df(df.loc[list(subset)])
        for size in range(2, len(annotators) + 1)
        for subset in combinations(annotators, size)
    }
    if not alphas:
        # Fewer than two annotators: nothing to rank (max() below would fail).
        return {}

    ranked = sorted(alphas.items(), key=lambda item: item[1])

    totals = {}
    rank = 1
    previous_alpha = ranked[0][1]
    for subset, alpha in ranked:
        # The rank only advances on a strictly larger alpha, so ties share it.
        if alpha > previous_alpha:
            rank += 1
        previous_alpha = alpha
        for annotator in subset:
            totals[annotator] = totals.get(annotator, 0) + rank

    # Normalise by the final (highest) rank plus the largest raw score.
    denominator = rank + max(totals.values())
    return {annotator: total / denominator for annotator, total in totals.items()}

In [372]:
def ident_bad_annot(df, threshold=0.5):
    """Yield the annotators whose trust coefficient does not exceed ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotator table, passed unchanged to ``trust_coefs``.
    threshold : float, optional
        Inclusive upper bound on the coefficient for an annotator to be
        reported. Defaults to ``0.5``.

    Yields
    ------
    tuple
        ``(annotator, coefficient)`` pairs, in the order ``trust_coefs``
        returns them.
    """
    for annotator, coef in trust_coefs(df).items():
        if coef <= threshold:
            yield (annotator, coef)

In [375]:
# Annotators with a trust coefficient of at most 0.5, for the original and the recoded answers.
dict(ident_bad_annot(answers)), dict(ident_bad_annot(small))

({}, {})

In [376]:
# Trust coefficient of every annotator, for the original and the recoded answers.
trust_coefs(answers), trust_coefs(small)

({'Frederique.bordignon@enpc.fr': 0.7868354960234407,
  'bergler@cse.concordia.ca': 0.7291753871912934,
  'anais.halftermeyer@univ-orleans.fr': 0.7192863122645459,
  'loic.grobol@gmail.com': 0.9871808287986605,
  'mariya': 0.830054416073671,
  'ilaine.wang@inalco.fr': 0.9865529510255336,
  'ygor.gallina@univ-nantes.fr': 0.8705525324403516,
  'sylvie.billot@univ-orleans.fr': 0.9463687735454165},
 {'Frederique.bordignon@enpc.fr': 0.7883015553144216,
  'bergler@cse.concordia.ca': 0.6835632718441585,
  'anais.halftermeyer@univ-orleans.fr': 0.694879346871286,
  'mariya': 0.7748669456931742,
  'ilaine.wang@inalco.fr': 0.9849118999638299,
  'loic.grobol@gmail.com': 0.987237120859815,
  'ygor.gallina@univ-nantes.fr': 0.9263680049604712,
  'sylvie.billot@univ-orleans.fr': 0.926471348111404})