In [None]:
import sys
# Put the bundled choix 0.3.4 source directory at the FRONT of the module
# search path so that the `import choix` further down resolves to it
# (presumably because choix is not installed in the kernel environment and
# cannot be pip-installed there — TODO confirm).
sys.path = [
    '../input/choix-034/local/choix-0.3.4/choix-0.3.4/',
] + sys.path


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import choix


In [None]:
# Load the comments that must be scored and the annotated (less, more) toxic pairs.
comments = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
validation_data = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Keep only the annotated pairs in which at least one side is a comment to score.
list_comments = comments["text"].unique()
touches_scored_comment = (
    validation_data["less_toxic"].isin(list_comments)
    | validation_data["more_toxic"].isin(list_comments)
)
validation_data = validation_data[touches_scored_comment].copy()

# Fit a single encoder over every text seen anywhere, so that the same text gets
# the same integer id in `comments` and in both pair columns.
texts = np.concatenate([
    comments["text"].values,
    validation_data["less_toxic"].values,
    validation_data["more_toxic"].values,
])
text_encoder = LabelEncoder().fit(texts)

# From here on, the pair columns hold integer ids instead of raw text.
comments["encode_text"] = text_encoder.transform(comments["text"])
for side in ("less_toxic", "more_toxic"):
    validation_data[side] = text_encoder.transform(validation_data[side])

In [None]:
# Orient every annotation so that the smaller encoded id is always `text_a` and
# the larger one `text_b`. The same pair of comments then falls into the same
# group no matter which of the two a given annotator marked as more toxic.
less_ids = validation_data["less_toxic"]
more_ids = validation_data["more_toxic"]
a_is_less_toxic = less_ids < more_ids

validation_data["text_a"] = np.minimum(less_ids, more_ids)
validation_data["text_b"] = np.maximum(less_ids, more_ids)
# A "win" is one annotator vote saying that side is the MORE toxic comment.
validation_data["win_a"] = (~a_is_less_toxic).astype("int64")
validation_data["win_b"] = a_is_less_toxic.astype("int64")

# Total votes per unordered pair.
validation_data_agg = (
    validation_data
    .groupby(["text_a", "text_b"])
    .agg({"win_a": "sum", "win_b": "sum"})
    .reset_index()
)

# Drop pairs whose votes are evenly split: they have no majority, and keeping
# them would force an arbitrary winner (previously always `text_a`, i.e. the
# smaller encoded id), biasing the ranking fitted from this table.
has_majority = validation_data_agg["win_a"] != validation_data_agg["win_b"]
validation_data_agg = validation_data_agg[has_majority].reset_index(drop=True)

# Majority vote: the side with more "more toxic" votes is the more toxic comment.
b_is_more_toxic = validation_data_agg["win_a"] < validation_data_agg["win_b"]
validation_data_agg["less_toxic"] = np.where(
    b_is_more_toxic, validation_data_agg["text_a"], validation_data_agg["text_b"]
)
validation_data_agg["more_toxic"] = np.where(
    b_is_more_toxic, validation_data_agg["text_b"], validation_data_agg["text_a"]
)

validation_data_agg.head()

## Bradley-Terry model with choix

[Bradley-Terry model](https://en.wikipedia.org/wiki/Bradley%E2%80%93Terry_model) assigns scores to a fixed set of items based on pairwise comparisons of these items.

For each item *i* in the population, the model estimates a parameter $p_i$, a positive real value that represents the item's score.

In choix, items are represented by n consecutive integers *{0,…,n−1}*. The event "item i wins over item j" is represented by the Python tuple *(i, j)*.

Note that the winning item must always come first in the tuple. In this notebook, the "winning" item of a pair is the comment judged more toxic.

In [None]:
# choix identifies items by the integers 0..n_items-1; the label-encoded text
# ids already have that form, one per distinct text known to the encoder.
n_items = len(text_encoder.classes_)

# One (winner, loser) tuple per aggregated pair, with the more toxic comment
# in the winner position.
data = [
    tuple(pair)
    for pair in validation_data_agg[["more_toxic", "less_toxic"]].values
]

# Bradley-Terry model with I-LSR  (maximum-likelihood inference algorithm)
params = choix.ilsr_pairwise(n_items, data, alpha=0.01)

# Turn the fitted parameters into ordinal ranks: the item with the smallest
# parameter gets rank 0, the one with the largest gets rank n_items - 1.
rank = {text: position for position, text in enumerate(np.argsort(params))}

# A comment's score is the rank of its encoded text.
comments["score"] = comments["encode_text"].map(lambda code: rank[code])

In [None]:
# Number of annotations (non-null `worker` entries) in which each encoded text
# appears on the less-toxic side and on the more-toxic side of a pair.
less_toxic_count = validation_data.groupby("less_toxic")["worker"].count().to_dict()
more_toxic_count = validation_data.groupby("more_toxic")["worker"].count().to_dict()

# Attach the counts to each comment; texts that never appear on a side get 0.
encoded = comments["encode_text"]
comments["less_toxic_count"] = encoded.map(lambda code: less_toxic_count.get(code, 0))
comments["more_toxic_count"] = encoded.map(lambda code: more_toxic_count.get(code, 0))

# Total number of annotated comparisons that involve the comment.
comments["pairwise_count"] = comments[["less_toxic_count", "more_toxic_count"]].sum(axis=1)
comments.head()

In [None]:
# Lookup table: encoded text id -> score assigned to that comment.
comments_encode_score = comments.set_index("encode_text")["score"].to_dict()

# Only annotations where BOTH sides are scored comments can be checked.
scored_ids = list(comments_encode_score)
both_sides_scored = (
    validation_data["less_toxic"].isin(scored_ids)
    & validation_data["more_toxic"].isin(scored_ids)
)
comments_validation_data = validation_data[both_sides_scored].copy()

for side in ("less_toxic", "more_toxic"):
    comments_validation_data[f"score_{side}"] = (
        comments_validation_data[side].map(comments_encode_score.__getitem__)
    )

# An annotation counts as "agreed" when the comment the annotator marked as
# more toxic received the strictly higher score.
comments_validation_data["pairwise_score"] = (
    comments_validation_data["score_less_toxic"]
    < comments_validation_data["score_more_toxic"]
)
# NOTE: the scores come from a model fitted on `validation_data` itself, so
# this is an in-sample agreement figure.
score = comments_validation_data["pairwise_score"].mean()

print (f"Average Agreement with Annotators on validation data:  {score:.5f}")

In [None]:
# Write the submission file: one (comment_id, score) row per comment, without
# the DataFrame index, then preview the first rows.
submission = comments.loc[:, ["comment_id", "score"]]
submission.to_csv("submission.csv", index=False)
submission.head()