In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# Pairwise validation annotations: each row names a `more_toxic` and a
# `less_toxic` comment.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Every distinct comment that appears on either side of a comparison.
# A comment's position in `txts` is used below as its item id, so the order has
# to be reproducible. `list(set(...))` orders strings by their per-process
# randomized hash, which reshuffles the ids (and the row order of anything
# exported from `txts`) on every kernel restart. De-duplicating in first-seen
# order keeps exactly the same set of texts with a stable ordering.
txts = (
    pd.concat([df["more_toxic"], df["less_toxic"]], ignore_index=True)
    .drop_duplicates()
    .tolist()
)

In [None]:
!pip install choix -qq
import choix

## Convert pairwise datapoints to regression data

In [None]:
#https://github.com/lucasmaystre/choix/blob/3e1365e56978299f090a61581bb265cf92620258/notebooks/intro-pairwise.ipynb
# Cache for the text -> position lookup used by generate_pair_tuple. It remembers
# which `txts` object the lookup was built from, so the lookup is rebuilt
# whenever `txts` is rebound to a different object.
_pair_lookup_cache = {"source": None, "positions": None}


def _text_positions(texts):
    """Return a ``{text: first position in texts}`` dict, cached per ``texts`` object.

    The cache is keyed on object identity: rebinding the sequence triggers a
    rebuild, but mutating the same list in place does not.
    """
    if _pair_lookup_cache["source"] is not texts:
        positions = {}
        for position, text in enumerate(texts):
            # setdefault keeps the first occurrence, matching list.index().
            positions.setdefault(text, position)
        _pair_lookup_cache["source"] = texts
        _pair_lookup_cache["positions"] = positions
    return _pair_lookup_cache["positions"]


def generate_pair_tuple(row):
    """Translate one comparison row into a ``(winner, loser)`` pair of item ids.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``"more_toxic"`` and ``"less_toxic"``.

    Returns:
        Tuple ``(winner, loser)`` holding the positions of the more toxic and
        the less toxic text in the module-level ``txts`` sequence.

    Raises:
        ValueError: If either text is not present in ``txts`` (the same
            exception ``list.index`` raised in the original implementation).
    """
    # A dict lookup replaces two linear `txts.index(...)` scans per row, which
    # made the row-wise apply quadratic in the number of texts.
    positions = _text_positions(txts)
    try:
        return (positions[row["more_toxic"]], positions[row["less_toxic"]])
    except KeyError as missing:
        raise ValueError(f"{missing.args[0]!r} is not in list") from None

# One (winner_id, loser_id) tuple per validation row; progress_apply shows a
# tqdm progress bar while the rows are converted.
pairs = (
    df
    .progress_apply(generate_pair_tuple, axis=1)
    .tolist()
)

# Fit one strength parameter per distinct text from the pairwise outcomes with
# choix's MM estimator (alpha is passed through to choix as its regularisation
# setting).
params_mm = choix.mm_pairwise(
    n_items=len(txts),
    data=pairs,
    alpha=0.01,
)
print("MM done")

# Alternative choix estimators, parked inside a string literal so they are not
# executed; kept for reference only.
_ = """
#params_bfgs = choix.opt_pairwise(len(txts), pairs, alpha=0.01, method="BFGS")
#print("BFGS done")
#params_newton = choix.opt_pairwise(len(txts), pairs, alpha=0.01, method="Newton-CG") 
#print("Newton done")
params_lsr = choix.lsr_pairwise(len(txts), pairs, alpha=0.01)
print("LSR Sparse done")
params_ilsr = choix.ilsr_pairwise(len(txts), pairs, alpha=0.01)
print("ILSR Sparse done")
"""

## Bertweet cleanup

In [None]:
import wt_text_processing_utils as wtp_utils

# Rebind `txts` from a plain list to a Series: the Series is what gets passed
# to the preprocessing helper here and is expanded into a DataFrame in the
# next cell. Element order is unchanged, so positions still match the item ids
# used when fitting the pairwise model.
txts = pd.Series(txts)
# NOTE(review): `preprocess_text` lives in the project-local
# `wt_text_processing_utils` module, which is not visible here. It presumably
# returns one cleaned (BERTweet-style normalised) string per input row — confirm.
tclean = wtp_utils.preprocess_text(txts)

## Generate the regression output file

In [None]:
# Build the export table in a single chained expression: the raw text, its
# cleaned counterpart and the fitted score, in that column order.
txts = txts.to_frame(name="txt").assign(
    tclean=tclean,
    # MM converged properly for all non-contradictory datapoints, so its
    # parameters are the ones exported as the regression target.
    reg_rank_mm=params_mm,
)

# Persist without the positional index; only the three columns are written.
txts.to_csv("eval_regression.csv", index=False)

## Are there contradictory text pairs?

In [None]:
# Re-read the raw pairwise annotations so this cell works from the file itself.
dfpairs = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# One row per distinct (less_toxic, more_toxic) ordering.
rankdf = dfpairs[["less_toxic", "more_toxic"]].drop_duplicates().reset_index(drop=True)

# Extend every pair downwards: for "t2 < t1", attach each t3 that some other
# pair ranks below t2 ("t3 < t2"). Pairs without such a partner get t3 == "".
rdf1 = pd.merge(rankdf, rankdf, how="left", left_on="less_toxic", right_on="more_toxic")
rdf1 = rdf1.drop(columns="more_toxic_y").rename(
    columns={"less_toxic_x": "t2", "more_toxic_x": "t1", "less_toxic_y": "t3"}
)
# Assign the result back instead of `rdf1["t3"].fillna(..., inplace=True)`:
# the chained in-place form warns in pandas 2.x and is a no-op under
# Copy-on-Write.
rdf1["t3"] = rdf1["t3"].fillna("")

# Extend every pair upwards: for "t1 < t2", attach each t3 that some other
# pair ranks above t2 ("t2 < t3").
rdf2 = pd.merge(rankdf, rankdf, how="left", left_on="more_toxic", right_on="less_toxic")
rdf2 = rdf2.drop(columns="less_toxic_y").rename(
    columns={"more_toxic_x": "t2", "less_toxic_x": "t1", "more_toxic_y": "t3"}
)
rdf2["t3"] = rdf2["t3"].fillna("")

cois = [f"t{i + 1}" for i in range(3)]  # ["t1", "t2", "t3"]
rdf = pd.concat([rdf1[cois], rdf2[cois]]).reset_index(drop=True)
rdf = rdf.drop_duplicates()

# A chain whose two ends are the same text (t1 == t3) means the data contains
# both "a < b" and "b < a": a contradictory pair.
is_cycle = rdf["t1"] == rdf["t3"]

# Diagnostics. The original bare tuple expression sat mid-cell and was never
# displayed, and its middle term counted non-null t3 values after they had
# already been filled with "". Presumably it was meant to count rows that
# actually have a chained partner, so that is what is reported here.
print(
    f"contradictory rows: {int(is_cycle.sum())}, "
    f"rows with a chained pair: {int((rdf['t3'] != '').sum())}, "
    f"distinct pairs shape: {rankdf.shape}"
)

# `.loc[...]` + `.assign` yields an independent frame, so adding the flag does
# not write into a slice of `rdf` (avoids SettingWithCopyWarning).
cpairdf = rdf.loc[is_cycle, ["t1", "t2"]].assign(is_contra=True)

# Flag every annotated row whose (less_toxic, more_toxic) ordering is
# contradicted elsewhere. `cpairdf` holds both orientations of each
# contradictory pair, so matching on (t1, t2) catches either direction.
dfpairs = pd.merge(
    dfpairs,
    cpairdf,
    left_on=["less_toxic", "more_toxic"],
    right_on=["t1", "t2"],
    how="left",
)
dfpairs = dfpairs.drop(columns=["t1", "t2"])
# Rows without a match carry NaN after the left merge and matched rows carry
# True, so `notna()` produces the intended True/False flag as a real bool
# column. The chained in-place fillna this replaces left NaN behind under
# Copy-on-Write.
dfpairs["is_contra"] = dfpairs["is_contra"].notna()

dfpairs.to_csv("eval_rankable.csv", index=False)