In [1]:
import pandas as pd
import numpy as np
import random
import pickle

import xmltodict, json
import json
import itertools
import ftfy
import glob, os
import zlib

In [2]:
def process(s):
    return ftfy.fix_text(str(s)).lower()

In [3]:
disceval_base_path = "/data/sileo/libs/git/DiscEval/" # replace the path of your DiscEval repository
disceval_data = f"{disceval_base_path}/data/" # make sure you ran get_data.bash
disceval_path = f"{disceval_base_path}/disceval/"

switchboard_labels_path=f"{disceval_data}/SwitchBoard_labels.tsv"
mrda_labels_path=f"{disceval_data}/MRDA_labels.tsv"

In [4]:
tasks = "PDTB GUM STAC MRDA SwitchBoard Emergent Sarcasm Verifiability".split()
tasks = tasks+ [f"Persuasiveness-{c}" for c in ["Specificity", "Eloquence", "Relevance", "ClaimType", "PremiseType", "Strength"]]
tasks = tasks + [f"Squinky-{c}" for c in ["Formality","Informativeness","Implicature"]]
tasks = tasks+ [f"EmoBank-{c}" for c in ["Arousal", "Dominance","Valence"]]
tasks

['PDTB',
 'GUM',
 'STAC',
 'MRDA',
 'SwitchBoard',
 'Emergent',
 'Sarcasm',
 'Verifiability',
 'Persuasiveness-Specificity',
 'Persuasiveness-Eloquence',
 'Persuasiveness-Relevance',
 'Persuasiveness-ClaimType',
 'Persuasiveness-PremiseType',
 'Persuasiveness-Strength',
 'Squinky-Formality',
 'Squinky-Informativeness',
 'Squinky-Implicature',
 'EmoBank-Arousal',
 'EmoBank-Dominance',
 'EmoBank-Valence']

In [5]:
for t in tasks:
    t_path=f"{disceval_path}/{t}"
    if not os.path.exists(t_path):
        os.makedirs(t_path)

In [6]:
cols_pairs = ["sentence1", "sentence2","label"]

# PDTB

In [7]:
df_pdtb = pd.read_csv(f"{disceval_data}/pdtb2.csv")
df_pdtb.loc[df_pdtb.Relation=="EntRel", "ConnHeadSemClass1"] = "Entrel"
df_pdtb = df_pdtb[df_pdtb.ConnHead.isnull()]
df_pdtb = df_pdtb[["Arg1_RawText", "Arg2_RawText", "ConnHeadSemClass1", "Section", "Relation"]]

#standard PDTB split
df_pdtb.loc[df_pdtb.Section.between(2,20), "cv"] = "train"
df_pdtb.loc[df_pdtb.Section.between(0,1), "cv"] = "dev"
df_pdtb.loc[df_pdtb.Section.between(21,22), "cv"] = "test"

df_pdtb = df_pdtb[~df_pdtb["ConnHeadSemClass1"].isna()]
df_pdtb["sentence1"] = df_pdtb.Arg1_RawText
df_pdtb["sentence2"] = df_pdtb.Arg2_RawText
df_pdtb["label_fine"] = df_pdtb["ConnHeadSemClass1"].map(lambda s:(str(s)+".None").split(".")[1])
df_pdtb["label_coarse"]=df_pdtb["ConnHeadSemClass1"].map(lambda s:str(s).split(".")[0])


  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
np.random.seed(0)
task="PDTB"

df_fine = df_pdtb[df_pdtb.label_fine!="None"].copy()
df_fine["label"]= df_fine["label_fine"]

labels=set(df_fine.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)

for cv in ["dev","train","test"]:
    df = df_fine[df_fine.cv==cv]
    df["sentence1"]=df["sentence1"].map(process)
    df["sentence2"]=df["sentence2"].map(process)
    df = df[cols_pairs].sample(frac=1.0)
    df.to_csv(f"{disceval_path}/{task}/{cv}.tsv", index=False, sep="\t")

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


# GUM

In [9]:
np.random.seed(0)
task="GUM"

l=[]

for path in glob.glob(f"/{disceval_data}/gum/rst/*.rs3"):
    
    file = path.split("/")[-1].replace(".rs3","")
    s=open(path).read()
    o = xmltodict.parse(s)
    
    id_to_text={}
    for x in o["rst"]["body"]["segment"]:
        id_to_text[x["@id"]]=x["#text"]
        
    attached=[]
    for x in o["rst"]["body"]["segment"]:
        if x["@parent"] not in id_to_text:
            continue
        l+=[{
            "sentence2":x["#text"],
            "sentence1":id_to_text[x["@parent"]],
            "label":x["@relname"],
            "file":file,
            "hash":zlib.adler32(file.encode('utf-8')) 
        }]
        attached+=[(x["@parent"],x["@id"])]
    
    unattached=[]
    for x,y in list(itertools.product(*[id_to_text.keys()]*2)):
        if x==y:
            continue
        if (x,y) not in attached:
            unattached+=[{"sentence1": id_to_text[x],
                 "sentence2": id_to_text[y],
                 "label":"no_relation",
                 "file":file,
                 "hash":zlib.adler32(file.encode('utf-8')) 
                }]
    l+=list(np.random.choice(unattached,int(len(attached)/4)))
            
df=pd.DataFrame(l)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)


df["hash"]=df.hash.rank(method="min")

# split according to dialogs
hashes = sorted(list(set(df.hash)))
s08, s09 = hashes[int(len(hashes)*0.8)], hashes[int(len(hashes)*0.9)]

df["sentence1"] = df["sentence1"].map(process)
df["sentence2"] = df["sentence2"].map(process)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)

df[df.hash<s08][cols_pairs].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t",index=False)
df[(s08<=df.hash)&(df.hash<s09)][cols_pairs].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t",index=False)
df[s09<=df.hash][cols_pairs].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t",index=False)



# STAC

In [10]:
np.random.seed(0)
task="STAC"

for part in ["train","test"]:
    df = pd.read_pickle(f"{disceval_data}/stac_data_pickles/candidates_{part}_dropeeu_d10_ALLRELS.pkl")

    df=df[df.source_type=="Segment"]
    df=df[df.source_emitter!="Server"]
    df=df[df.target_emitter!="Server"]
    df["label"]=df.relation_type
    n_relations = int((df.label!="no_relation").sum()/4)
    df=pd.concat([df[df.label!="no_relation"], df[df.label=="no_relation"].sample(n_relations)]).sample(frac=1.0)
    df["sentence1"]=df.source_text
    df["sentence2"]=df.target_text

    df["sentence1"]=df["sentence1"].map(process)
    df["sentence2"]=df["sentence2"].map(process)
    df = df.drop_duplicates(subset=["sentence1","sentence2"], keep=False)

    if part=="train":
        cv = np.arange(len(df))/len(df)
        np.random.shuffle(cv)
        df[cv<0.9][cols_pairs].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
        df[cv>0.9][cols_pairs].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
        labels=set(df.label)
        pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)
    else:
        df[cols_pairs].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)




# Sarcasm

In [11]:
np.random.seed(0)
task="Sarcasm"

df = pd.read_csv(f"{disceval_data}/sarcasm_v2.csv")
df["sentence1"] = df["Quote Text"]
df["sentence2"] = df["Response Text"]
df["label"] = df["Label"]

df["sentence1"]=df["sentence1"].map(process)
df["sentence2"]=df["sentence2"].map(process)

cv = np.arange(len(df))/len(df)
np.random.shuffle(cv)
df = df[["sentence1","sentence2","label"]].sample(frac=1.0)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)

df[cv<0.8][cols_pairs].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
df[(0.8<cv) & (cv<0.9)][cols_pairs].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
df[0.9<cv][cols_pairs].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)



# Emergent

In [12]:
np.random.seed(0)
task="Emergent"

df = pd.read_csv(f"{disceval_data}/url-versions-2015-06-14-clean.csv")

df["sentence2"] = df["articleHeadline"]
df["sentence1"] = df["claimHeadline"]
df["label"] = df["articleHeadlineStance"]

df["sentence1"]=df["sentence1"].map(process)
df["sentence2"]=df["sentence2"].map(process)

cv = np.arange(len(df))/len(df)
np.random.shuffle(cv)
df = df[["sentence1","sentence2","label"]].sample(frac=1.0)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)

df[cv<0.8][cols_pairs].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
df[(0.8<cv) & (cv<0.9)][cols_pairs].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
df[0.9<cv][cols_pairs].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)



# Persuasiveness

In [13]:
categories=["Specificity", "Eloquence", "Relevance", "ClaimType", "PremiseType", "Strength"]

l=[]
for path in glob.glob(f"{disceval_data}/essays_all/essay*.ann"):
    df=pd.read_csv(path,sep="\t",names=["id","ann","text"])
    id_to_text = df[df.id.str.contains("T")].set_index("id")["text"].to_dict()
    parents=df[df.ann.str.contains("Arg")].ann.map(lambda x: {x.split()[1].split(":")[-1]:x.split()[2].split(":")[-1] })
    parents=list(parents)
    parents={k: v for d in parents for k, v in d.items()}
    id_to_text[None]=None
    for x in df[df.ann.map(lambda x: any(a in x for a in categories))].ann:
        category, t_id, value=x.split()

        s2=id_to_text[t_id]
        s1=id_to_text[parents.get(t_id,None)]
        l+=[{"sentence1":s1,
             "sentence2":s2,
             "category":category,
             "label":value,
            }]
df_l=pd.DataFrame(l)
df_l=df_l[df_l.sentence1.map(lambda x:x!=None)]

In [14]:
np.random.seed(0)

for c in categories:
    task=f"Persuasiveness-{c}"

    path_c = f"{disceval_data}/{task}"
    if not os.path.isdir(path_c):
        os.mkdir(path_c)
    df=df_l[df_l.category==c]
    if max([len(x) for x in set(df.label)])==1:
        df["label"]=df["label"].map(float)
        try:
            df["label"] = pd.qcut(df["label"], q=3, labels=["low","-","high"])
        except:
            df["label"] = pd.qcut(df["label"], q=3, labels=["low","high"],duplicates="drop")
        df=df[df.label!="-"]
    df["sentence1"] = df.sentence1.map(process)
    df["sentence2"] = df.sentence2.map(process)
    
    labels=set(df.label)
    pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)
    
    cv = np.arange(len(df))/len(df)
    np.random.shuffle(cv)
    
    df[cv<0.8][cols_pairs].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
    df[(0.8<cv) & (cv<0.9)][cols_pairs].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
    df[0.9<cv][cols_pairs].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

In [15]:
cols = ["sentence","label"]

# Switchboard

In [16]:
np.random.seed(0)
task="SwitchBoard"

label_to_name = pd.read_csv(switchboard_labels_path,sep="\t").set_index("y")["label"].to_dict()

l=[]
for cv in ["train","dev","test"]:
    df=pd.read_csv(f"/{disceval_data}/Switchboard-Corpus/swda_data/{cv}_set.txt",sep="|", names=["locutor","sentence","y"])
    df["sentence"]=df["sentence"].map(process)
    df["label"]=df["y"].map(lambda x:label_to_name[x])
    df["cv"]=cv
    l+=[df]
    
df=pd.concat(l)
df=df.drop_duplicates(subset=["sentence"], keep=False)


n_non_statements = df.label.map(lambda x :"Statement" not in x).sum()//4
df=pd.concat([df[df.label.map(lambda x :"Statement" not in x)],
              df[df.label.map(lambda x :"Statement" in x)].sample(n_non_statements)]).sample(frac=1.0)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)

for cv in ["train","test"]:
    if cv=="train":
        rnd = np.arange(len(df))/len(df)
        np.random.shuffle(rnd)
        df[(rnd<0.9) & (df.cv=="train")][cols].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
        df[(rnd>=0.9) & (df.cv=="train")][cols].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
    else:
        df[df.cv=="test"][cols].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)




# MRDA

In [17]:
np.random.seed(0)
task="MRDA"

label_to_name = pd.read_csv(mrda_labels_path,sep="\t").set_index("y")["label"].to_dict()

l=[]
for cv in ["train","dev","test"]:
    df=pd.read_csv(f"{disceval_data}/MRDA-Corpus/mrda_data/{cv}_set.txt",sep="|",
                names=["locutor","sentence","y_coarse","y_medium","y_fine"])
    
    df["y"]=df["y_fine"]
    df["sentence"]=df["sentence"].map(process)
    df["label"]=df["y"].map(lambda x:label_to_name[x])
    df["cv"]=cv
    l+=[df]
    
df=pd.concat(l)
df=df.drop_duplicates(subset=["sentence"], keep=False)


n_non_statements = (df.label!="Statement").sum()//4
df=pd.concat([df[df.label!="Statement"], df[df.label=="Statement"].sample(n_non_statements)]).sample(frac=1.0)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)

#good_labels = (df[df.cv=="test"].label.value_counts()>9).to_dict()
#df=df[df.label.map(lambda x:good_labels[x])]
df.label
for cv in ["train","test"]:
    if cv=="train":
        rnd = np.arange(len(df))/len(df)
        np.random.shuffle(rnd)
        df[(rnd<0.9) & (df.cv=="train")][cols].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
        df[(rnd>0.9) & (df.cv=="train")][cols].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
    else:
        df[df.cv=="test"][cols].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)




# Squinky

In [18]:
np.random.seed(0)

for c in ['Formality', 'Informativeness', 'Implicature']:
    
    df = pd.read_csv(f"{disceval_data}/mturk_merged.csv")
    df["label"] = pd.qcut(df[c.lower()], q=3, labels=["low","-","high"])
    df=df[df.label!="-"]
    
    labels=set(df.label)
    task=f"Squinky-{c}"
    pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)
    
    df["sentence"] = df["sentence"].map(process)
    
    cv = np.arange(len(df))/len(df)
    np.random.shuffle(cv)
    df[cv<0.8][cols].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
    df[(0.8<cv) & (cv<0.9)][cols].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
    df[0.9<cv][cols].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)

  # This is added back by InteractiveShellApp.init_path()


# Verifiability

In [19]:
np.random.seed(0)
task="Verifiability"

label_names={'u' : "unverifiable",
'n' : "non-experiential",
'e' : "experiential"}

for cv in ["train","test"]:
    l=[]
    for x in open(f"{disceval_data}/verifiability/{cv}.txt"):
        y=x[0]
        text=" ".join(x[2:].strip().split("#")[:-1])
        l+=[{"sentence":text,"label":label_names[y]}]
    df=pd.DataFrame(l)
    df["sentence"] = df.sentence.map(process)
    
    if cv=="train":
        rnd = np.arange(len(df))/len(df)
        np.random.shuffle(rnd)
        df[rnd<0.9][cols].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
        df[rnd>0.9][cols].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
    else:
        df[cols].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)

labels=set(df.label)
pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)



# Emobank

In [20]:
np.random.seed(0)

for c in ["Valence", "Arousal", "Dominance"]:
    df=pd.read_csv(f"{disceval_data}/emobank.csv")
    df["sentence"] = df.text.map(process)
    df["label"] = pd.qcut(df[c[0]], q=3, labels=["low","-","high"])
    df=df[(df.label!="-") & df.sentence.map(lambda x: not x.endswith(","))]

    cv = np.arange(len(df))/len(df)
    np.random.shuffle(cv)
    df = df[cols].sample(frac=1.0)

    labels=set(df.label)
    task=f"EmoBank-{c}"
    pd.Series(list(labels)).to_csv(f"{disceval_path}/{task}/labels", sep="\t", index=None)   

    df[cv<0.8][cols].to_csv(f"{disceval_path}/{task}/train.tsv", sep="\t", index=False)
    df[(0.8<cv) & (cv<0.9)][cols].to_csv(f"{disceval_path}/{task}/dev.tsv", sep="\t", index=False)
    df[0.9<cv][cols].to_csv(f"{disceval_path}/{task}/test.tsv", sep="\t", index=False)

  from ipykernel import kernelapp as app


In [21]:
for path in glob.glob(f"{disceval_path}/*/train.tsv"):
    print(path, len(pd.read_csv(path,sep="\t")))

/data/sileo/libs/git/DiscEval//disceval/PDTB/train.tsv 12907
/data/sileo/libs/git/DiscEval//disceval/GUM/train.tsv 1700
/data/sileo/libs/git/DiscEval//disceval/STAC/train.tsv 11230
/data/sileo/libs/git/DiscEval//disceval/MRDA/train.tsv 14484
/data/sileo/libs/git/DiscEval//disceval/SwitchBoard/train.tsv 18930
/data/sileo/libs/git/DiscEval//disceval/Emergent/train.tsv 2076
/data/sileo/libs/git/DiscEval//disceval/Sarcasm/train.tsv 3754
/data/sileo/libs/git/DiscEval//disceval/Verifiability/train.tsv 5712
/data/sileo/libs/git/DiscEval//disceval/Persuasiveness-Specificity/train.tsv 504
/data/sileo/libs/git/DiscEval//disceval/Persuasiveness-Eloquence/train.tsv 725
/data/sileo/libs/git/DiscEval//disceval/Persuasiveness-Relevance/train.tsv 725
/data/sileo/libs/git/DiscEval//disceval/Persuasiveness-ClaimType/train.tsv 160
/data/sileo/libs/git/DiscEval//disceval/Persuasiveness-PremiseType/train.tsv 566
/data/sileo/libs/git/DiscEval//disceval/Persuasiveness-Strength/train.tsv 371
/data/sileo/libs/