In [1]:
import pandas as pd
from collections import Counter
import json, os
from sklearn.model_selection import train_test_split

## Download data using MDL-stance-script
https://github.com/UKPLab/mdl-stance-robustness/blob/master/download.sh

In [14]:
# !wget https://raw.githubusercontent.com/UKPLab/mdl-stance-robustness/master/download.sh

In [15]:
# !mv download.sh data/download.sh

In [16]:
# ! sh data/download.sh

## ArgMin

request data here: https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/2345


In [17]:
# !unzip data/ArgMin/UKP_sentential_argument_mining.zip

In [18]:
list_files = list()
for filename in [filename for filename in os.listdir("data/ArgMin/") if filename.endswith(".tsv")]:
    print(filename)
    list_files.append(pd.read_csv("data/ArgMin/"+filename, sep="\t", quotechar="'"))

df_argmin = pd.concat(list_files, ignore_index=True)
df_argmin["uid"] = range(0,len(df_argmin))


nuclear_energy.tsv
school_uniforms.tsv
cloning.tsv
death_penalty.tsv
abortion.tsv
gun_control.tsv
minimum_wage.tsv
marijuana_legalization.tsv


In [19]:
topic_dict = {'marijuana legalization': 'Marijuana should be legalized .',
             'school uniforms': 'School uniforms should be permitted .',
             'nuclear energy': 'Nuclear energy should be permitted .',
             'minimum wage': 'Minimum wage should be permitted .',
             'gun control': 'Gun control should be permitted .',
             'death penalty': 'Death penalty should be permitted .',
             'cloning': 'Cloning should be permitted .',
             'abortion': 'Abortion should be permitted .'}

In [20]:
set(df_argmin["topic"])

{'abortion',
 'cloning',
 'death penalty',
 'gun control',
 'marijuana legalization',
 'minimum wage',
 'nuclear energy',
 'school uniforms'}

In [21]:
df_argmin["topic"] = df_argmin["topic"].apply(lambda x: topic_dict.get(x))

In [22]:
df_argmin = df_argmin[df_argmin["annotation"]!= "NoArgument"]

In [23]:
set(df_argmin["topic"])

{'Abortion should be permitted .',
 'Cloning should be permitted .',
 'Death penalty should be permitted .',
 'Gun control should be permitted .',
 'Marijuana should be legalized .',
 'Minimum wage should be permitted .',
 'Nuclear energy should be permitted .',
 'School uniforms should be permitted .'}

In [24]:
df_argmin.loc[df_argmin['annotation'] == 'Argument_against', 'annotation'] = 0
df_argmin.loc[df_argmin['annotation'] == 'Argument_for', 'annotation'] = 1

In [25]:
df_argmin = df_argmin[["uid", "annotation", "topic", "sentence", "set"]]
df_argmin.rename(columns={"annotation":"stance", "sentence":"premise", "topic": "claim"}, inplace=True)
df_argmin

Unnamed: 0,uid,stance,claim,premise,set
3,3,1,Nuclear energy should be permitted .,Fossil fuels receive large direct and indirect...,train
8,8,0,Nuclear energy should be permitted .,The number of U.S. reactors shut down for a ye...,val
9,9,0,Nuclear energy should be permitted .,"After the accident at Three Mile Island , buil...",val
10,10,0,Nuclear energy should be permitted .,As the nuclear power issue has been widely deb...,train
14,14,1,Nuclear energy should be permitted .,"For all this , it 's worth noting that uranium...",train
...,...,...,...,...,...
25479,25479,1,Marijuana should be legalized .,Prohibition has robbed children of their futur...,train
25481,25481,0,Marijuana should be legalized .,Legalizing marijuana would also lead to negati...,train
25482,25482,0,Marijuana should be legalized .,"Contrary to what is stated above , marijuana u...",test
25486,25486,0,Marijuana should be legalized .,Some of the most common ones include dry mouth...,train


In [26]:
Counter(df_argmin["stance"])

Counter({0: 6195, 1: 4944})

In [27]:
df_argmin.reset_index(inplace=True)

In [28]:
df_argmin.to_csv("data/argmin_all.csv")
# df_argmin[df_argmin["set"]=="train"].to_csv("data/argmin_train.csv")
# df_argmin[df_argmin["set"]=="dev"].to_csv("data/argmin_dev.csv")
# df_argmin[df_argmin["set"]=="test"].to_csv("data/argmin_test.csv")

## IBM claim Stance

request data here: https://research.ibm.com/haifa/dept/vst/debating_data.shtml#Argument_Stance

In [29]:
ibmcs = pd.read_csv("data/IBM_CLAIM_STANCE/claim_stance_dataset_v1.csv")

In [30]:
ibmcs["uid"] = range(0,len(ibmcs))
ibmcs = ibmcs[["uid", "claims.stance", "topicText", "claims.claimCorrectedText", "split"]]
ibmcs.rename(columns={"claims.stance":"stance", "topicText":"claim","claims.claimCorrectedText": "premise", "split": "set"}, inplace=True)
ibmcs

Unnamed: 0,uid,stance,claim,premise,set
0,0,PRO,This house believes that the sale of violent v...,Exposure to violent video games causes at leas...,test
1,1,CON,This house believes that the sale of violent v...,video game violence is not related to serious ...,test
2,2,CON,This house believes that the sale of violent v...,some violent video games may actually have a p...,test
3,3,PRO,This house believes that the sale of violent v...,exposure to violent video games causes both sh...,test
4,4,PRO,This house believes that the sale of violent v...,Violent video games increase the violent tende...,test
...,...,...,...,...,...
2389,2389,CON,This house would promote democratization,democracies have ever been found incompatible ...,train
2390,2390,CON,This house would promote democratization,democracy cannot subsist long nor be carried f...,train
2391,2391,CON,This house would promote democratization,Democracy in general is criticized for ignorin...,train
2392,2392,PRO,This house would promote democratization,democracy and freedom are indispensable ingred...,train


In [31]:
ibmcs.loc[ibmcs['stance'] == 'PRO', 'stance'] = 1
ibmcs.loc[ibmcs['stance'] == 'CON', 'stance'] = 0

In [32]:
train, dev = train_test_split(ibmcs[ibmcs["set"]=="train"], test_size=0.1, random_state=42)
dev

Unnamed: 0,uid,stance,claim,premise,set
1495,1495,0,This house believes that Israel's 2008-2009 mi...,Israel disproportionately attacked civilians,train
2164,2164,1,This house would never have children,the human population is far greater than the E...,train
1494,1494,1,This house believes that Israel's 2008-2009 mi...,Hamas is guilty of war crimes,train
578,578,1,This house believes that the Catholic Church i...,"the sexual act must ""retain its intrinsic rela...",train
1899,1899,0,This house would implement playoffs in collegi...,numerous FBS Conferences have expressed their ...,train
...,...,...,...,...,...
731,731,0,This house would fund education using a vouche...,public education would be destroyed by tuition...,train
1465,1465,0,This house believes that Israel's 2008-2009 mi...,the livelihoods and assets of tens of thousand...,train
2154,2154,0,This house would never have children,there ought to be a higher rate of population ...,train
2373,2373,1,This house would promote democratization,majority rule is preferable to other systems,train


In [33]:
ibmcs.loc[dev.index, "set"] = "dev"

In [34]:
Counter(ibmcs["set"])

Counter({'test': 1355, 'train': 935, 'dev': 104})

In [35]:
len(ibmcs[ibmcs["stance"]==1])/len(ibmcs)

0.5534670008354219

In [36]:
ibmcs.to_csv("data/ibmcs_all.csv")

In [37]:
# ibmcs[ibmcs["set"]=="train"].to_csv("data/ibmcs_train.csv")
# ibmcs[ibmcs["set"]=="dev"].to_csv("data/ibmcs_dev.csv")
# ibmcs[ibmcs["set"]=="test"].to_csv("data/ibmcs_test.csv")

## Perspectrum

In [38]:
# https://github.com/UKPLab/mdl-stance-robustness/blob/master/data_utils/glue_utils.py#L647
with open("data/PERSPECTRUM/dataset_split_v1.0.json", "r") as split_in, \
        open("data/PERSPECTRUM/perspectrum_with_answers_v1.0.json","r") as claims_in, \
        open("data/PERSPECTRUM/perspective_pool_v1.0.json", "r") as perspectives_in:

        # load files
        data_split = json.load(split_in)
        claims = json.load(claims_in)
        perspectives = json.load(perspectives_in)

# lookup for perspective ids
perspectives_dict = {}
for p in perspectives:
    perspectives_dict[p['pId']] = p['text']

# init
X_train, X_dev, X_test = [], [], []
y_train, y_dev, y_test = [], [], []

count_sup_cluster = 0
count_und_cluster = 0

# fill train/dev/test
output_list = list()
for claim in claims:
    cId = str(claim['cId'])
    for p_cluster in claim['perspectives']:
        if p_cluster['stance_label_3'] == "SUPPORT":
            cluster_label = 1
        else:
            cluster_label = 0
        for pid in p_cluster['pids']:
            # print(pid, cId, cluster_label, claim["text"],perspectives_dict[pid], data_split[cId]) 
            output_list.append({"uid": pid, "stance": cluster_label, "claim": claim["text"], "premise": perspectives_dict[pid], "set": data_split[cId]})
        #     if data_split[cId] == 'train':
        #         X_train.append((claim['text'], perspectives_dict[pid]))
        #         y_train.append(cluster_label)
        #     elif data_split[cId] == 'dev':
        #         X_dev.append((claim['text'], perspectives_dict[pid]))
        #         y_dev.append(cluster_label)
        #     elif data_split[cId] == 'test':
        #         X_test.append((claim['text'], perspectives_dict[pid]))
        #         y_test.append(cluster_label)
        #     else:
        #         print("Incorrect set type: "+data_split[claim['cId']])
        # if cluster_label == 1:
        #     count_sup_cluster += 1
        # if cluster_label == 0:
        #     count_und_cluster += 1


perspectrum = pd.DataFrame(output_list)

# print(X_train, y_train, X_dev, y_dev, X_test, y_test)

In [39]:
Counter(perspectrum["set"])

Counter({'train': 6978, 'test': 2773, 'dev': 2071})

In [40]:
perspectrum.to_csv("data/perspectrum_all.csv")
# perspectrum[perspectrum["set"]=="train"].to_csv("data/perspectrum_train.csv")
# perspectrum[perspectrum["set"]=="dev"].to_csv("data/perspectrum_dev.csv")
# perspectrum[perspectrum["set"]=="test"].to_csv("data/perspectrum_test.csv")