In [1]:
from __future__ import print_function
import educe.rst_dt
from educe.corpus import FileId

# corpus location, expressed relative to the educe docs directory
data_dir = '../data'
# the trailing empty component keeps the final '/' on the directory path
rst_corpus_dir = '/'.join([
    data_dir,
    'rst_discourse_treebank',
    'data',
    'RSTtrees-WSJ-main-1.0',
    'TEST',
    '',
])

# build a reader over the WSJ TEST directory and load every document it finds
rst_reader = educe.rst_dt.Reader(rst_corpus_dir)
rst_corpus = rst_reader.slurp(verbose=True)

Slurping corpus dir [38/38 done]


In [2]:
#create the pipe-delimited csv that receives the parsed EDU pairs and their relations
import json
import csv
data = {}

#column names, written once as the first row of the file
header = ['Filename', 'EDU1', 'EDU2', 'Relation']

#the handle stays open on purpose: the rows are appended through `writer` afterwards
f = open('training_data_new.csv', 'w')
writer = csv.writer(f, delimiter='|')
writer.writerow(list(header))

In [3]:
#obtain subtrees and context for each key, writing one csv row per non-leaf subtree
#
# Each row is [document name, text of the first EDU, concatenated text of the
# remaining EDUs, relation label].
# NOTE(review): the first leaf is labelled "nucleus" and the rest "satellites"
# purely by position -- confirm this matches the intended nuclearity.
#
# The finally clause closes the csv handle opened earlier in the notebook so
# that every buffered row is on disk before the file is read back with pandas.
# Re-run the cell that opens the file before re-running this one.
try:
    for key in rst_corpus.keys():
        ex_key=educe.rst_dt.mk_key(key.doc)
        ex_doc = rst_corpus[ex_key]
        ex_context = ex_doc.label().context

        #go through all subtrees and extract relations and edus
        for ex_subtree in ex_doc.subtrees():
            subtree = educe.rst_dt.SimpleRSTTree.from_rst_tree(ex_subtree)

            #extract relation
            relation = subtree.label().rel

            #nodes labelled 'leaf' produce no row
            if relation == 'leaf':
                continue

            #extract edus
            edus = subtree.leaves()

            # Keep everything after the FIRST ")" of each EDU's printed form
            # (presumably a parenthesised prefix -- TODO confirm against educe).
            # maxsplit=1 matters: a plain split(")")[1] would cut the text off
            # at the next ")" whenever the EDU itself contains a parenthesis.
            texts = [str(edu).split(")", 1)[1].replace('\n', ' ') for edu in edus]

            #format the first and second text spans
            nucleus = texts[0]
            satellites = ''.join(texts[1:])

            #append the row to the csv file
            writer.writerow([key.doc, nucleus, satellites, relation])
finally:
    f.close()


In [6]:
import pandas as pd
import numpy as np

In [7]:
#read the pipe-separated csv of EDU pairs and relations back in as a DataFrame
df = pd.read_csv('training_data_new.csv', sep='|')
df

Unnamed: 0,Filename,EDU1,EDU2,Relation
0,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,evidence
1,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,interpretation-s
2,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,consequence-s
3,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,elaboration-additional
4,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,circumstance
5,wsj_0632.out,as GM got early clearance from the Federal Tr...,to boost its stake in the British luxury car ...,elaboration-object-attribute-e
6,wsj_0632.out,GM confirmed Friday,that it received permission late Thursday fro...,Sequence
7,wsj_0632.out,GM confirmed Friday,that it received permission late Thursday fro...,Inverted-Sequence
8,wsj_0632.out,GM confirmed Friday,that it received permission late Thursday fro...,attribution
9,wsj_0632.out,that it received permission late Thursday fro...,to increase its Jaguar holdings past the $15 ...,elaboration-object-attribute-e


In [8]:
#map all relations to their relation classes
# Keys are the labels expected in the "Relation" column; values are the class
# each label is grouped under. Entries are grouped by class for readability.
relation_class = {
    # no relation ('NaN' stands in for a missing relation, see the mapping below)
    'NaN': 'no relation',
    'TextualOrganization': 'no relation',
    'Same-Unit': 'no relation',

    # attribution
    'attribution': 'attribution',
    'attribution-e': 'attribution',
    'attribution-n': 'attribution',
    'attribution-negative': 'attribution',

    # background
    'background': 'background',
    'background-e': 'background',
    'circumstance': 'background',
    'circumstance-e': 'background',

    # cause
    'Cause-Result': 'cause',
    'cause': 'cause',
    'result': 'cause',
    'result-e': 'cause',
    'Consequence': 'cause',
    'consequence-n': 'cause',
    'consequence-s': 'cause',
    'consequence-n-e': 'cause',
    'consequence-s-e': 'cause',

    # comparison
    'Comparison': 'comparison',
    'comparison': 'comparison',
    'comparison-e': 'comparison',
    'preference': 'comparison',
    'preference-e': 'comparison',
    'analogy': 'comparison',
    'analogy-e': 'comparison',
    'Analogy': 'comparison',
    'Proportion': 'comparison',

    # condition
    'condition': 'condition',
    'condition-e': 'condition',
    'hypothetical': 'condition',
    'contingency': 'condition',
    'otherwise': 'condition',
    'Otherwise': 'condition',

    # contrast
    'Contrast': 'contrast',
    'concession': 'contrast',
    'concession-e': 'contrast',
    'antithesis': 'contrast',
    'antithesis-e': 'contrast',

    # elaboration
    'elaboration-additional': 'elaboration',
    'elaboration-additional-e': 'elaboration',
    'elaboration-general-specific': 'elaboration',
    'elaboration-general-specific-e': 'elaboration',
    'elaboration-object-attribute': 'elaboration',
    'elaboration-object-attribute-e': 'elaboration',
    'elaboration-part-whole': 'elaboration',
    'elaboration-part-whole-e': 'elaboration',
    'elaboration-process-step': 'elaboration',
    'elaboration-process-step-e': 'elaboration',
    'elaboration-set-member': 'elaboration',
    'elaboration-set-member-e': 'elaboration',
    'example': 'elaboration',
    'example-e': 'elaboration',
    'definition': 'elaboration',
    'definition-e': 'elaboration',

    # enablement
    'purpose': 'enablement',
    'purpose-e': 'enablement',
    'enablement': 'enablement',
    'enablement-e': 'enablement',

    # evaluation
    'evaluation': 'evaluation',
    'evaluation-s': 'evaluation',
    'evaluation-n': 'evaluation',
    'evaluation-s-e': 'evaluation',
    'Evaluation': 'evaluation',
    'Interpretation': 'evaluation',
    'interpretation-s': 'evaluation',
    'interpretation-s-e': 'evaluation',
    'interpretation-n': 'evaluation',
    'conclusion': 'evaluation',
    'comment': 'evaluation',
    'comment-e': 'evaluation',

    # explanation
    'evidence': 'explanation',
    'evidence-e': 'explanation',
    'explanation-argumentative': 'explanation',
    'explanation-argumentative-e': 'explanation',
    'Reason': 'explanation',
    'reason': 'explanation',
    'reason-e': 'explanation',

    # joint
    'List': 'joint',
    'Disjunction': 'joint',

    # manner-means
    'manner': 'manner-means',
    'manner-e': 'manner-means',
    'means': 'manner-means',
    'means-e': 'manner-means',

    # topic-comment
    'Problem-Solution': 'topic-comment',
    'problem-solution-e': 'topic-comment',
    'problem-solution-s': 'topic-comment',
    'problem-solution-n': 'topic-comment',
    'Topic-Comment': 'topic-comment',
    'Comment-Topic': 'topic-comment',
    'question-answer-s': 'topic-comment',
    'question-answer-n': 'topic-comment',
    'Question-Answer': 'topic-comment',
    'Statement-Response': 'topic-comment',
    'statement-response-n': 'topic-comment',
    'statement-response-s': 'topic-comment',
    'statement-response-e': 'topic-comment',
    'rhetorical-question': 'topic-comment',

    # summary
    'summary-n': 'summary',
    'summary-s': 'summary',
    'restatement': 'summary',
    'restatement-e': 'summary',

    # temporal
    'temporal-before': 'temporal',
    'temporal-before-e': 'temporal',
    'temporal-after': 'temporal',
    'temporal-after-e': 'temporal',
    'Temporal-Same-Time': 'temporal',
    'temporal-same-time': 'temporal',
    'temporal-same-time-e': 'temporal',
    'Sequence': 'temporal',
    'Inverted-Sequence': 'temporal',

    # topic-change
    'Topic-Shift': 'topic-change',
    'Topic-Drift': 'topic-change',
    'topic-drift': 'topic-change',
    'topic-drift-e': 'topic-change',
    'topic-shift': 'topic-change',
}

#set the relation classes
# read_csv loads a missing relation as float NaN, which never equals the string
# key 'NaN'; fill it with that placeholder first so such rows are classed as
# 'no relation'. Labels that have no entry in the mapping still come out as NaN.
df["Relation Class"] = df["Relation"].fillna('NaN').map(relation_class)

#export the preprocessed data to a csv file
# to_csv returns None when given a path, so export_csv carries no data
export_csv = df.to_csv (r'processed_training_data-f.csv', index = None, header=True, sep = '|')

df

Unnamed: 0,Filename,EDU1,EDU2,Relation,Relation Class
0,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,evidence,explanation
1,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,interpretation-s,evaluation
2,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,consequence-s,cause
3,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,elaboration-additional,elaboration
4,wsj_0632.out,General Motors Corp. and Ford Motor Co. are n...,as GM got early clearance from the Federal Tr...,circumstance,background
5,wsj_0632.out,as GM got early clearance from the Federal Tr...,to boost its stake in the British luxury car ...,elaboration-object-attribute-e,elaboration
6,wsj_0632.out,GM confirmed Friday,that it received permission late Thursday fro...,Sequence,temporal
7,wsj_0632.out,GM confirmed Friday,that it received permission late Thursday fro...,Inverted-Sequence,temporal
8,wsj_0632.out,GM confirmed Friday,that it received permission late Thursday fro...,attribution,attribution
9,wsj_0632.out,that it received permission late Thursday fro...,to increase its Jaguar holdings past the $15 ...,elaboration-object-attribute-e,elaboration


In [27]:
#show how many rows fall into each relation class, most frequent first
# dropna=True is the default: rows whose relation had no entry in the mapping
# (NaN class) are left out of these counts
print(df['Relation Class'].value_counts(dropna=True))


elaboration      796
attribution      343
contrast         146
joint            140
no relation      133
background       111
explanation      110
cause             82
evaluation        80
temporal          68
condition         48
enablement        46
summary           32
comparison        29
manner-means      27
topic-comment     24
topic-change       7
Name: Relation Class, dtype: int64
