In [58]:
import glob
import os
import re

import numpy as np
import pandas as pd

import spacy
spacy.load('en')
from spacy.lang.en import English

In [59]:
path = 'rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/'

In [60]:
""" Load Data
    
    Outputs
    ----------
    data_list : list within list
        outer list contains documents
        inner list contains edus in each document
        [[edu_0, edu_1],[edu_0, edu_1]]
    
    edu_filenames : filenames corresponeding to outer list of data_list

"""

filename_list = []
data_list = []
innerlist = []

# define desired replacements here
rep = {'"': '', '(': '', ')':'', '. . .':' ', '**':'', 
       'Prof.':'Prof', '-- ':'', 'Alex. Brown & Sons':'Alex-Brown & Sons', 'No.':'number', 'G.m.b. H.':'GmbH',
       'Canada First!':'Canada First', 'Cos.':'Companies', 'Sens.':'Senators', 'Col.':'Colonel','Prop.':'Proposition',
       '` The Art of the Deal.':'\'The Art of the Deal.\'','\' Why doesn':'Why doesn',
       '`dolce! dolce!\' {`sweet! sweet!\'}.':'\'dolce dolce sweet sweet\'', 'S.p. A.':'SpA', 'Pty.':'Proprietary',
       'Westamerica Bancorp.':'Westamerica Bancorporation', 'Ing. C. Olivetti & Co.':'Olivetti SpA',
       'Tex.':'Texas', 'Cie.':'Compagnie', 'Sept. 30-Oct. 4':'Sept. 30-October 4', 
       'Mort! Mort! Mort!':'\'Mort Mort Mort\'',
       '1. Use':'Use', '2. Provide':'Provide','3. Record':'Record', '4. Impose':'Impose',
      '1. As part':'As part', '2. In seeking':'In seeking', '3. In deciding':'In deciding', '4. When a RICO':'When a RICO'} 
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))

# for each file in path folder
for filename in glob.glob(os.path.join(path, '*.edus')):
    filename_list.append(filename)
    # open file
    with open(filename) as my_file: 
        for line in my_file:
            text = line
            if ("contributed to this article") in text or ("Year ago figure is restated" in text):
                continue
            text = pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
#             innerlist.append(((line.lstrip()).rstrip()).replace('"',''))
#             innerlist.append((line.lstrip()).rstrip())
            innerlist.append((text.lstrip()).rstrip())
    data_list.append(innerlist)
    innerlist = []
    
edu_filenames = [s.strip('rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/') for s in filename_list]

In [61]:
print("Files processed: ", len(edu_filenames))
print("Data length: ", len(data_list))

Files processed:  347
Data length:  347


In [62]:
# See Example
print(edu_filenames[1])
data_list[1]

wsj_0668


['ABB Asea Brown Boveri B.V. said',
 'it signed a contract for the largest-ever power plant order in the Netherlands.',
 'ABB said',
 'the contract,',
 'signed with the Dutch utility N.V. Energieproduktiebedrijf UNA,',
 'is valued in excess of $200 million.',
 'The accord is for a turbogenerator plant at the coal-fired power station Hemweg in Amsterdam.',
 'ABB Asea Brown Boveri is the Dutch unit of the Swedish-Swiss electrical engineering group ABB Asea Brown Boveri AG.',
 'ABB said',
 'a significant portion of the order will be placed with Dutch subcontractors,',
 'adding',
 'that a group has been set up for this purpose.',
 'The Dutch utility firm serves the Amsterdam and Utrecht areas.',
 'The planned turbogenerator plant is expected to go into operation in 1994.']

In [63]:
' '.join(data_list[1])

'ABB Asea Brown Boveri B.V. said it signed a contract for the largest-ever power plant order in the Netherlands. ABB said the contract, signed with the Dutch utility N.V. Energieproduktiebedrijf UNA, is valued in excess of $200 million. The accord is for a turbogenerator plant at the coal-fired power station Hemweg in Amsterdam. ABB Asea Brown Boveri is the Dutch unit of the Swedish-Swiss electrical engineering group ABB Asea Brown Boveri AG. ABB said a significant portion of the order will be placed with Dutch subcontractors, adding that a group has been set up for this purpose. The Dutch utility firm serves the Amsterdam and Utrecht areas. The planned turbogenerator plant is expected to go into operation in 1994.'

In [64]:
""" For each document --> converts edus in each document to paragraphs, then sentences
    
    Outputs
    ----------
    data_sentences : list within list
        outer list contains documents
        inner list contains the sentences in each document
        [[sentence_1, sentence_1],[sentence_0, sentence_1]]
"""
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# doc = nlp('Iterate over the sentences in the document. Sentence spans have no label. To improve accuracy on informal texts, spaCy calculates sentence boundaries from the syntactic dependency parse. If the parser is disabled, the sents iterator will be unavailable.')
# https://github.com/explosion/spaCy/issues/2708

data_sentences = []
for document in data_list:
    doc = nlp(' '.join(document))
    doc_sentences = []
    for sent in doc.sents:
        doc_sentences.append(sent.text)
    data_sentences.append(doc_sentences)

In [65]:
len(data_sentences)

347

In [67]:
""" For each document --> get list of edus/sentence
    
    Outputs
    ----------
    data_doc_edus : list within list within list
        outer list contains documents
        middle list contains the sentences in each document
        inner list contains edus in each sentence 
        [document1[sentence1[edu_0, edu_1],sentence2[edu_0, edu_1]], document2[[],[]]]
"""
doc_counter = 0
sent_counter = 0
data_doc_edus = []
notworking_counter = 0
listdidntwork = []
for doc_counter, doc in enumerate(data_sentences):
#     print("------------------- DOCUMENT: ", doc_counter, " -------------------")
#     print(" ")
    edus_in_doc = data_list[doc_counter].copy()
    doc_edus = []
    for sent_counter, sentence in enumerate(doc):
        didntwork = False            
#         print('SENTENCE ',sent_counter,': ',sentence)
#         print(" ")
        edu_check = []
        edu_counter = 0
        for edu_counter, edu in enumerate(edus_in_doc):
            edu_check.append(edu)
            if ' '.join(edu_check) == sentence:
                for item in edu_check:
                    edus_in_doc.remove(item)
#                     print('SENTENCE:', edu_counter, item)
#                 print(edu_check)
                doc_edus.append(edu_check)
                break
            if edu_counter == (len(edus_in_doc)-1):
#                 print("ALL EDUS CONSIDERED")
                didntwork = True
                notworking_counter+=1
        if sentence.endswith('"'):
            listdidntwork.append([sent_counter, sentence, didntwork])
#         print(" ")
#         print(" ")
    data_doc_edus.append(doc_edus)

In [68]:
notworking_counter

0

In [69]:
len(data_doc_edus)

347

In [70]:
sentence_counter = 0
document_counter = 0
row_data = []
document_testing = []
for document_counter, document in enumerate(data_doc_edus):
    for sentence_counter, sent in enumerate(document):
        for first, second in zip(sent, sent[1:]):
            document_testing.append(document_counter)
            row_data.append([edu_filenames[document_counter],document_counter, sentence_counter, first, second])

In [71]:
headers = ['File','Document', 'Sentence', 'EDU1', 'EDU2']

In [72]:
df_data = pd.DataFrame(row_data, columns=headers)
df_data

Unnamed: 0,File,Document,Sentence,EDU1,EDU2
0,wsj_0669,0,0,Nissan Motor Co. expects net income to reach 1...,U.S. $857 million
1,wsj_0669,0,0,U.S. $857 million,"in its current fiscal year, up from 114.6 bill..."
2,wsj_0669,0,0,"in its current fiscal year, up from 114.6 bill...","Yutaka Kume, president, said."
3,wsj_0669,0,1,Mr. Kume made the earnings projection for fisc...,"ending next March 31,"
4,wsj_0669,0,1,"ending next March 31,",in an interview with U.S. automotive writers
5,wsj_0669,0,1,in an interview with U.S. automotive writers,attending the Tokyo Motor Show.
6,wsj_0669,0,2,The executive said,that the anticipated earnings increase is fair...
7,wsj_0669,0,2,that the anticipated earnings increase is fair...,because Nissan is spending heavily
8,wsj_0669,0,2,because Nissan is spending heavily,to bolster its dealership network in Japan
9,wsj_0669,0,2,to bolster its dealership network in Japan,and because of currency-exchange fluctuations.


In [518]:
df_data.to_csv('EDU_pairs_TRAINING.csv', index=False)

#### Load Data/Corpus Statistics

In [15]:
results = pd.read_csv('EDU_pairs_TRAINING.csv')

In [10]:
newdf = results.loc[results.Document == 1]

In [11]:
newdf.reset_index(drop=True)

Unnamed: 0,File,Document,Sentence,EDU1,EDU2
0,wsj_0668,1,0,ABB Asea Brown Boveri B.V. said,it signed a contract for the largest-ever powe...
1,wsj_0668,1,1,ABB said,"the contract,"
2,wsj_0668,1,1,"the contract,",signed with the Dutch utility N.V. Energieprod...
3,wsj_0668,1,1,signed with the Dutch utility N.V. Energieprod...,is valued in excess of $200 million.
4,wsj_0668,1,4,ABB said,a significant portion of the order will be pla...
5,wsj_0668,1,4,a significant portion of the order will be pla...,adding
6,wsj_0668,1,4,adding,that a group has been set up for this purpose.


In [13]:
"ABB Asea Brown Boveri B.V. said it signed a contract for the largest-ever power plant order in the Netherlands."
"ABB said the contract, signed with the Dutch utility N.V. Energieproduktiebedrijf UNA, is valued in excess of $200 million."
"The accord is for a turbogenerator plant at the coal-fired power station Hemweg in Amsterdam." 
"ABB Asea Brown Boveri is the Dutch unit of the Swedish-Swiss electrical engineering group ABB Asea Brown Boveri AG." 
"ABB said a significant portion of the order will be placed with Dutch subcontractors, adding that a group has been set up for this purpose." 
"The Dutch utility firm serves the Amsterdam and Utrecht areas. "
"The planned turbogenerator plant is expected to go into operation in 1994."

'The Dutch utility firm serves the Amsterdam and Utrecht areas. The planned turbogenerator plant is expected to go into operation in 1994.'

In [25]:
results.File.describe()

count       12083
unique        308
top       wsj_116
freq          187
Name: File, dtype: object

In [56]:
results.Sentence.describe()

count    12083.000000
mean        18.720930
std         17.679687
min          0.000000
25%          5.000000
50%         14.000000
75%         27.000000
max        101.000000
Name: Sentence, dtype: float64

In [23]:
len(results.Document.unique())

342

In [24]:
len(results.Document.unique())

342

In [44]:
alldocs = list(np.arange(347))

In [47]:
docsinfinal = list(set(document_testing))

In [50]:
len(docsinfinal)

342

In [54]:
docs_not_in_final = [item for item in alldocs if item not in docsinfinal]

In [55]:
for fileix in docs_not_in_final:
    print(edu_filenames[fileix])

wsj_1132
wsj_1117
wsj_1369
wsj_1127
wsj_118
