Here we will extract the data from 315 files, which are spread across three different databases (ParCor, ACLPro and WikiPro).  
All files are in XML format.  
We will use BeautifulSoup to extract the data.


In [1]:
import pickle
from bs4 import BeautifulSoup

In [2]:
##This is the general structure of the relevant corpora and the fields that will be useful:
# ParCor (http://opus.lingfil.uu.se/ParCor/)

# <word id="word_1">Social</word>

# <markable 
# id="markable_10849" 
# span="word_30477..word_30480" position="none"  coref_class="empty"  mmax_level="coref"  
# nptype="np"  agreement="none"  
# mention="np"  type="none"  sub_apposition="none" />

# ACLPro (http://www.aclweb.org/anthology/C12-2103)

# <word id="word_1">Social</word>

# <markable 
# id="markable_825" 
# span="word_1438..word_1443" 
# coref_class="set_114"  mmax_level="coref"  sure="yes"  
# np_form="ne" />


# WikiPro (http://rali.iro.umontreal.ca/rali/?q=en/wikicoref)

# <word id="word_1">Aberfoyle</word>

# <markable 
# id="markable_318" 
# span="word_1..word_1" 
# coref_class="set_47" topic="http://rdf.freebase.com/ns/m.010kl" coreftype="ident" 
# mentiontype="ne"  mmax_level="coref" />


## Creating lists of all the file names to easily scrape them

Two types of files are needed: the word files and the annotation files.

In [3]:
# Capture directory listings through IPython's `!` shell escape; each result is a list of names.
# EU Bookshop: the entries of the Annotator1 directory (used below as one folder per document).
parcor_EU = !ls Kojak_Databases/ParCorPro/Annotated_Texts/EU_Bookshop/English/Annotator1
# TED: the word files and the coref annotation files are listed from two separate directories.
parcor_TED_words = !ls Kojak_Databases/ParCorPro/Annotated_Texts/TED/English/Annotator1/Basedata
parcor_TED_ann = !ls Kojak_Databases/ParCorPro/Annotated_Texts/TED/English/Annotator1/Markables/Coref

In [4]:
# Display the EU Bookshop listing to see what the entries look like.
parcor_EU

['KEBC11002ENE',
 'KEBC12001ENE',
 'KH7911105ENE',
 'MI3112464ENE',
 'MJ3011331ENE',
 'NA3211776ENE',
 'QE3011322ENE',
 'QE3211790ENE']

In [5]:
# Path prefixes for the two ParCor sub-corpora.
eu_root = 'Kojak_Databases/ParCorPro/Annotated_Texts/EU_Bookshop/English/Annotator1/'
ted_words_root = 'Kojak_Databases/ParCorPro/Annotated_Texts/TED/English/Annotator1/Basedata/'
ted_ann_root = 'Kojak_Databases/ParCorPro/Annotated_Texts/TED/English/Annotator1/Markables/Coref/'

# EU Bookshop: each listed name is a folder holding <name>_words.xml and <name>_coref_level.xml.
parcor_word_files = [eu_root + doc + '/Basedata/' + doc + '_words.xml' for doc in parcor_EU]
parcor_ann_files = [eu_root + doc + '/Markables/' + doc + '_coref_level.xml' for doc in parcor_EU]

# TED: the listings already hold file names, so only the directory prefix is added.
# The TED entries go after the EU entries in both lists.
# NOTE(review): word and annotation files are matched purely by list position, which
# assumes the two TED listings come back in the same document order -- confirm.
parcor_word_files += [ted_words_root + name for name in parcor_TED_words]
parcor_ann_files += [ted_ann_root + name for name in parcor_TED_ann]

In [6]:
# List the entries of the three ACLPro annotation sections (C, D and P) via IPython's `!` shell escape.
aclpro_C = !ls Kojak_Databases/ACLPro/annotation/C
aclpro_D = !ls Kojak_Databases/ACLPro/annotation/D
aclpro_P = !ls Kojak_Databases/ACLPro/annotation/P

In [7]:
# Build the ACLPro word-file and annotation-file path lists.
# The three sections share the same layout, so a single nested loop replaces the
# three copy-pasted loops. Order is unchanged: all of C, then all of D, then all of P,
# and the two lists stay aligned index by index.
aclpro_word_files = []
aclpro_ann_files = []

for section, doc_names in (('C', aclpro_C), ('D', aclpro_D), ('P', aclpro_P)):
    for doc_name in doc_names:
        # Each document lives in its own folder named after the document.
        doc_dir = 'Kojak_Databases/ACLPro/annotation/' + section + '/' + doc_name
        aclpro_word_files.append(doc_dir + '/Basedata/' + doc_name + '_words.xml')
        aclpro_ann_files.append(doc_dir + '/Markables/' + doc_name + '_coref_level.xml')


In [8]:
# List the entries of the WikiPro annotation directory via IPython's `!` shell escape.
wikipro = !ls Kojak_Databases/WikiPro/Annotation

In [9]:
# Each WikiPro entry is a folder holding <name>_words.xml and <name>_coref_level.xml.
wikipro_root = 'Kojak_Databases/WikiPro/Annotation/'

# Both lists follow the order of the `wikipro` listing, so they stay aligned by index.
wikipro_word_files = [wikipro_root + article + '/Basedata/' + article + '_words.xml'
                      for article in wikipro]
wikipro_ann_files = [wikipro_root + article + '/Markables/' + article + '_coref_level.xml'
                     for article in wikipro]

In [10]:
# Spot-check the first generated WikiPro word-file path.
wikipro_word_files[0]

'Kojak_Databases/WikiPro/Annotation/Aberfoyle, Stirling/Basedata/Aberfoyle, Stirling_words.xml'

## Scraping the data from all the files 

In [13]:
# Parse every ParCor word file into a nested dictionary:
#   parcor_all_words[doc_number][word_number] -> {'word', 'place', 'doc'}
# Both numbers are 1-based; documents follow the order of parcor_word_files and
# words follow the order of the <word> elements inside each file.
parcor_all_words = {}
for doc_num, path in enumerate(parcor_word_files, start=1):
    # The context manager closes the file handle as soon as parsing is done
    # (the original left every opened file unclosed).
    with open(path) as xml_file:
        word_list_xml = BeautifulSoup(xml_file, "xml")

    doc_words = {}
    for word_num, word in enumerate(word_list_xml.find_all('word'), start=1):
        # 'place' is rebuilt from the running counter rather than read from the
        # element's id attribute.
        # NOTE(review): this matches the "word_N" ids used in the annotation spans only
        # if each file numbers its words consecutively from word_1 -- confirm.
        doc_words[word_num] = {'word': word.text,
                               'place': "word_" + str(word_num),
                               'doc': doc_num}
    parcor_all_words[doc_num] = doc_words

In [14]:
# Parse every ACLPro word file into a nested dictionary:
#   aclpro_all_words[doc_number][word_number] -> {'word', 'place', 'doc'}
# Both numbers are 1-based; documents follow the order of aclpro_word_files and
# words follow the order of the <word> elements inside each file.
aclpro_all_words = {}
for doc_num, path in enumerate(aclpro_word_files, start=1):
    # The context manager closes the file handle as soon as parsing is done
    # (the original left every opened file unclosed).
    with open(path) as xml_file:
        word_list_xml = BeautifulSoup(xml_file, "xml")

    doc_words = {}
    for word_num, word in enumerate(word_list_xml.find_all('word'), start=1):
        # 'place' is rebuilt from the running counter rather than read from the
        # element's id attribute.
        # NOTE(review): this matches the "word_N" ids used in the annotation spans only
        # if each file numbers its words consecutively from word_1 -- confirm.
        doc_words[word_num] = {'word': word.text,
                               'place': "word_" + str(word_num),
                               'doc': doc_num}
    aclpro_all_words[doc_num] = doc_words

In [15]:
# Parse every WikiPro word file into a nested dictionary:
#   wikipro_all_words[doc_number][word_number] -> {'word', 'place', 'doc'}
# Both numbers are 1-based; documents follow the order of wikipro_word_files and
# words follow the order of the <word> elements inside each file.
wikipro_all_words = {}
for doc_num, path in enumerate(wikipro_word_files, start=1):
    # The context manager closes the file handle as soon as parsing is done
    # (the original left every opened file unclosed).
    with open(path) as xml_file:
        word_list_xml = BeautifulSoup(xml_file, "xml")

    doc_words = {}
    for word_num, word in enumerate(word_list_xml.find_all('word'), start=1):
        # 'place' is rebuilt from the running counter rather than read from the
        # element's id attribute.
        # NOTE(review): this matches the "word_N" ids used in the annotation spans only
        # if each file numbers its words consecutively from word_1 -- confirm.
        doc_words[word_num] = {'word': word.text,
                               'place': "word_" + str(word_num),
                               'doc': doc_num}
    wikipro_all_words[doc_num] = doc_words

In [16]:
# This is the structure of a ParCor markable:
# <markable
# id="markable_10849"
# span="word_30477..word_30480"
# position="none"
# coref_class="empty"
# mmax_level="coref"
# nptype="np"
# agreement="none"
# mention="np"
# type="none"
# sub_apposition="none" />

In [17]:
# Parse every ParCor annotation file into a nested dictionary:
#   parcor_all_ann[doc_number][markable_number] -> dict of markable attributes
# Both numbers are 1-based; documents follow the order of parcor_ann_files and
# markables follow the order of the <markable> elements inside each file.
parcor_all_ann = {}
for doc_num, path in enumerate(parcor_ann_files, start=1):
    # The context manager closes the file handle as soon as parsing is done
    # (the original left every opened file unclosed).
    with open(path) as xml_file:
        ann_list_xml = BeautifulSoup(xml_file, "xml")

    doc_anns = {}
    for ann_num, ref in enumerate(ann_list_xml.find_all('markable'), start=1):
        # id, span, coref_class, mmax_level and mention are required: a markable
        # without one of them raises KeyError, exactly as before.
        # nptype, agreement, type and sub_apposition are optional and default to ''.
        # Tag.get() replaces the bare `except: pass` blocks, so only a missing
        # attribute is tolerated and every other error is no longer swallowed.
        this = {"id": ref['id'],
                "span": ref['span'],
                "coref_class": ref['coref_class'],
                "mmax_level": ref['mmax_level'],
                "nptype": ref.get('nptype', ''),
                "agreement": ref.get('agreement', ''),
                "mention": ref['mention'],
                "type": ref.get('type', ''),
                "sub_apposition": ref.get('sub_apposition', ''),
                # Only the first word of the span is kept, to have a one-to-one
                # mapping between a markable and a word.
                "first_span": ref['span'].split('..')[0],
                "doc": doc_num}
        doc_anns[ann_num] = this
    parcor_all_ann[doc_num] = doc_anns

In [18]:
#this is the structure of aclpro
# <markable 
# id="markable_67" 
# span="word_656..word_657" 
# coref_class="set_3"  
# mmax_level="coref"  
# sure="yes"  
# np_form="def-np" />

In [19]:
# Parse every ACLPro annotation file into a nested dictionary:
#   aclpro_all_ann[doc_number][markable_number] -> dict of markable attributes
# Both numbers are 1-based; documents follow the order of aclpro_ann_files and
# markables follow the order of the <markable> elements inside each file.
aclpro_all_ann = {}
for doc_num, path in enumerate(aclpro_ann_files, start=1):
    # The context manager closes the file handle as soon as parsing is done
    # (the original left every opened file unclosed).
    with open(path) as xml_file:
        ann_list_xml = BeautifulSoup(xml_file, "xml")

    doc_anns = {}
    for ann_num, ref in enumerate(ann_list_xml.find_all('markable'), start=1):
        # Every attribute is required here: a markable missing one raises KeyError.
        this = {"id": ref['id'],
                "span": ref['span'],
                "coref_class": ref['coref_class'],
                "mmax_level": ref['mmax_level'],
                "sure": ref['sure'],
                "np_form": ref['np_form'],
                # Only the first word of the span is kept, to have a one-to-one
                # mapping between a markable and a word.
                "first_span": ref['span'].split('..')[0],
                "doc": doc_num}
        doc_anns[ann_num] = this
    aclpro_all_ann[doc_num] = doc_anns

In [21]:
# Spot-check the first markable of the first ACLPro document.
aclpro_all_ann[1][1]

{'coref_class': 'set_114',
 'doc': 1,
 'first_span': 'word_1438',
 'id': 'markable_825',
 'mmax_level': 'coref',
 'np_form': 'ne',
 'span': 'word_1438..word_1443',
 'sure': 'yes'}

In [22]:
#this is the structure of wikipro
# <markable 
# id="markable_318" 
# span="word_1..word_1" 
# coref_class="set_47" 
# topic="http://rdf.freebase.com/ns/m.010kl" 
# coreftype="ident" 
# mentiontype="ne" 
# mmax_level="coref" />

In [23]:
# Parse every WikiPro annotation file into a nested dictionary:
#   wikipro_all_ann[doc_number][markable_number] -> dict of markable attributes
# Both numbers are 1-based; documents follow the order of wikipro_ann_files and
# markables follow the order of the <markable> elements inside each file.
wikipro_all_ann = {}
for doc_num, path in enumerate(wikipro_ann_files, start=1):
    # The context manager closes the file handle as soon as parsing is done
    # (the original left every opened file unclosed).
    with open(path) as xml_file:
        ann_list_xml = BeautifulSoup(xml_file, "xml")

    doc_anns = {}
    for ann_num, ref in enumerate(ann_list_xml.find_all('markable'), start=1):
        # Every attribute is required here: a markable missing one raises KeyError.
        # The original pre-filled the dict with '' and then overwrote every value
        # unconditionally, so the defaults were never used and are dropped.
        this = {"id": ref['id'],
                "span": ref['span'],
                "coref_class": ref['coref_class'],
                "topic": ref['topic'],
                "coreftype": ref['coreftype'],
                "mentiontype": ref['mentiontype'],
                "mmax_level": ref['mmax_level'],
                "doc": doc_num,
                # Only the first word of the span is kept, to have a one-to-one
                # mapping between a markable and a word.
                "first_span": ref['span'].split('..')[0]}
        doc_anns[ann_num] = this
    wikipro_all_ann[doc_num] = doc_anns

In [24]:
# Spot-check the tenth markable of the ninth WikiPro document.
wikipro_all_ann[9][10]

{'coref_class': 'set_14',
 'coreftype': 'ident',
 'doc': 9,
 'first_span': 'word_93',
 'id': 'markable_334',
 'mentiontype': 'pro',
 'mmax_level': 'coref',
 'span': 'word_93..word_93',
 'topic': 'nan'}

## Combining all the data and dumping into a pickle file

In [25]:
# Bundle the six per-corpus dictionaries under integer keys:
# odd keys hold the annotations, even keys hold the words.
all_data = {
    1: wikipro_all_ann,
    2: wikipro_all_words,
    3: parcor_all_ann,
    4: parcor_all_words,
    5: aclpro_all_ann,
    6: aclpro_all_words,
}

In [26]:
# Serialise the combined all_data dictionary to all_data.pkl in the working directory.
with open('all_data.pkl', 'wb') as pickle_out:
    pickle.dump(all_data, pickle_out)

In [27]:
# Read all_data.pkl back to confirm the dump is loadable; all_data now refers to the reloaded copy.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
with open("all_data.pkl", 'rb') as pickle_in:
    all_data = pickle.load(pickle_in)

In [28]:
# Key 5 holds the ACLPro annotations, keyed by document number, so this counts the ACLPro documents.
len(all_data[5])

266