# Getting started with PyEpiDoc

## Loading an EpiDoc file and accessing text

In [1]:
# Load the API classes and the helper functions used throughout this notebook
from pyepidoc import (
    EpiDoc, 
    EpiDocCorpus, 
    print_items,
    lang,
    doc_id,
    owner_doc
)

In [2]:
# Load an EpiDoc file without validation
doc = EpiDoc('examples/ISic000001_tokenized.xml')

In [3]:
# Load an EpiDoc file and validate it on load
doc2 = EpiDoc('examples/ISic000001_tokenized.xml', validate_on_load=True)

# NB: validation currently emits repeated "RNG internal error trying to compile notAllowed"
# messages even when the file is valid; the cause is not yet known.

examples/ISic000001_tokenized.xml is a valid EpiDoc file


RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile not

In [4]:
# Print validation result
print(doc2.validation_result)

examples/ISic000001_tokenized.xml is valid


In [5]:
# Print normalized tokens as a string
print(doc.text_normalized)

Dis manibus Zethi vixit annis VI


In [6]:
# Print Leiden tokens as a string
print(doc.text_leiden)


Dis · man(ibus)
 · Zethi
vix(it) · a(nnis) · VI


In [7]:
# Print the translation text
# (the output here is blank, so this example file appears to contain no translation text)
print(doc.translation_text)

 


## See the XML

In [8]:
# Print the full XML source of the document as a string
print(doc.xml_str)

<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.stoa.org/epidoc/schema/latest/tei-epidoc.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="../schematron/ircyr-checking.sch" schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xml:lang="en">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Funerary inscription of Zethus</title>
                <editor ref="#JP">Jonathan Prag</editor>
                <principal ref="#JP">Jonathan Prag</principal>
                <funder>John Fell OUP Research Fund</funder>
	   <funder><ref target="https://cordis.europa.eu/project/id/885040">ERC Advanced Grant no.885040</ref></funder>
                <respStmt>
                    <name xml:id="JP" ref="http://orcid.org/0000-0003-3819-8537">Jonathan Prag</name>
                    <resp>original data collection and editin

## Load a corpus

In [9]:
# Load the corpus
corpus = EpiDocCorpus('example_corpus')

In [10]:
# Count the documents
print(corpus.doc_count)

71


In [11]:
# Get the range of dates
print(corpus.datemin, corpus.datemax)

-550 401


# Excursus: using `print_items`

`print_items` is a function within PyEpiDoc for displaying lists of items. By default, it 
prints each item on a new line preceded by "- ", e.g.:

In [12]:
print_items([1, 2, 3])

- 1
- 2
- 3


However, you can change this behaviour. For example, you can set your own prefix, such as an asterisk, with the `prefix` parameter:

In [13]:
print_items([1, 2, 3], prefix='* ')

* 1
* 2
* 3


You can also have no prefix at all:

In [14]:
print_items([1, 2, 3], prefix='')

1
2
3


You can change how each item is separated by using the `sep` parameter. Instead of a new line, you can separate each item with a comma, for example:

In [15]:
print_items([1, 2, 3], sep=', ', prefix='')

1, 2, 3


## Filter the corpus

## By text class

In [16]:
# Find all the funerary inscriptions

funerary = corpus.filter_by_textclass(['#function.funerary'])
print(funerary.doc_count)

40


## By text class and orig place

In [17]:
# Each filter returns a corpus, so filters can be chained:
# here, funerary inscriptions whose original place is 'Catina'
catina_funerary = (
    corpus
    .filter_by_textclass(['#function.funerary'])
    .filter_by_orig_place(['Catina'])
)

In [18]:
print_items(catina_funerary.ids)

- ISic000002
- ISic000003


In [19]:
# The original corpus is unchanged
print(corpus.doc_count)

71


## By text class and date range

In [20]:
# Restrict the corpus to the date range -1 to 200,
# then keep only the funerary inscriptions
imperial_funerary = (
    corpus
    .filter_by_daterange(start=-1, end=200)
    .filter_by_textclass(['#function.funerary'])
)

In [21]:
print_items(imperial_funerary.ids)

- ISic000006
- ISic000813
- ISic000819
- ISic000853


In [22]:
# Show the Leiden text of a particular inscription, looked up by its document ID
print(imperial_funerary.get_doc_by_id('ISic000006').text_leiden)


C(aius) · Iulius · Felix
vixit · ann(os)
Appuleia · Rogata
vixit · ann(os)


## By `<g> @ref`

In [23]:
# Filter the corpus by <g> @ref value '#interpunct' and list the IDs of the matching documents
interpunct_corpus = corpus.filter_by_g_ref(['#interpunct'])
ids = interpunct_corpus.ids
print_items(ids)

- ISic000001
- ISic000002
- ISic000003
- ISic000004
- ISic000005
- ISic000006
- ISic000007
- ISic000009
- ISic000010
- ISic000017
- ISic000801
- ISic000803
- ISic000804
- ISic000805
- ISic000813
- ISic000814
- ISic000842
- ISic000844
- ISic000846
- ISic000847
- ISic000848
- ISic000850
- ISic000853
- ISic000854
- ISic000855
- ISic000856
- ISic000857
- ISic000858
- ISic000859


In [24]:
# Collect every <g> element in the filtered corpus whose @ref is '#interpunct', then count them
interpuncts = [
    g for g in interpunct_corpus.gs
    if g.ref == '#interpunct'
]
print(len(interpuncts))

194


## By `<num> @value`

In [25]:
# Filter the corpus by <num> @value, using min=5 and max=10, and list the IDs of the matching documents
num_value_corpus = corpus.filter_by_num_value(min=5, max=10)
ids = num_value_corpus.ids
print_items(ids)

- ISic000001


In [26]:
# Show each numeral whose value is greater than 4 and less than 11
# in the form "<Leiden form>=<value>"
nums = [
    '='.join((numeral.leiden_form, numeral.value))
    for numeral in num_value_corpus.nums
    if 4 < int(numeral.value) < 11
]
print_items(nums)

- VI=6


## By `<name>`

In [27]:
# Filter the corpus by name type 'cognomen' and list the IDs of the matching documents
cognomen_corpus = corpus.filter_by_name_type(['cognomen'])
ids = cognomen_corpus.ids
print_items(ids)

- ISic000002
- ISic000004
- ISic000005
- ISic000006
- ISic000007
- ISic000008


In [28]:
# From the filtered corpus, keep only the names whose type is 'cognomen'
cognomina = [
    name_elem for name_elem in cognomen_corpus.names
    if name_elem.name_type == 'cognomen'
]
print_items(cognomina)

- Zosimus
- Melanthi··n
- Gramme
- Long
- Pistianus
- Eupraxia
- Eutychiano
- Felix
- Rogata
- Pio
- Rufus
- Pothine
- Euphrosyne


## By `<roleName>`

### By `@subtype`

In [29]:
# Print the document ids of the files that contain at least one
# role name with subtype 'imperator'
imperator_corpus = corpus.filter_by_role_name_subtype(['imperator'])
print_items(imperator_corpus.ids)

- ISic000007
- ISic000017


In [30]:
# Print all the role name text values in all the files that
# contain at least one role name of subtype 'imperator'
role_names = imperator_corpus.role_names
print_items(role_names)

- imperatore
- augure
- consule · designato
- legatus · pro · praetore
- praetor · designatus
- Imperatoris
- pontificis · maximi
- tribunicia · potestate
- imperatori
- tribunicia · potestate
- imperatori
- consulis
- patri · patriae
- imperatoris
- tribunicia · potestate
- domino
- II · virorum
- decurionum


In [31]:
# Print only the role names with subtype value 'imperator'.
# The result is stored as a list rather than a lazy `filter` object, so the
# variable can still be used after printing instead of being exhausted by
# the first iteration.
imperator_role_names = [
    role_name for role_name in imperator_corpus.role_names
    if role_name.role_name_subtype == 'imperator'
]
print_items(imperator_role_names)

- imperatore
- Imperatoris
- imperatori
- imperatori
- imperatoris


### By `@type`

In [32]:
# Role names can also be filtered by their type, here 'civic':
# first narrow the corpus, then keep only the role names of that type
civic_corpus = corpus.filter_by_role_name_type(['civic'])
civic_role_names = [
    candidate for candidate in civic_corpus.role_names
    if candidate.role_name_type == 'civic'
]
print_items(civic_role_names)


- IIduumviratus
- II · virorum
- decurionum


In [33]:
print_items(civic_corpus.ids)

- ISic000004
- ISic000017


In [34]:
# Print the subtype of each of the civic role names found above
civic_role_subtypes = [role_name.role_name_subtype for role_name in civic_role_names]
print_items(civic_role_subtypes)


- duumvir
- duumvir
- decurio


## Abbreviations

In [35]:
# Get all the abbreviations in the corpus

expans = corpus.expans
print(len(expans))

198


In [36]:
# Get all the suspensions
# (`is_suspension` is tested for truthiness directly rather than compared with `== True`)
print_items(filter(lambda expan: expan.is_suspension, expans))

- man(ibus)
- vix(it)
- a(nnis)
- D(is)
- M(anibus)
- s(acrum)
- vix(it)
- Man(ibus)
- vix(it)
- an(nis)
- Herm(es)
- fe(cerunt)
- P(ubli)
- f(ilius)
- P(ublius)
- P(ubli)
- f(ilius)
- P(ubli)
- n(epos)
- f(ilius)
- D(is)
- M(anibus)
- S(acrum)
- M(arcus)
- vix(it)
- ann(is)
- mens(is)
- dieb(us)
- C(aius)
- ann(os)
- ann(os)
- Mag(no)
- Mag(ni)
- f(ilio)
- imp(eratore)
- desig(nato)
- L(ucius)
- L(uci)
- f(ilius)
- leg(atus)
- pr(aetore)
- pr(aetor)
- des(ignatus)
- f(aciendum)
- c(uravit)
- D(is)
- M(anibus)
- fil(ia)
- pient(issimae)
- L(ucius)
- q(uaestor)
- pr(o)
- pr(aetore)
- prov(inciae)
- Sicil(iae)
- leg[(atus)]
- pr(o)
- pr(aetore)
- prov(inciae)
- eiusd(em)
- pr(aetor)
- des[(ignatus)]
- M(arcus)
- L(ucio)
- nobiliss(imo)
- Caes(ari)
- Imp(eratoris)
- Caes(aris)
- L(uci)
- Aug(usto)
- Arab(ici)
- Adi|aben(ici)
- Parth(ici)
- mạx̣(imi)
- pont(ificis)
- max[(imi)]
- tr(ibunicia)
- pot(estate)
- imp(eratori)
- tr(ibunicia)
- pot(estate)
- imp(eratori)
- p(atri)
- p(atriae)
- 

In [37]:
# Get all the Greek suspensions: abbreviations that are suspensions
# and whose language is 'grc'
greek_suspensions = [
    expan for expan in expans
    if expan.is_suspension and lang(expan) == 'grc'
]
print_items(greek_suspensions)



- ἕνεκ(εν)
- λαμπρ(οτάτου)
- ὑπατ(ικὸν)
- Θ(εοῖς)
- Κ(αταχθονίοις)
- μῆν(ας)
- χλῖ(ρε)
- μ(ῆνας)
- Θ(εοῖς)
- Κ(αταχθονίοις)
- Θ(εοῖς)
- Κ(αταχθονίοις)
- ἔζησ(εν)
- Θ(εοῖς)
- Κ(αταχθονίοις)
- Κ(λαυδία)
- ἔζ(ησεν)
- μῆ(νας)
- ἡμέ(ρας)
- Ν(ουμέριε)
- μῆν(ας)
- ἡμέρ(ας)
- Κ(υίντος)


In [38]:
# Get the ID of the document containing each Greek suspension
# (one entry per suspension, so a document ID can appear more than once).
# Stored as a list so that `ids` is not left as an exhausted iterator after printing.
ids = [doc_id(suspension) for suspension in greek_suspensions]
print_items(ids)

- ISic000800
- ISic000811
- ISic000812
- ISic000840
- ISic000840
- ISic000842
- ISic000843
- ISic000844
- ISic000845
- ISic000845
- ISic000852
- ISic000852
- ISic000852
- ISic000853
- ISic000853
- ISic000853
- ISic000853
- ISic000853
- ISic000853
- ISic000854
- ISic000857
- ISic000857
- ISic000858


In [39]:
# Get all the Latin contractions with suspension: abbreviations flagged as
# contraction-with-suspension whose language is 'la'
latin_contractions_with_suspensions = [
    expan for expan in expans
    if expan.is_contraction_with_suspension and lang(expan) == 'la'
]
print_items(latin_contractions_with_suspensions)

# ...followed by the ID of the document each one comes from
print_items(map(doc_id, latin_contractions_with_suspensions))

- co(n)s(ule)
- co(n)s(ulis)
- proco(n)s(uli)
- ISic000007
- ISic000017
- ISic000805


In [40]:
# Get the document that owns each of those abbreviations,
# then print the `date_mean` of each document
docs = [owner_doc(expan) for expan in latin_contractions_with_suspensions]
print_items(map(lambda owner: owner.date_mean, docs))

- -37
- None
- 250


## Tokenizing EpiDoc

In [41]:
# Load an untokenized EpiDoc file.
# NOTE: this rebinds `doc`, replacing the document loaded at the start of the notebook.
doc = EpiDoc('examples/ISic000032_untokenized.xml')
# No tokens are found before tokenizing, so this prints an empty list
print(doc.tokens)

[]


In [42]:
print(doc.text_xml)

 Marcus · Cornelius · Res Sextus · Clodi 


In [43]:
print(doc.text_leiden)




In [44]:
# Tokenize the document; this modifies `doc` itself rather than returning a new document,
# so `doc.tokens` is populated afterwards
doc.tokenize()
print(doc.tokens)

Tokenizing ISic000032...
Prettifying ISic000032...
[Element('w': 'Marcus'), Element('w': 'Cornelius'), Element('w': 'Res'), Element('w': 'Sextus'), Element('w': 'Clodi')]


In [45]:
print(doc.text_xml)

 Marcus · Cornelius · Res Sextus · Clodi 


In [46]:
print(doc.text_leiden)


M(arcus) · Cornelius · Ṛẹṣ[-?-]
Sex(tus) · Clodị[-?-]


In [47]:
print(doc.text_normalized)

Marcus Cornelius Res Sextus Clodi


## Setting IDs

In [48]:
# Print the XML of the first edition <div>; at this point its elements carry no xml:id attributes
print(doc.editions()[0].xml_str)

<div xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" type="edition" xml:space="preserve" xml:lang="la">
                <ab>
                    <lb n="1"/><w><expan><abbr>M</abbr><ex>arcus</ex></expan></w> <g ref="#interpunct">·</g> <w>Cornelius</w> <g ref="#interpunct">·</g> <w><unclear>Res</unclear></w> <gap reason="lost" extent="unknown" unit="character"/>
                    <lb n="2"/><w><expan><abbr>Sex</abbr><ex>tus</ex></expan></w> <g ref="#interpunct">·</g> <w>Clod<unclear>i</unclear></w> <gap reason="lost" extent="unknown" unit="character"/>
                </ab>
            </div>
            



In [49]:
print_items(doc.ids)




In [50]:
# Assign xml:id values to the document's elements (modifying `doc` itself),
# then list the IDs that were assigned
doc.set_ids()
print_items(doc.ids)

- ADUAK
- ADUAU
- ADUAe
- ADUAo
- ADUAy
- ADUAΙ
- ADUAΤ
- ADUAε
- ADUAο
- ADUBA
- ADUBK
- ADUBU
- ADUBe
- ADUBo
- ADUBy
- ADUBΙ
- ADUBΤ
- ADUBε
- ADUBο
- ADUCA


In [51]:
print(doc.editions()[0].xml_str)

<div xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" type="edition" xml:space="preserve" xml:lang="la">
                <ab>
                    <lb n="1" xml:id="ADUAK"/><w xml:id="ADUAU"><expan xml:id="ADUAe"><abbr xml:id="ADUAo">M</abbr><ex xml:id="ADUAy">arcus</ex></expan></w> <g ref="#interpunct" xml:id="ADUAΙ">·</g> <w xml:id="ADUAΤ">Cornelius</w> <g ref="#interpunct" xml:id="ADUAε">·</g> <w xml:id="ADUAο"><unclear xml:id="ADUBA">Res</unclear></w> <gap reason="lost" extent="unknown" unit="character" xml:id="ADUBK"/>
                    <lb n="2" xml:id="ADUBU"/><w xml:id="ADUBe"><expan xml:id="ADUBo"><abbr xml:id="ADUBy">Sex</abbr><ex xml:id="ADUBΙ">tus</ex></expan></w> <g ref="#interpunct" xml:id="ADUBΤ">·</g> <w xml:id="ADUBε">Clod<unclear xml:id="ADUBο">i</unclear></w> <gap reason="lost" extent="unknown" unit="character" xml:id="ADUCA"/>
                </ab>
            </div>
            



In [52]:
# Re-validate the document now that it has been tokenized and given IDs.
# Item [1] of the value returned by validate() is the message printed here;
# presumably item [0] is the pass/fail result — TODO confirm.
print(doc.validate()[1])

examples/ISic000032_untokenized.xml is valid EpiDoc according to the RelaxNG schema


RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile not