# Getting started with PyEpiDoc

## Loading an EpiDoc file and accessing text

In [1]:
# Load the API and helper functions
from pyepidoc import (
    EpiDoc, 
    EpiDocCorpus, 
    print_items,
    lang,
    doc_id,
    owner_doc
)

# from pyepidoc.epidoc.dom import owner_doc

In [2]:
# Load an EpiDoc file without validation (`validate_on_load` is not passed here;
# compare the next cell, which passes `validate_on_load=True`).
# The path is relative (to the directory the notebook is run from).
doc = EpiDoc('examples/ISic000001_tokenized.xml')

In [3]:
# Load the same EpiDoc file again, this time validating it on load
doc2 = EpiDoc('examples/ISic000001_tokenized.xml', validate_on_load=True)

# NB: validation emits the message "RNG internal error trying to compile notAllowed"
# many times (see the output below), even when the file is reported as valid.
# The cause is currently unknown.

RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile not

examples/ISic000001_tokenized.xml is a valid EpiDoc file


RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile notAllowed
RNG internal error trying to compile not

In [4]:
# Print the validation result of the document that was validated on load
print(doc2.validation_result)

examples/ISic000001_tokenized.xml is valid


In [5]:
# Print the normalized tokens as a single string
# (in the output below, abbreviations appear expanded: "manibus", "vixit", "annis")
print(doc.text_normalized)

Dis manibus Zethi vixit annis VI


In [6]:
# Print the tokens in Leiden form as a string
# (in the output below, expansions are bracketed, e.g. "man(ibus)", and the text spans several lines)
print(doc.text_leiden)


Dis · man(ibus)
 · Zethi
vix(it) · a(nnis) · VI


In [7]:
# Print the translation text (the recorded output for this file is blank)
print(doc.translation_text)

 


## See the XML

In [8]:
# Print the document's XML as a string (the recorded output below is cut off)
print(doc.xml_str)

<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="http://www.stoa.org/epidoc/schema/latest/tei-epidoc.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<?xml-model href="../schematron/ircyr-checking.sch" schematypens="http://purl.oclc.org/dsdl/schematron"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xml:lang="en">
    <teiHeader>
        <fileDesc>
            <titleStmt>
                <title>Funerary inscription of Zethus</title>
                <editor ref="#JP">Jonathan Prag</editor>
                <principal ref="#JP">Jonathan Prag</principal>
                <funder>John Fell OUP Research Fund</funder>
	   <funder><ref target="https://cordis.europa.eu/project/id/885040">ERC Advanced Grant no.885040</ref></funder>
                <respStmt>
                    <name xml:id="JP" ref="http://orcid.org/0000-0003-3819-8537">Jonathan Prag</name>
                    <resp>original data collection and editin

## Load a corpus

In [9]:
# Load the corpus from the relative path 'example_corpus'
# (presumably a folder of EpiDoc XML files — confirm against the EpiDocCorpus docs)
corpus = EpiDocCorpus('example_corpus')

In [10]:
# Count the documents in the corpus
print(corpus.doc_count)

71


In [11]:
# Get the range of dates covered by the corpus (`datemin` and `datemax`);
# negative values are presumably years BCE — confirm
print(corpus.datemin, corpus.datemax)

-550 401


## Excursus: using `print_items`

`print_items` is a function within PyEpiDoc for displaying lists of things. By default, it
prints each item on a new line preceded by "- ", e.g.:

In [12]:
# Default formatting: each item on its own line, prefixed with "- "
print_items([1, 2, 3])

- 1
- 2
- 3


However, it is possible to change this behaviour. For example, you can set your own prefix, e.g. an asterisk, with the `prefix` parameter:

In [13]:
# Custom prefix: each item is preceded by "* " instead of "- "
print_items([1, 2, 3], prefix='* ')

* 1
* 2
* 3


You can also have no prefix at all:

In [14]:
# Empty prefix: items are printed one per line with nothing in front
print_items([1, 2, 3], prefix='')

1
2
3


You can change how each item is separated by using the `sep` parameter. Instead of a new line, you can separate each item with a comma, for example:

In [15]:
# Custom separator and no prefix: items are joined by ", " on a single line
print_items([1, 2, 3], sep=', ', prefix='')

1, 2, 3


## Filter the corpus

## By text class

In [16]:
# Find all the funerary inscriptions (text class '#function.funerary';
# the filter takes a list of text classes) and count them

funerary = corpus.filter_by_textclass(['#function.funerary'])
print(funerary.doc_count)

40


## By text class and orig place

In [17]:
# Chaining filters: the result of one filter can be filtered again,
# here first by text class and then by place of origin
catina_funerary = (
    corpus
    .filter_by_textclass(['#function.funerary'])
    .filter_by_orig_place(['Catina'])
)

In [18]:
# List the IDs of the documents that passed both filters
print_items(catina_funerary.ids)

- ISic000002
- ISic000003


In [19]:
# The original corpus is unchanged by the filtering above
# (the count printed here matches the count printed before filtering)
print(corpus.doc_count)

71


## By text class and date range

In [20]:
# Filter by date range first, then by text class
imperial_funerary = (
    corpus
    .filter_by_daterange(start=-1, end=200)
    .filter_by_textclass(['#function.funerary'])
)

In [21]:
# List the IDs of the funerary inscriptions within the date range
print_items(imperial_funerary.ids)

- ISic000006
- ISic000813
- ISic000819
- ISic000853


In [22]:
# Show the Leiden text of a particular inscription, looked up by its document ID
print(imperial_funerary.get_doc_by_id('ISic000006').text_leiden)


C(aius) · Iulius · Felix
vixit · ann(os)
Appuleia · Rogata
vixit · ann(os)


## By `<g> @ref`

In [23]:
# Keep the documents matched by the `<g>` ref '#interpunct' and list their IDs
interpunct_corpus = corpus.filter_by_g_ref(['#interpunct'])
ids = interpunct_corpus.ids
print_items(ids)

- ISic000001
- ISic000002
- ISic000003
- ISic000004
- ISic000005
- ISic000006
- ISic000007
- ISic000009
- ISic000010
- ISic000017
- ISic000801
- ISic000803
- ISic000804
- ISic000805
- ISic000813
- ISic000814
- ISic000842
- ISic000844
- ISic000846
- ISic000847
- ISic000848
- ISic000850
- ISic000853
- ISic000854
- ISic000855
- ISic000856
- ISic000857
- ISic000858
- ISic000859


In [29]:
# From the filtered corpus's `gs`, keep only the items whose ref is
# '#interpunct', then count them
interpuncts = [
    g
    for g in interpunct_corpus.gs
    if g.ref == '#interpunct'
]
print(len(interpuncts))

194


## By `<name>`

In [None]:
# Keep the documents matched by name type 'cognomen' and list their IDs
cognomen_corpus = corpus.filter_by_name_type(['cognomen'])
ids = cognomen_corpus.ids
print_items(ids)

In [None]:
# From the filtered corpus's names, keep only those whose type is 'cognomen'
cognomina = [
    nm
    for nm in cognomen_corpus.names
    if nm.name_type == 'cognomen'
]
print_items(cognomina)

## By `<roleName>`

### By `@subtype`

In [None]:
# Print the document ids of the files that contain at least one
# role name with subtype 'imperator'
imperator_corpus = corpus.filter_by_role_name_subtype(['imperator'])
print_items(imperator_corpus.ids)

In [None]:
# Print all the role name text values in all the files that
# contain at least one role name of subtype 'imperator'
# (no per-role-name check is applied here; the next cell narrows
# the list to subtype 'imperator' only)
role_names = imperator_corpus.role_names
print_items(role_names)

In [None]:
# Print only the role names with subtype value 'imperator'.
# Built as a list rather than a lazy `filter` object so that
# `imperator_role_names` can still be used after it has been printed
# (an iterator would be exhausted by the first pass over it).
imperator_role_names = [
    role_name
    for role_name in imperator_corpus.role_names
    if role_name.role_subtype == 'imperator'
]
print_items(imperator_role_names)

### By role type

In [None]:
# You can also filter by role name type: narrow the corpus by role type
# 'civic', then keep just the role names of that type
civic_corpus = corpus.filter_by_role_type(['civic'])
civic_role_names = [
    rn
    for rn in civic_corpus.role_names
    if rn.role_type == 'civic'
]
print_items(civic_role_names)


In [None]:
# List the IDs of the documents in the 'civic' role-type corpus
print_items(civic_corpus.ids)

In [None]:
# Print the subtypes of the role_names you have found
# (one subtype per role name, in the same order as `civic_role_names`)
civic_role_subtypes = [role_name.role_subtype for role_name in civic_role_names]
print_items(civic_role_subtypes)


## Abbreviations

In [None]:
# Get all the abbreviations in the corpus (exposed as `corpus.expans`)
# and count them

expans = corpus.expans
print(len(expans))

In [None]:
# Get all the suspensions
# NOTE(review): the explicit `== True` is kept from the original; if
# `is_suspension` is always a bool it is redundant — confirm
print_items(expan for expan in expans if expan.is_suspension == True)

In [None]:
# Get all the Greek suspensions: abbreviations flagged as suspensions
# for which `lang` gives 'grc'
greek_suspensions = [
    expan
    for expan in expans
    if expan.is_suspension == True and lang(expan) == 'grc'
]
print_items(greek_suspensions)



In [None]:
# Get the document IDs for the Greek suspensions.
# `map` is lazy: `ids` is a one-shot iterator, not a list, and is consumed
# when `print_items` goes through it (assuming `print_items` iterates it once)
ids = map(doc_id, greek_suspensions)
print_items(ids)

In [None]:
# Get all the Latin contractions with suspension
latin_contractions_with_suspensions = [
    expan
    for expan in expans
    if expan.is_contraction_with_suspension == True and lang(expan) == 'la'
]
print_items(latin_contractions_with_suspensions)

# ...followed by the document ID for each of them
print_items(doc_id(expan) for expan in latin_contractions_with_suspensions)

In [None]:
# Get the owner document of each Latin contraction with suspension,
# then print the `date_mean` of every such document

docs = [owner_doc(expan) for expan in latin_contractions_with_suspensions]
print_items(d.date_mean for d in docs)

## Tokenizing EpiDoc

In [None]:
# Load an untokenized example file and show its tokens before tokenizing.
# NB: this rebinds `doc`, which until now referred to ISic000001_tokenized.xml
doc = EpiDoc('examples/ISic000032_untokenized.xml')
print(doc.tokens)

In [None]:
# Show `text_xml` before tokenizing (the same call is repeated after
# `doc.tokenize()` for comparison)
print(doc.text_xml)

In [None]:
# Show the Leiden text before tokenizing (the same call is repeated after
# `doc.tokenize()` for comparison)
print(doc.text_leiden)

In [None]:
# Tokenize the document, then show the tokens again.
# The return value of `tokenize()` is discarded, so it presumably updates
# `doc` in place — confirm
doc.tokenize()
print(doc.tokens)

In [None]:
# Show `text_xml` again, now after tokenizing
print(doc.text_xml)

In [None]:
# Show the Leiden text again, now after tokenizing
print(doc.text_leiden)

In [None]:
# Show the normalized text, now after tokenizing
print(doc.text_normalized)

## Setting IDs

In [None]:
# Show the XML of the first edition (index 0 of `doc.editions()`)
# before any IDs are set
print(doc.editions()[0].xml_str)

In [None]:
# List the document's `ids` before calling `set_ids()`
print_items(doc.ids)

In [None]:
# Set IDs on the document, then list `ids` again for comparison
# with the previous cell
doc.set_ids()
print_items(doc.ids)

In [None]:
# Show the first edition's XML again, now after `set_ids()`
print(doc.editions()[0].xml_str)

In [None]:
# Validate the document and print the item at index 1 of the result
# (presumably a (is_valid, message) pair — TODO confirm against `validate`)
print(doc.validate()[1])