# Getting started with PyEpiDoc

## Loading an EpiDoc file and accessing text

In [1]:
# Load the API and helper functions
from pyepidoc import (
    EpiDoc, 
    EpiDocCorpus, 
    print_items,
    lang,
    doc_id,
    owner_doc
)

# from pyepidoc.epidoc.dom import owner_doc

In [2]:
# Load an EpiDoc file without validation
doc = EpiDoc('../examples/ISic000001_tokenized.xml')

In [None]:
# Load an EpiDoc file and validate
doc2 = EpiDoc('../examples/ISic000001_tokenized.xml', validate_on_load=True)

# NB This gives a message "RNG internal error trying to compile notAllowed", even if the file is valid, for reasons currently unknown to me

In [None]:
# Print validation result
print(doc2.validation_result)

In [None]:
# Print normalized tokens as a string
print(doc.text_normalized)

In [None]:
# Print Leiden tokens as a string
print(doc.text_leiden)

In [None]:
# Print the translation text
print(doc.translation_text)

## See the XML

In [None]:
print(doc.xml_str)

## Load a corpus

In [9]:
# Load the corpus
corpus = EpiDocCorpus('../example_corpus')

In [None]:
# Count the documents
print(corpus.doc_count)

In [None]:
# Get the range of dates
print(corpus.datemin, corpus.datemax)

# Excursus: using `print_items`

`print_items` is a helper function within PyEpiDoc for displaying lists of things. By default, it 
prints each item on a new line preceded by "- ", e.g.:

In [None]:
print_items([1, 2, 3])

However, it is possible to change this behaviour. For example, you can set your own prefix, e.g. an asterisk, with the `prefix` parameter:

In [None]:
print_items([1, 2, 3], prefix='* ')

You can also have no prefix at all:

In [None]:
print_items([1, 2, 3], prefix='')

You can change how each item is separated by using the `sep` parameter. Instead of a new line, you can separate each item with a comma, for example:

In [None]:
print_items([1, 2, 3], sep=', ', prefix='')

## Filter the corpus

## By text class

In [None]:
# Find all the funerary inscriptions

funerary = corpus.filter_by_textclass(['#function.funerary'])
print(funerary.doc_count)

## By text class and orig place

In [17]:
# Chaining filters: first by text class (funerary), then by orig place (Catina)
catina_funerary = (
    corpus
    .filter_by_textclass(['#function.funerary'])
    .filter_by_orig_place(['Catina'])
)

In [None]:
print_items(catina_funerary.ids)

In [None]:
# The original corpus is unchanged
print(corpus.doc_count)

## By text class and date range

In [20]:
# Filter by date range first, then by text class (funerary)
imperial_funerary = (
    corpus
    .filter_by_daterange(start=-1, end=200)
    .filter_by_textclass(['#function.funerary'])
)

In [None]:
print_items(imperial_funerary.ids)

In [None]:
# Show the text of particular inscription
print(imperial_funerary.get_doc_by_id('ISic000006').text_leiden)

## By `<g> @ref`

In [None]:
interpunct_corpus = corpus.filter_by_g_ref(['#interpunct'])
ids = interpunct_corpus.ids
print_items(ids)                                    

In [None]:
# Keep only the items of `interpunct_corpus.gs` whose `ref` is '#interpunct',
# then print how many there are
interpuncts = [
    g
    for g in interpunct_corpus.gs
    if g.ref == '#interpunct'
]
print(len(interpuncts))

## By `<num> @value`

In [None]:
num_value_corpus = corpus.filter_by_num_value(min=5, max=10)
ids = num_value_corpus.ids
print_items(ids)

In [None]:
# Build "<leiden form>=<value>" strings for every numeral whose value,
# read as an integer, lies between 5 and 10 inclusive
nums = [
    '='.join((num.leiden_form, num.value))
    for num in num_value_corpus.nums
    if 5 <= int(num.value) <= 10
]
print_items(nums)

## By `<name>`

In [None]:
cognomen_corpus = corpus.filter_by_name_type(['cognomen'])
ids = cognomen_corpus.ids
print_items(ids)

In [None]:
# Keep only the names whose `name_type` is 'cognomen' and print them
cognomina = [
    name_elem
    for name_elem in cognomen_corpus.names()
    if name_elem.name_type == 'cognomen'
]
print_items(cognomina)

## By `<roleName>`

### By `@subtype`

In [None]:
# Print the document ids of the files that contain at least one
# role name with subtype 'imperator'
imperator_corpus = corpus.filter_by_role_name_subtype(['imperator'])
print_items(imperator_corpus.ids)

In [None]:
# Print all the role name text values in all the files that
# contain at least one role name of subtype 'imperator'
role_names = imperator_corpus.role_names
print_items(role_names)

In [None]:
# Print only the role names with subtype value 'imperator'.
# The result is stored as a list rather than a one-shot `filter` iterator,
# so `imperator_role_names` is still populated after it has been printed.
imperator_role_names = [
    role_name
    for role_name in imperator_corpus.role_names
    if role_name.role_name_subtype == 'imperator'
]
print_items(imperator_role_names)

### By `@type`

In [None]:
# Role names can also be filtered by their type: keep the documents with
# at least one matching role name, then keep only the 'civic' role names
civic_corpus = corpus.filter_by_role_name_type(['civic'])
civic_role_names = [
    role_name
    for role_name in civic_corpus.role_names
    if role_name.role_name_type == 'civic'
]
print_items(civic_role_names)


In [None]:
print_items(civic_corpus.ids)

In [None]:
# Print the subtype of each of the civic role names
civic_role_subtypes = [
    role_name.role_name_subtype for role_name in civic_role_names
]
print_items(civic_role_subtypes)


## Abbreviations

In [None]:
# Get all the abbreviations in the corpus

expans = corpus.expans
print(len(expans))

In [None]:
# Get all the suspensions: `is_suspension` is used directly as the condition
# instead of being compared with `== True`
print_items(expan for expan in expans if expan.is_suspension)

In [None]:
# Get all the Greek suspensions: abbreviations flagged as suspensions
# whose language, as reported by `lang`, is 'grc'
greek_suspensions = [
    expan
    for expan in expans
    if expan.is_suspension and lang(expan) == 'grc'
]
print_items(greek_suspensions)



In [None]:
# Get the document IDs for the Greek suspensions.
# Stored as a list rather than a one-shot `map` iterator, so `ids` can still
# be inspected or reused after it has been printed.
ids = [doc_id(expan) for expan in greek_suspensions]
print_items(ids)

In [None]:
# Get all the Latin contractions with suspension: abbreviations flagged as
# such whose language, as reported by `lang`, is 'la'
latin_contractions_with_suspensions = [
    expan
    for expan in expans
    if expan.is_contraction_with_suspension and lang(expan) == 'la'
]
print_items(latin_contractions_with_suspensions)

# Print the result of `doc_id` for each of them
print_items(map(doc_id, latin_contractions_with_suspensions))

In [None]:
# Get the documents: apply `owner_doc` to each abbreviation found,
# then print the `date_mean` of each resulting document
docs = [owner_doc(expan) for expan in latin_contractions_with_suspensions]
print_items(owner.date_mean for owner in docs)

## Tokenizing EpiDoc

In [None]:
doc = EpiDoc('../examples/ISic000032_untokenized.xml')
print(doc.tokens)

In [None]:
print(doc.text_xml)

In [None]:
print(doc.text_leiden)

In [None]:
doc.tokenize()
print(doc.tokens)

In [None]:
print(doc.text_xml)

In [None]:
print(doc.text_leiden)

In [None]:
print(doc.text_normalized)

## Setting IDs

In [None]:
print(doc.editions()[0].xml_str)

In [None]:
print_items(doc.ids)

In [None]:
doc.set_ids()
print_items(doc.ids)

In [None]:
print(doc.editions()[0].xml_str)

In [None]:
print(doc.validate()[1])