<a href="https://colab.research.google.com/github/sinairusinek/JRoL/blob/main/extract_csv_for_palladio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Streamlining correspondence visualization***
This notebook was written by Dror Guldin, Avigail Friedland and Anne K in the framework of the Potsdam DH-Jewish Hackathon 2022 in order to streamline correspondence visualization from an XML-TEI edition to Palladio. To use it, you need to provide the following:

**Input**:
1. Name of the "ego" person
2. XML that has a correspDesc
3. TSV with columns:
  1. place_name
  2. coor (lat,lon)

Output:
1. TSV with the following format:
  1. source (text from "sent" persName)
  2. correspondent (text from the "sent"/"received" persName that is not the "ego" person)
  3. target (text from "received" persName)
  4. place_sender (text from "sent" placeName)
  5. senderCoor (the "lat,lon" associated with place_sender in the TSV)
  6. place_recipient (text from "received" placeName)
  7. recipientCoor (the "lat,lon" associated with place_recipient in the TSV)
  8. date (YYYY-MM-DD) ("when" from "sent" date)
  9. year (YYYY of date)

## Imports

In [1]:
from google.colab import files
import xml.etree.ElementTree as ET
import csv
import pandas as pd
from io import BytesIO

## Input

In [2]:
# Name of the "ego" person, written exactly as it appears in the edition's
# persName text: a later cell compares it to the sender's name with `==` to
# decide which party of each letter is the correspondent.
# NOTE(review): "PERSOM" looks like a typo for "PERSON"; the name is kept
# because a later cell refers to it under this spelling.
EGO_PERSOM_NAME = 'י. ח. ברנר'

In [3]:
# Upload the XML-TEI file that contains the correspDesc elements.
# The result is used as a mapping keyed by uploaded file name; only its first
# entry is parsed later, so upload a single file.
XML_WITH_CORRESPDESC = files.upload()

Saving BrennerFromLondon-1663427044037.xml to BrennerFromLondon-1663427044037.xml


In [4]:
# Upload the TSV that maps place names to coordinates. Later cells read the
# columns `place_name` and `coor` from it and use only the first uploaded file.
TSV_WITH_COORDINATES = files.upload()

Saving coordinates_mokup - Sheet1.tsv to coordinates_mokup - Sheet1.tsv


## Extract Needed information

### From XML file

In [5]:
# Parse the first uploaded file; `ET.fromstring` returns the root element.
uploaded_xml_name = list(XML_WITH_CORRESPDESC.keys())[0]
uploaded_xml_content = XML_WITH_CORRESPDESC[uploaded_xml_name]
abstract_xml_tree = ET.fromstring(uploaded_xml_content)

In [25]:
XML_ID_ATTRIBUTE = '{http://www.w3.org/XML/1998/namespace}id'
CORRESP_ACTION_FIELDS = ('person', 'place', 'date')
# Action types the following cells rely on; their columns are always created,
# even when a letter (or the whole edition) lacks one of the two actions.
EXPECTED_ACTION_TYPES = ('sent', 'received')


def _local_name(tag):
  """Return an element tag without its '{namespace}' prefix."""
  return tag.rsplit('}', 1)[-1]


def _children_named(parent, name):
  """Yield the direct children of `parent` whose local tag name is `name`."""
  for child in parent:
    if _local_name(child.tag) == name:
      yield child


def _read_corresp_action(corresp_action):
  """Collect person, place and date of one correspAction element.

  Returns a dict with the keys 'person', 'place' and 'date'. A value is None
  when the matching child is absent (or, for the date, has no `when`
  attribute). If a child occurs several times the last one wins.
  """
  details = dict.fromkeys(CORRESP_ACTION_FIELDS)
  for action_component in corresp_action:
    component_name = _local_name(action_component.tag)
    if component_name == 'persName':
      details['person'] = action_component.text
    elif component_name == 'placeName':
      details['place'] = action_component.text
    elif component_name == 'date':
      # Dates may carry only notBefore/notAfter; those are left as None
      # instead of raising a KeyError.
      when = action_component.get('when')
      if when is not None:
        details['date'] = when
  return details


def _extract_letters_metadata(corpus_root):
  """Map the xml:id of each child of `corpus_root` to its correspondence data.

  Every child carrying an xml:id gets one record with the keys
  '<type>_person', '<type>_place' and '<type>_date' for 'sent', 'received'
  and any other correspAction type found under
  teiHeader/profileDesc/correspDesc. Missing values are None, so the records
  turn into plain (hashable) DataFrame cells.
  """
  letters = {}
  for tei_element in corpus_root:
    tei_xml_id = tei_element.get(XML_ID_ATTRIBUTE)
    if tei_xml_id is None:
      continue
    letter = {
        f'{action_type}_{field}': None
        for action_type in EXPECTED_ACTION_TYPES
        for field in CORRESP_ACTION_FIELDS
    }
    for tei_header in _children_named(tei_element, 'teiHeader'):
      for profile_desc in _children_named(tei_header, 'profileDesc'):
        # Only correspDesc is inspected: other profileDesc children
        # (langUsage, textClass, ...) hold elements without a `type`.
        for corresp_desc in _children_named(profile_desc, 'correspDesc'):
          for corresp_action in _children_named(corresp_desc, 'correspAction'):
            action_type = corresp_action.get('type')
            if action_type is None:
              continue
            for field, value in _read_corresp_action(corresp_action).items():
              letter[f'{action_type}_{field}'] = value
    letters[tei_xml_id] = letter
  return letters


letters_metadata_dict = _extract_letters_metadata(abstract_xml_tree)

In [26]:
# The dict is keyed by letter id, so the raw frame has one column per letter;
# transpose it to get one row per letter and expose the id as column `index`.
letters_by_field_df = pd.DataFrame(letters_metadata_dict)
relations_df = letters_by_field_df.transpose().reset_index()

In [27]:
def _correspondent_of(letter_row):
  """Return the recipient when the ego person sent the letter, else the sender."""
  if letter_row.sent_person == EGO_PERSOM_NAME:
    return letter_row.received_person
  return letter_row.sent_person


relations_df['correspondent'] = relations_df.apply(_correspondent_of, axis=1)

In [28]:
# Column names of the output TSV, keyed by the names produced by the extraction.
palladio_column_names = {
    'index': 'letter_xml_index',
    'sent_person': 'source',
    'sent_place': 'place_sender',
    'received_person': 'target',
    'received_place': 'place_recipient',
}
relations_df = relations_df.rename(columns=palladio_column_names)

In [29]:
# Read the first uploaded TSV from memory and index it by place name so that
# coordinates can be looked up per place.
coordinates_file_name = list(TSV_WITH_COORDINATES.keys())[0]
coordinates_file_content = BytesIO(TSV_WITH_COORDINATES[coordinates_file_name])
places_and_coordinates_df = pd.read_table(coordinates_file_content).set_index('place_name')

In [30]:
# Build a plain place -> "lat,lon" lookup from the `coor` column. When a place
# is listed more than once the first row wins, so a duplicated place name can
# no longer multiply the letters the way an index join does.
coordinates_by_place = {}
for place_name, coordinates in places_and_coordinates_df['coor'].items():
  coordinates_by_place.setdefault(place_name, coordinates)


def _coordinates_for(place):
  """Return the coordinates listed for `place`, or None when there are none.

  Non-string values (a letter without a place) are never looked up.
  """
  if isinstance(place, str):
    return coordinates_by_place.get(place)
  return None


# Both ends of the letter get coordinates; the documented output needs
# `recipientCoor` as well as `senderCoor`. `assign` overwrites the two columns
# on a re-run instead of adding duplicates, and keeps the row and column order
# (the new columns are appended at the end).
relations_df = relations_df.assign(
    senderCoor=relations_df['place_sender'].map(_coordinates_for),
    recipientCoor=relations_df['place_recipient'].map(_coordinates_for),
)

In [33]:
# The documented output has a `date` column (the `when` of the "sent" date)
# and a `year` column (its first four characters, YYYY). They are derived on a
# copy so that `relations_df` is left untouched and the cell can be re-run.
palladio_df = relations_df.rename(columns={'sent_date': 'date'})
if 'date' in palladio_df.columns:
  palladio_df['year'] = palladio_df['date'].map(
      # Letters without a usable date get an empty year.
      lambda sent_date: sent_date[:4] if isinstance(sent_date, str) else None
  )

palladio_df.to_csv('data_for_palladio.tsv', index=False, sep='\t')

## Output

In [34]:
# Hand the TSV written by the previous cell to the browser as a download
# (Colab only: `files` comes from `google.colab`).
files.download('data_for_palladio.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>