# Python script for analyzing the EDH dataset
*Created by: Vojtech Kase, Petra Hermankova*


Requirements:
*   Google Colab account 
*   Access to Sciencedata.dk or, alternatively, access to the dataset as a JSON file
*   Basic knowledge of Python (how to run scripts in Python notebooks)



In [1]:
### REQUIREMENTS - will install the libraries
import numpy as np
import math
import pandas as pd
import sys

### we do a lot of requests during the scrapping. Some of them with requests package, some of them with urllib
import requests
from urllib.request import urlopen 
from urllib.parse import quote  
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET

import zipfile
import io

# to avoid errors, we sometime use time.sleep(N) before retrying a request
import time

# the input data have typically a json structure
import json
import getpass

import datetime as dt

# for simple paralel computing:
from concurrent.futures import ThreadPoolExecutor

!pip install --ignore-installed --index-url https://test.pypi.org/simple/ --no-deps sddk ### our own package under construction, always install to have up-to-date version
import sddk

Looking in indexes: https://test.pypi.org/simple/
Collecting sddk
  Using cached https://test-files.pythonhosted.org/packages/65/8b/d682c15a7335215ac119538ad8455b408cd7e8be4f6614678888dd2c88ed/sddk-0.0.7-py3-none-any.whl
Installing collected packages: sddk
Successfully installed sddk-0.0.7


## Establishing a connection to Sciencedata.dk: configure session and group URL

In [6]:
### configure session and group URL
### in the case of "SDAM_root", the group owner is Vojtech with username 648597@au.dk
### the call prompts for the sciencedata.dk username and password and reports the configured endpoint (see the cell output)
### s is used in the following cell to send requests, sddk_url is the URL prefix the file paths are appended to
s, sddk_url = sddk.configure_session_and_url("SDAM_root")

sciencedata.dk username (format '123456@au.dk'): 648560@au.dk
sciencedata.dk password: ··········
endpoint for requests has been configured to: https://sciencedata.dk/files/


## Connecting to the preprocessed and enriched JSON file / dataframe from sciencedata.dk


In [7]:
### Once the connection has been successfully established, we can load the data from sciencedata into a Pandas dataframe
### Look at the Pandas documentation to learn how to navigate a Pandas dataframe and its endless functionality
response = s.get(sddk_url + "SDAM_data/EDH/EDH_inscriptions_rich.json")
### stop with the HTTP status (e.g. 401 or 404) if the request failed, instead of an opaque JSONDecodeError raised by .json()
### NOTE(review): assumes s is a requests.Session (or compatible) - confirm against the sddk package
response.raise_for_status()
EDH_df = pd.DataFrame(response.json())
EDH_df.set_index("id", inplace=True) ### perhaps the best index is the "ID"
EDH_df.head(5) ### use ".head(5)" to inspect first 5 rows of the dataframe

JSONDecodeError: ignored

# Working offline (if the connection to Sciencedata.dk fails)
You need to have an offline version of the enriched JSON file.

In [0]:
# for uploading offline files from the local computer (loading may take a few minutes in case of large files)
# google.colab is available only inside Google Colab; the result of files.upload() is kept in "uploaded"

from google.colab import files
uploaded = files.upload()

In [0]:
# load the offline JSON file with pandas and use the "id" column as the index of the new dataframe
EDH_df = pd.read_json("EDH_inscriptions_rich.json").set_index("id")
EDH_df.head(n=5) ### preview of the first 5 rows of the dataframe

In [0]:
# Inspect how many rows and columns we have: .shape is a (rows, columns) tuple
EDH_df.shape

## Subsetting the dataset

In [0]:
# Inspect all unique values within "type_of_inscription"
# (useful for finding the exact string to subset on in the next step)
EDH_df["type_of_inscription"].unique()

In [0]:
# Subsetting example: keep only the rows whose type of inscription begins with "mile-/lea"
# (na=False makes rows with a missing type count as non-matching)
EDH_miles = EDH_df.loc[EDH_df["type_of_inscription"].str.startswith("mile-/lea", na=False)]
EDH_miles.shape[0] ### number of records in the dataset that fulfil the condition

In [0]:
EDH_miles.head(2) # shows the first 2 rows of the EDH_miles subset

In [0]:
# keep only the dated milestones
# startswith("") is true for every string, so together with na=False the mask keeps exactly the rows whose "origdate_text" holds a string
# NOTE(review): .notna() would state the intent more directly - confirm the column holds only strings or missing values before switching
EDH_miles_date = EDH_miles.loc[EDH_miles["origdate_text"].str.startswith("", na=False)]
EDH_miles_date.shape[0] ### number of rows kept



In [0]:
# milestones only: rows whose province label begins with "Sardinia" (rows with a missing label are dropped by na=False)
EDH_miles_sardinia = EDH_miles.loc[EDH_miles["province_label"].str.startswith("Sardinia", na=False)]
EDH_miles_sardinia.shape[0] ### number of rows in the subset


### Saving the subset as CSV file

In [0]:
# If you need to save the subset as a CSV file and download it to a local computer:
# to_csv writes the file in the Colab runtime, files.download then sends it to the browser
from google.colab import files
EDH_miles.to_csv('EDH_milestones.csv') 
files.download('EDH_milestones.csv')

In [0]:
# writes the Sardinian milestones to a CSV file in the Colab runtime and downloads it to the local computer
from google.colab import files
EDH_miles_sardinia.to_csv('EDH_milestones_sardinia.csv') 
files.download('EDH_milestones_sardinia.csv')

## Inscriptions from one province (example of Sardinia)

In [0]:
# Inspect all unique values within "province_label"
EDH_df["province_label"].unique()

In [0]:
# all inscriptions whose province label begins with "Sardinia" (rows with a missing label are dropped by na=False)
EDH_sardinia = EDH_df.loc[EDH_df["province_label"].str.startswith("Sardinia", na=False)]
EDH_sardinia.shape[0] ### number of rows in the subset

In [0]:
# writes all Sardinian inscriptions to a CSV file in the Colab runtime and downloads it to the local computer
from google.colab import files
EDH_sardinia.to_csv('EDH_all_sardinia.csv') 
files.download('EDH_all_sardinia.csv')

### Example of Thrace

In [0]:
### smaller dataset: all inscriptions whose province label begins with "Thracia" (rows with a missing label are dropped by na=False)
EDH_thracia = EDH_df.loc[EDH_df["province_label"].str.startswith("Thracia", na=False)]
EDH_thracia.shape[0] ### number of rows in the subset

In [0]:
# writes the Thracian inscriptions to a CSV file in the Colab runtime and downloads it to the local computer
from google.colab import files
EDH_thracia.to_csv('EDH_all_thracia.csv') 
files.download('EDH_all_thracia.csv')

### Example of Moesia Inferior

In [0]:
### smaller dataset: all inscriptions whose province label begins with "Moesia inf" (rows with a missing label are dropped by na=False)
EDH_moesia_inf = EDH_df.loc[EDH_df["province_label"].str.startswith("Moesia inf", na=False)]
EDH_moesia_inf.shape[0] ### number of rows in the subset

In [0]:
# writes the Moesia Inferior inscriptions to a CSV file in the Colab runtime and downloads it to the local computer
from google.colab import files
EDH_moesia_inf.to_csv('EDH_all_moesia_inf.csv') 
files.download('EDH_all_moesia_inf.csv')

# Working with one CSV file

Use this section if you prefer to work with one CSV file (containing a subset of all data) instead of the large JSON file.

The aim is to find all inscriptions containing mentions of a road, people using the road or any of the establishments and buildings associated with roads.

In [0]:
# reads the Sardinia subset back from the CSV file (comma is the default separator) and previews three records
Sardinia = pd.read_csv('EDH_all_sardinia.csv')
Sardinia.head(n=3)

In [0]:
# keeps only the inscriptions whose "language" value is exactly one of the entries in the list
language = ['Latin', 'Greek']
sardinia_lang = Sardinia[Sardinia['language'].isin(language)]
sardinia_lang.head(n=2)


In [0]:
# using partial strings to find specific inscriptions, https://stackoverflow.com/questions/11350770/select-by-partial-string-from-a-pandas-dataframe
# example of a one-term search, using regexes
# na=False: rows without a transcription count as non-matching; without it a missing value
# puts NaN into the mask and the selection fails
Sardinia[Sardinia['transcription'].str.contains(r'viat', na=False)]


In [0]:
# list based search, searches for all the occurrences of the terms in the list roads_vocab
# the patterns have to be raw strings: in a normal string '\b' is the backspace character,
# not the regex word boundary, so the terms would never match
roads_vocab = [r'\bvia\b', r'\bviat', r'\bmansio', r'\bmutatio', r'\bmilia', r'millia', r'\bpassuum', r'\bcaput', r'\bpons', r'\bpont']
# the terms are joined with "|" into one regex (match any of them);
# na=False: rows without a transcription count as non-matching instead of breaking the selection
Sardinia[Sardinia['transcription'].str.contains('|'.join(roads_vocab), na=False)]


### List based search for an entire JSON dataset

In [0]:
# for uploading offline files from the local computer (loading will take a few minutes in case of large files)
# google.colab is available only inside Google Colab; the result of files.upload() is kept in "uploaded"

from google.colab import files
uploaded = files.upload()

In [0]:
# load the offline JSON file with pandas and use the "id" column as the index of the new dataframe
EDH_df = pd.read_json("EDH_inscriptions_rich.json").set_index("id")
EDH_df.head(n=3)

In [0]:
# list based search, searches for all terms in the list
# the patterns have to be raw strings: in a normal string '\b' is the backspace character,
# not the regex word boundary, so the terms would never match
roads_vocab = [r'\bvia\b', r'\bviat', r'\bmansio', r'\bmutatio', r'\bmilia', r'millia', r'\bpassuum', r'\bcaput', r'\bpons', r'\bpont']
# the terms are joined with "|" into one regex (match any of them);
# na=False: rows without a transcription count as non-matching
EDH_roads_vocab = EDH_df[EDH_df['transcription'].str.contains('|'.join(roads_vocab), na=False)]
len(EDH_roads_vocab)

In [0]:
# writes the road-vocabulary matches to a CSV file in the Colab runtime and downloads it to the local computer
from google.colab import files
EDH_roads_vocab.to_csv('EDH_roads_vocab.csv') 
files.download('EDH_roads_vocab.csv')