# Requirements

In [1]:
import os
import json
import pandas as pd
import numpy as np
import json
import logging ### to monitor the code
from bs4 import BeautifulSoup
import xml.etree.cElementTree as ET
import pickle
import math
import random
import sys
import csv
import unicodedata
import requests
from urllib.request import urlopen 
import io
import getpass
import re
from collections import defaultdict
from itertools import islice # to iterate through dicts

import nltk
from nltk.collocations import *


### plotting
### to use latex (important for greek fonts)
#! apt-get install texlive-latex-recommended 
#! apt install texlive-latex-extra
#! apt install dvipng
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account # based on google-auth library

import sddk

In [2]:
# Open an authenticated session for the shared "SDAM_root" folder on sciencedata.dk;
# the password is requested interactively (see the prompt recorded in the output below).
# NOTE(review): the account name is hard-coded here -- consider reading it from an environment variable.
conf = sddk.configure("SDAM_root", "648597@au.dk")

sciencedata.dk username (format '123456@au.dk'): 648597@au.dk
sciencedata.dk password: ········
connection with shared folder established with you as its owner
endpoint variable has been configured to: https://sciencedata.dk/files/SDAM_root/


In [3]:
# To access the Google Sheet you need a Google service-account key (JSON file).
# Mine is stored in my personal space on sciencedata.dk, so it is read from there:

# (1) fetch the key file through the configured connection and parse its JSON content
#     (conf[0] is presumably the authenticated session returned by sddk.configure -- TODO confirm)
file_data = conf[0].get("https://sciencedata.dk/files/ServiceAccountsKey.json").json()
# (2) turn the parsed content into a credentials object
credentials = service_account.Credentials.from_service_account_info(file_data)
# (3) restrict the credentials to the scopes that are needed (spreadsheet feeds and Drive)
scoped_credentials = credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
# (4) authenticate the gspread client with the scoped credentials
gc = gspread.Client(auth=scoped_credentials)

# spreadsheet that holds the project overview worksheets read and written further below
AGT_overview = gc.open_by_url("https://docs.google.com/spreadsheets/d/1iVta_FuEDgUM_Lf_yByrdbbXNoVH_dnVZs6QRyYv1NM/edit?usp=sharing")

# AGT_metadata = gc.open_by_url("https://docs.google.com/spreadsheets/d/1hEUnL3E07F-EnE3wYnk1V91aXfPDrcnhFHKjD-04CM0/edit?usp=sharing")

# Extract date info from cltk github

In [4]:
# Download the CLTK module that stores the TLG date -> authors mapping as Python source text.
# The timeout keeps the cell from hanging forever, and raise_for_status() stops the notebook
# on an HTTP error (e.g. a 404 after the upstream repository is reorganised) instead of
# handing an error page to the following cell.
response = requests.get("https://raw.githubusercontent.com/cltk/cltk/master/cltk/corpus/greek/tlg/author_date.py", timeout=30)
response.raise_for_status()
request_text = response.text
request_text

"AUTHOR_DATE = {'A.D. 1': ['0031', '0041', '0280', '0283', '0347', '0526', '0587', '0619', '0628', '0643', '0648', '0654', '0656', '0716', '0825', '0888', '1004', '1152', '1155', '1201', '1247', '1271', '1301', '1701', '1703', '1828', '1890', '2424', '4335'], 'a. 1 B.C.': ['0359', '0687', '0750', '1534', '1627', '1678', '1720', '2552', '2652'], 'a. A.D. 11': ['2972', '4303'], 'a. 2 B.C.?': ['2215', '2525'], 'A.D. 9': ['0723', '0738', '3043', '3128', '3177', '4040', '4093', '4097', '4101', '4149'], 'A.D. 11': ['2702', '3027', '3064', '4098', '4235', '4330'], 'A.D. 1?/6': ['4150'], 'Incertum': ['0037', '0038', '0040', '0042', '0043', '0044', '0045', '0046', '0047', '0049', '0050', '0055', '0056', '0092', '0125', '0200', '0202', '0248', '0349', '0350', '0351', '0352', '0353', '0354', '0386', '0394', '0395', '0427', '0430', '0493', '0605', '0622', '0623', '0636', '0637', '0740', '0833', '0876', '0889', '1121', '1161', '1177', '1278', '1281', '1298', '1318', '1325', '1327', '1334', '1411', 

In [5]:
### read the AUTHOR_DATE dictionary out of the downloaded source WITHOUT executing it:
### exec() on text fetched from the network would run arbitrary remote code in this kernel
import ast

AUTHOR_DATE = None
for node in ast.parse(request_text).body:
  # look for the top-level statement `AUTHOR_DATE = {...}`
  if isinstance(node, ast.Assign) and any(isinstance(target, ast.Name) and target.id == "AUTHOR_DATE" for target in node.targets):
    AUTHOR_DATE = ast.literal_eval(node.value) # accepts plain literals only, never calls code
if AUTHOR_DATE is None:
  raise ValueError("AUTHOR_DATE assignment not found in the downloaded source")

In [6]:
### keep the mapping under the name used in the rest of the notebook
### (keys are raw date labels such as 'A.D. 1', values are lists of TLG author numbers such as '0031')
tlg_authordate = AUTHOR_DATE

In [7]:
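### write it out for future usage (currently disabled)
### NOTE(review): this path is under "SDAM_data/OGL/", while the next cell reads "SDAM_data/AGT/tlg_authordate.json" -- confirm which location is current
#sddk.write_file("SDAM_data/OGL/tlg_authordate.json", tlg_authordate, conf)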

In [8]:
### OR YOU CAN START HERE WITH PREPROCESSED DATA:
### (this rebinds tlg_authordate, replacing the value built from the download above)

# read the stored mapping as a dict through the configured sciencedata.dk connection
tlg_authordate = sddk.read_file("SDAM_data/AGT/tlg_authordate.json", "dict", conf)
# the keys are the raw date labels that the cleaning section parses
tlg_authordate.keys()

dict_keys(['A.D. 1', 'a. 1 B.C.', 'a. A.D. 11', 'a. 2 B.C.?', 'A.D. 9', 'A.D. 11', 'A.D. 1?/6', 'Incertum', 'A.D. 3-4', 'a. A.D. 5?', 'A.D. 1/2', 'A.D. 2/4?', 'A.D. 3?', '1 B.C.-A.D. 1?', '6 B.C.', '5/3 B.C.', '8 B.C.', '2 B.C./A.D. 3', 'Varia', 'p. 1 B.C.', '3-2 B.C.?', 'A.D. 10/15', 'a. 6 B.C.', 'A.D. 2/3', 'p. A.D. 10', '7/6 B.C.?', 'a. A.D. 4', 'a. A.D. 1/2', '3/2 B.C.', 'A.D. 6', 'a. A.D. 1', 'A.D. 8-9?', 'A.D. 3/4?', 'A.D. 1-7', 'A.D. 13-14', 'A.D. 2', 'A.D. 6-7', 'p. 3 B.C.', 'p. A.D. 2', 'A.D. 5/7', 'A.D. 5-6', '4/2 B.C.?', 'A.D. 15-16', '7/6 B.C.', 'a. A.D. 3?', 'A.D. 2-3', '2-1 B.C.?', 'p. 4 B.C.?', '2/1 B.C.', 'a. 3 B.C.', '7-6 B.C.', 'a. A.D. 15', '4-3 B.C.?', 'a. A.D. 14/15', 'A.D. 2?', '5/4 B.C.?', '3 B.C.', '2 B.C./A.D. 2', 'a. A.D. 10', 'A.D. 2?/4', '4/3 B.C.', 'A.D. 6-10', '4 B.C./A.D. 2', 'p. 7 B.C.', '4-3 B.C.', '5-4 B.C.', 'A.D. 9-10', '4/2 B.C.', 'A.D. 9/10', '4/1 B.C.', 'A.D. 8', 'a. 3 B.C.?', '4 B.C./A.D. 1', 'A.D. 13', 'A.D. 9?', 'A.D. 8/10', 'p. A.D. 6', '3 B.C

In [9]:
### testing regex: take the number in front of "B.C." and turn it into a negative century
### (the dots are escaped so that they match literal periods only, not any character)
century_bc = "-" + re.match(r"\d+ B\.C\.", "1 B.C., A.D. 1")[0].partition(" ")[0]
century_bc

'-1'

In [10]:
### testing regex: an optional ">" marker (with or without a following space) in front of a number
string = "> 2 B.C."
[marker.replace(" ", "") + "-" + digits for marker, digits in re.findall(r'(> ?)?(\d+)', string)]

['>-2']

In [11]:
### our centuries of interest, written as mid-points: -7.5, -6.5, ..., 14.5, 15.5
centuries = [century - 0.5 for century in range(-7, 17)]
print(centuries)

[-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]


In [12]:
# sum of the powers of two 2**1 + 2**2 + ... + 2**len(centuries)
total = sum(2 ** exponent for exponent in range(1, len(centuries) + 1))

# Cleaning dates

## Define probabilities

In [13]:
date_string = [-2.5, -3.5, -4.5, -5.5, -6.5, -7.5]
# denominator: 2**1 + ... + 2**len(date_string), plus an extra 2, i.e. 2**(len(date_string) + 1);
# the first listed century therefore gets exactly 0.5 and the weights sum to slightly less than 1
total = sum(2 ** (position + 1) for position in range(len(date_string))) + 2
weighted_dates = {}
# walk from the last listed century to the first one, doubling the weight at every step
for el, date in enumerate(reversed(date_string)):
  base = 2 ** (el + 1)
  print(base, total)
  weighted_dates[date] = base / total

2 128
4 128
8 128
16 128
32 128
64 128


In [14]:
# the weight doubles with every step from the earliest century (-7.5) towards the latest one (-2.5)
weighted_dates

{-7.5: 0.015625,
 -6.5: 0.03125,
 -5.5: 0.0625,
 -4.5: 0.125,
 -3.5: 0.25,
 -2.5: 0.5}

## MAIN CLEANING REGEX



In [15]:
### MAIN CLEANING REGEX
### For every raw date label in tlg_authordate this loop builds the record
###   [cents, weighted_dates, date_avr, date_type, certainty]
### and stores it in dates_cleaned; dates_pairs keeps only the {century mid-point: weight} dict.
### NOTE(review): the bare `except:` clauses below double as control flow (e.g. "already a list",
### "not a number"); they also hide any unexpected error -- consider narrowing them.
dates_cleaned = {}
dates_pairs = {}
for raw_date_string in tlg_authordate.keys():  
  # classify the label; the first matching marker wins ("/" before "-" before "p." before "a.")
  if "/" in raw_date_string:
    date_type = "or"
  elif "-" in raw_date_string:
    date_type = "range"
  elif "p." in raw_date_string:
    date_type = "post"
  elif "a." in raw_date_string:
    date_type = "ante"
  else:
    date_type = ""
  # a question mark anywhere in the label flags the dating as uncertain
  if "?" in raw_date_string:
    certainty = "?"
  else:
    certainty = ""
  date_string = raw_date_string.replace("?", "") ### exclude uncertainty for the moment
  # "/" and "-" both become ", " so that the parts can be split on commas later
  date_string = date_string.replace("/", ", ")
  date_string = date_string.replace("-", ", ")
  # the condition is equivalent to: "B.C." in date_string and "A.D." in date_string
  if not (("B.C." not in date_string) or ("A.D." not in date_string)): ### in the case there is both "A.D." and "B.C."
    date_string = date_string.replace("A.D.", "")
    try:
      # "-" + first space-delimited token of the text matched up to "B.C." + everything after "B.C."
      # NOTE(review): the dots in the pattern are unescaped and therefore match any character
      date_string = "-" + re.match(r".*\d+ B.C.", date_string)[0].partition(" ")[0] + date_string.partition("B.C.")[2]
    except:
      pass
    date_string = date_string.replace("a. ", "<") # ante quem
    date_string = date_string.replace("p. ", ">") # post quem
    # NOTE(review): the result of this split is discarded; the actual split happens after the if/else
    date_string.split(",")
  else:
    date_string = date_string.replace("A.D.", "")
    if "B.C." in date_string:
      # B.C.-only label: every number found becomes a negative number string, prefixed with
      # ">" (post quem) or "<" (ante quem) where applicable; date_string is a list from here on
      if "p. " in date_string:
        date_string = [">-" + match for match in re.findall(r"\d+", date_string)]
      elif "a. " in date_string:
        date_string = ["<-" + match for match in re.findall(r"\d+", date_string)]
      else:
        date_string = ["-" + match for match in re.findall(r"\d+", date_string)]
    try: 
      # only works while date_string is still a str; for a list the error is swallowed
      date_string = date_string.replace("a. ", "<").replace("p. ", ">") # ante quem # post quem
    except: 
      pass
  try:
    # split str values into their parts; lists built above raise here and are left unchanged
    date_string = date_string.split(",")
  except:
    pass
  date_string = [date.replace(" ", "") for date in date_string]
  # expand "<n" / ">n" markers into explicit lists of century numbers (there is no century 0)
  date_string_ranges = []
  for date in date_string:
    if "<" in date:
      try:
        # all centuries from -8 up to (not including) n, nearest to n first
        actual_list = reversed([value for value in list(range(-8, int(date.replace("<", "")))) if value != 0])
        date_string_ranges.extend(actual_list)
      except: 
        date_string_ranges.append(date)
    elif ">" in date:
      try:
        # all centuries from n + 1 up to 15, nearest to n first
        date_string_ranges.extend([value for value in list(range(int(date.replace(">", "")) + 1, 16)) if value != 0])
      except: 
        date_string_ranges.append(date)
    else:
      date_string_ranges.append(date)
  date_string = date_string_ranges
  try:
    # all-or-nothing conversion: a single non-numeric element leaves the whole list unconverted
    date_string = [int(date) for date in date_string]
  except: 
    pass
  if date_type == "range":
    try:
      # fill in every century between the first two values (inclusive), skipping 0
      date_string = [num for num in range(date_string[0], date_string[1] + 1) if num != 0]
    except:
      pass
  # century number -> mid-point: -3 becomes -2.5, 1 becomes 0.5; non-numeric values become None
  cents = []
  for num in date_string:
    try:
      if float(num) < 0:
        cents.append(float(num) + 0.5)
      else:
        cents.append(float(num) - 0.5)
    except:
      cents.append(None)
  ### add a dictionary of weighted dates
  weighted_dates = {}
  try:
    if (date_type == "range") or (date_type == "or"): ### if it is a "range" or "or"
      # uniform weights over all candidate centuries, rounded to 4 decimals
      for el in cents:
        weighted_dates[el] = np.round(1 / len(cents), 4)
    elif (date_type == "post") or (date_type == "ante"):
      # powers of two: the first element of cents gets the largest weight, each further one half of it.
      # The extra 2 makes the denominator 2**(len(cents) + 1), so the first element gets exactly 0.5.
      # NOTE(review): as a consequence the weights sum to 1 - 2**-len(cents) rather than to 1.
      total = 0
      for el in range(len(cents)):
        total += 2** (el+1)
      total += 2
      for date, el in zip(reversed(cents), range(len(cents))):
        base = 2**(el+1)
        weighted_dates[date] = np.round(base / total, 4)
    else:
      # plain single date: full weight on the first (only) century
      weighted_dates[cents[0]] = 1
  except:
    weighted_dates = {}
  try:
    # NOTE(review): only the first two elements of cents enter the average, also for longer lists
    date_avr = np.mean(cents[:2])
  except:
    date_avr = None
  dates_cleaned.update({raw_date_string : [cents, weighted_dates, date_avr, date_type, certainty]})
  # the same weights dict object is stored in both dictionaries
  dates_pairs[raw_date_string] = weighted_dates
# preview the first three cleaned records
list(islice(dates_cleaned.items(), 3))

[('A.D. 1', [[0.5], {0.5: 1}, 0.5, '', '']),
 ('a. 1 B.C.',
  [[-1.5, -2.5, -3.5, -4.5, -5.5, -6.5, -7.5],
   {-7.5: 0.0078,
    -6.5: 0.0156,
    -5.5: 0.0312,
    -4.5: 0.0625,
    -3.5: 0.125,
    -2.5: 0.25,
    -1.5: 0.5},
   -2.0,
   'ante',
   '']),
 ('a. A.D. 11',
  [[9.5,
    8.5,
    7.5,
    6.5,
    5.5,
    4.5,
    3.5,
    2.5,
    1.5,
    0.5,
    -0.5,
    -1.5,
    -2.5,
    -3.5,
    -4.5,
    -5.5,
    -6.5,
    -7.5],
   {-7.5: 0.0,
    -6.5: 0.0,
    -5.5: 0.0,
    -4.5: 0.0,
    -3.5: 0.0001,
    -2.5: 0.0001,
    -1.5: 0.0002,
    -0.5: 0.0005,
    0.5: 0.001,
    1.5: 0.002,
    2.5: 0.0039,
    3.5: 0.0078,
    4.5: 0.0156,
    5.5: 0.0312,
    6.5: 0.0625,
    7.5: 0.125,
    8.5: 0.25,
    9.5: 0.5},
   9.0,
   'ante',
   ''])]

In [16]:
# scratch check: averaging the first two elements also works for a single-element list
# NOTE(review): this rebinds the global name `cents` that the cleaning loop above also assigns
cents = [0.5]
np.mean(cents[:2])

0.5

In [36]:
### manual cleaning: hand-written records that replace whatever is stored for these three labels
manual_dates = {
  'p. 4 B.C./a. A.D. 2': ([-2.5, -1.5, -0.5, 0.5], {-2.5 : 0.25, -1.5: 0.25, -0.5: 0.25, 0.5: 0.25}, -1, "range"),
  'Incertum': ([], {}, None, ""),
  'Varia': ([], {}, None, ""),
}
for raw_label, (manual_cents, manual_probs, manual_avr, manual_type) in manual_dates.items():
  # the last field of these records repeats the raw label itself
  dates_cleaned[raw_label] = [manual_cents, dict(manual_probs), manual_avr, manual_type, raw_label]
  # dates_pairs receives its own copy of the weights
  dates_pairs[raw_label] = dict(manual_probs)

In [34]:
# one row per raw date label, one column per century mid-point (sorted); centuries that a label
# does not mention get 0, and the label itself ends up in the "raw_date" column
dates_pairs_df = (
  pd.DataFrame(dates_pairs)
  .T
  .fillna(0)
  .sort_index(axis=1)
  .reset_index()
  .rename({"index" : "raw_date"}, axis=1)
)
dates_pairs_df.head(5)

Unnamed: 0,raw_date,-7.5,-6.5,-5.5,-4.5,-3.5,-2.5,-1.5,-0.5,0.5,...,6.5,7.5,8.5,9.5,10.5,11.5,12.5,13.5,14.5,15.5
0,A.D. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,a. 1 B.C.,0.0078,0.0156,0.0312,0.0625,0.125,0.25,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,a. A.D. 11,0.0,0.0,0.0,0.0,0.0001,0.0001,0.0002,0.0005,0.001,...,0.0625,0.125,0.25,0.5,0.0,0.0,0.0,0.0,0.0,0.0
3,a. 2 B.C.?,0.0156,0.0312,0.0625,0.125,0.25,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A.D. 9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
list(islice(dates_cleaned.items(), 5))

[('A.D. 1', [[0.5], {0.5: 1}, 0.5, '', '']),
 ('a. 1 B.C.',
  [[-1.5, -2.5, -3.5, -4.5, -5.5, -6.5, -7.5],
   {-7.5: 0.0078,
    -6.5: 0.0156,
    -5.5: 0.0312,
    -4.5: 0.0625,
    -3.5: 0.125,
    -2.5: 0.25,
    -1.5: 0.5},
   -2.0,
   'ante',
   '']),
 ('a. A.D. 11',
  [[9.5,
    8.5,
    7.5,
    6.5,
    5.5,
    4.5,
    3.5,
    2.5,
    1.5,
    0.5,
    -0.5,
    -1.5,
    -2.5,
    -3.5,
    -4.5,
    -5.5,
    -6.5,
    -7.5],
   {-7.5: 0.0,
    -6.5: 0.0,
    -5.5: 0.0,
    -4.5: 0.0,
    -3.5: 0.0001,
    -2.5: 0.0001,
    -1.5: 0.0002,
    -0.5: 0.0005,
    0.5: 0.001,
    1.5: 0.002,
    2.5: 0.0039,
    3.5: 0.0078,
    4.5: 0.0156,
    5.5: 0.0312,
    6.5: 0.0625,
    7.5: 0.125,
    8.5: 0.25,
    9.5: 0.5},
   9.0,
   'ante',
   '']),
 ('a. 2 B.C.?',
  [[-2.5, -3.5, -4.5, -5.5, -6.5, -7.5],
   {-7.5: 0.0156,
    -6.5: 0.0312,
    -5.5: 0.0625,
    -4.5: 0.125,
    -3.5: 0.25,
    -2.5: 0.5},
   -3.0,
   'ante',
   '?']),
 ('A.D. 9', [[8.5], {8.5: 1}, 8.5, '', ''])

In [37]:
# sanity check: list every (label, record) pair whose record does not have exactly five fields
[(raw_date, record) for raw_date, record in dates_cleaned.items() if len(record) != 5]

[]

In [39]:
# one row per raw date label; the five fields of every record become the columns 0-4
dates_cleaned_df = pd.DataFrame(dates_cleaned).T.reset_index()
#dates_cleaned_df.columns = ["raw_date", "cents", "date_probs", "date_avr", "type", "cert"]
dates_cleaned_df.head(5)

Unnamed: 0,index,0,1,2,3,4
0,A.D. 1,[0.5],{0.5: 1},0.5,,
1,a. 1 B.C.,"[-1.5, -2.5, -3.5, -4.5, -5.5, -6.5, -7.5]","{-7.5: 0.0078, -6.5: 0.0156, -5.5: 0.0312, -4....",-2.0,ante,
2,a. A.D. 11,"[9.5, 8.5, 7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, ...","{-7.5: 0.0, -6.5: 0.0, -5.5: 0.0, -4.5: 0.0, -...",9.0,ante,
3,a. 2 B.C.?,"[-2.5, -3.5, -4.5, -5.5, -6.5, -7.5]","{-7.5: 0.0156, -6.5: 0.0312, -5.5: 0.0625, -4....",-3.0,ante,?
4,A.D. 9,[8.5],{8.5: 1},8.5,,


In [58]:
# Upload the probability table to the "tlg_dating_probs" worksheet of the overview spreadsheet.
# Creating a worksheet whose title already exists is rejected, so an existing sheet is reused
# and a new one is only added the first time; this keeps the cell re-runnable.
try:
  dating_probs_sheet = AGT_overview.worksheet("tlg_dating_probs")
except gspread.exceptions.WorksheetNotFound:
  dating_probs_sheet = AGT_overview.add_worksheet("tlg_dating_probs", rows=1, cols=1)
set_with_dataframe(dating_probs_sheet, dates_pairs_df)

In [40]:
# flatten {raw date label: [author numbers]} into [author_id, raw_date] rows;
# author ids are the author numbers prefixed with "tlg"
author_ids_with_raw_dates = [
  ["tlg" + author_number, raw_date]
  for raw_date, author_numbers in tlg_authordate.items()
  for author_number in author_numbers
]
author_ids_with_raw_dates_df = pd.DataFrame(author_ids_with_raw_dates, columns=["author_id", "raw_date"])
author_ids_with_raw_dates_df.head(5)

Unnamed: 0,author_id,raw_date
0,tlg0031,A.D. 1
1,tlg0041,A.D. 1
2,tlg0280,A.D. 1
3,tlg0283,A.D. 1
4,tlg0347,A.D. 1


In [41]:
# give every author the per-century probability columns that belong to its raw date label
tlg_authors_with_dating_probs = author_ids_with_raw_dates_df.merge(dates_pairs_df, how="inner", on="raw_date")
tlg_authors_with_dating_probs.head(5)

Unnamed: 0,author_id,raw_date,-7.5,-6.5,-5.5,-4.5,-3.5,-2.5,-1.5,-0.5,...,6.5,7.5,8.5,9.5,10.5,11.5,12.5,13.5,14.5,15.5
0,tlg0031,A.D. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tlg0041,A.D. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tlg0280,A.D. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tlg0283,A.D. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tlg0347,A.D. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
len(tlg_authors_with_dating_probs)

1823

In [0]:
# store the author/probability table on sciencedata.dk (path is relative to the configured shared folder)
sddk.write_file("SDAM_data/OGL/tlg_authors_with_dating_probs.json", tlg_authors_with_dating_probs, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/OGL/tlg_authors_with_dating_probs.json"


In [43]:
# author_id -> cleaned date record; the records are the same objects as in dates_cleaned (no copies)
value_pairs = {
  "tlg" + author_number: dates_cleaned[raw_date]
  for raw_date, author_numbers in tlg_authordate.items()
  for author_number in author_numbers
}

In [41]:
### checking whether it works
value_pairs["tlg0086"]

[[-3.5], {-3.5: 1}, -3.5, '', '']

In [0]:
# store the author_id -> cleaned date record mapping on sciencedata.dk
# (as the recorded output shows, write_file asks for confirmation when the file already exists)
sddk.write_file("SDAM_data/OGL/tlg_authordate_cleaned.json", value_pairs, conf)

A file with the same name ("tlg_authordate_cleaned.json") already exists in this location.
Press Enter to overwrite it or choose different path and filename: 
Your <class 'dict'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/OGL/tlg_authordate_cleaned.json"


# Dating + dataset

In [44]:
### load the raw corpus table from sciencedata as a DataFrame
### (per the preview below: one row per XML file with filename, author, title and the text in "string")
data_merged_raw = sddk.read_file("SDAM_data/AGT/AGT_raw.json", "df", conf)

In [45]:
# author_id = the first seven characters of the file name (e.g. "tlg0001"); .str slicing is
# vectorised and yields NaN for a missing file name instead of raising inside a row-wise lambda
# NOTE(review): longer prefixes are cut off ("stoa0033a..." becomes "stoa003") -- confirm this is intended
data_merged_raw["author_id"] = data_merged_raw["filename"].str[:7]
data_merged_raw.head(5)

Unnamed: 0,filename,author,title,string,author_id
0,ggm0001.ggm001.1st1K-grc1.xml,Anonymous,Anametresis Pontou; Geographi Graeci Minores,ΑΝΩΝΥΜΟΥ ΑΝΑΜΕΤΡΗΣΙΣ ΤΗΣ ΟΙΚΟΥΜΕΝΗΣ ΗΑΣ ΗΕ ΚΑΤ...,ggm0001
1,heb0001.heb010.1st1K-eng1.xml,,Isaias; The Book of Isaiah,ISAIAH FROM HEBREW I. 1 The vision of Isaiah t...,heb0001
2,ogl0001.ogl001.1st1K-grc1.xml,Pinytus,De Epistola Pinyti ad Dionysium; Reliquiae sacrae,"FRAGMENTUM BEATI PINYTI, CNOSSI IN CRETA EPISC...",ogl0001
3,stoa0033a.tlg028.1st1K-grc1.xml,pseudo-Aristotle,De mundo; Aristotelis Opera,ΠΕΡΙ ΚΟΣΜΟΥ ΠΡΟΣ ΑΛΕΞΑΝΔΡΟΝ. ΠΟΛΛΑΚΙΣ μὲν ἔμοι...,stoa003
4,stoa0033a.tlg043.1st1K-grc1.xml,pseudo-Aristotle,De spiritu; Aristotelis Opera,ΠΕΡΙ ΠΝΕΥΜΑΤΟΣ. ΤΙΣ ἡ τοῦ ἐμφύτου πνεύματος δι...,stoa003


# Unique documents

In [46]:
# doc_id = the first fourteen characters of the file name, i.e. "<author>.<work>" for ids such as
# "tlg0001.tlg001"; .str slicing is vectorised and yields NaN for a missing file name instead of raising
# NOTE(review): for longer author prefixes (e.g. "stoa0033a.tlg028") the work part is cut short -- confirm
data_merged_raw["doc_id"] = data_merged_raw["filename"].str[:14]

In [47]:
### how many unique documents do we have? (a missing doc_id would count as one value)
data_merged_raw["doc_id"].nunique(dropna=False)

1458

In [48]:
# order the rows by file name; the duplicate removal that follows keeps the last row per doc_id
data_merged_raw = data_merged_raw.sort_values("filename")

In [49]:
### remove duplicates: keep a single row per doc_id, namely the last one in the current row order
data_merged_raw = data_merged_raw.drop_duplicates(subset=["doc_id"], keep="last")

In [50]:
author_ids_with_raw_dates_df

Unnamed: 0,author_id,raw_date
0,tlg0031,A.D. 1
1,tlg0041,A.D. 1
2,tlg0280,A.D. 1
3,tlg0283,A.D. 1
4,tlg0347,A.D. 1
...,...,...
1818,tlg4081,A.D. 5
1819,tlg2210,a. 1 B.C.?
1820,tlg2606,a. 1 B.C.?
1821,tlg4323,A.D. 1/3


In [51]:
# attach the raw TLG date label to every document through its author_id
# (inner join: documents whose author_id has no entry in the date table are dropped)
AGT = data_merged_raw.merge(author_ids_with_raw_dates_df, how="inner", on="author_id")

In [52]:
### "raw_date" might now be used as a key to get anything from dates_cleaned dictionary

In [53]:
### date average
def lookup_date_avr(row):
  """Return the averaged century (field 2 of the cleaned record) for the row's raw date label."""
  cleaned_record = dates_cleaned[row["raw_date"]]
  return cleaned_record[2]

AGT["date_avr"] = AGT.apply(lookup_date_avr, axis=1)

In [54]:
### date probabilities
def lookup_date_probs(row):
  """Return the {century mid-point: weight} dict (field 1 of the cleaned record) for the row's raw date label."""
  cleaned_record = dates_cleaned[row["raw_date"]]
  return cleaned_record[1]

AGT["date_probs"] = AGT.apply(lookup_date_probs, axis=1)

In [55]:
AGT.head(5)

Unnamed: 0,filename,author,title,string,author_id,doc_id,raw_date,date_avr,date_probs
0,tlg0001.tlg001.perseus-grc2.xml,Apollonius Rhodius,Argonautica; Argonautica,"ἀρχόμενος σέο, Φοῖβε, παλαιγενέων κλέα φωτῶν μ...",tlg0001,tlg0001.tlg001,3 B.C.,-2.5,{-2.5: 1}
1,tlg0003.tlg001.perseus-grc2.xml,Thucydides,The Peloponnesian War; Historiae in two volumes,Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν ...,tlg0003,tlg0003.tlg001,5 B.C.,-4.5,{-4.5: 1}
2,tlg0004.tlg001.perseus-grc1.xml,Diogenes Laertius,Lives of Eminent Philosophers; Lives of Eminen...,ΒΙΩΝ ΚΑΙ ΓΝΩΜΩΝ ΤΩΝ ΕΝ ΦΙΛΟΣΟΦΙΑΙ ΕΥΔΟΚΙΜΗΣΑΝΤ...,tlg0004,tlg0004.tlg001,A.D. 3,2.5,{2.5: 1}
3,tlg0005.tlg001.perseus-grc1.xml,Theocritus,Idylls ; Idylls,Θύρσις ἢ ᾠδή Θύρσις ̔Αδύ τι τὸ ψιθύρισμα καὶ ἁ...,tlg0005,tlg0005.tlg001,4-3 B.C.,-3.0,"{-3.5: 0.5, -2.5: 0.5}"
4,tlg0005.tlg002.perseus-grc1.xml,Theocritus,Epigrams ; Idylls,α παλ ϝι.336 τὰ ῥόδα τὰ δροσόεντα καὶ ἁ κατάπυ...,tlg0005,tlg0005.tlg002,4-3 B.C.,-3.0,"{-3.5: 0.5, -2.5: 0.5}"


# Add our own old manual coding of dating


In [56]:
# load the older, manually coded overview worksheet as a DataFrame
# (as the preview below shows, its columns come back as "Unnamed: n"; proper names are assigned afterwards)
works_overview = get_as_dataframe(AGT_overview.worksheet("works_overview_MANUAL_OLD"))
works_overview.head(5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,pseudo-Aristotle,De mundo,stoa0033a.tlg028.1st1K-grc1,7303,3299,2313,"[187, 17.6417112299, 76, 11.6813747846]",pseudo-Aristotle,pagan,-2.5,
1,pseudo-Aristotle,De spiritu,stoa0033a.tlg043.1st1K-grc1,4002,1440,1083,"[193, 7.4611398964, 35, 5.1233280094]",pseudo-Aristotle,pagan,-3.5,
2,Eutropius,Breviarium historiae romanae,stoa0121.stoa001.opp-grc1,26143,13086,8941,"[960, 13.63125, 71, 8.5196746869]",Eutropius,pagan,3.5,
3,Hegemonius,Acta Archelai,stoa0146d.stoa001.opp-grc1,2725,1161,727,"[105, 11.0571428571, 38, 8.2736858504]",Hegemonius,christian,3.5,
4,Apollonius Rhodius,Argonautica,tlg0001.tlg001.perseus-grc2,45667,23930,15832,"[1660, 14.4156626506, 75, 8.6875895686]",Apollonius Rhodius,pagan,-2.5,


In [57]:
# name the eleven unnamed worksheet columns, in sheet order; "date_man" is the manually coded century
works_overview.columns = ["author", "work", "file", "words", "lemmata", "lem_filtered", "meta", "author_alternative", "provenience", "date_man", "notes"]

In [58]:
works_overview.head(5)

Unnamed: 0,author,work,file,words,lemmata,lem_filtered,meta,author_alternative,provenience,date_man,notes
0,pseudo-Aristotle,De mundo,stoa0033a.tlg028.1st1K-grc1,7303,3299,2313,"[187, 17.6417112299, 76, 11.6813747846]",pseudo-Aristotle,pagan,-2.5,
1,pseudo-Aristotle,De spiritu,stoa0033a.tlg043.1st1K-grc1,4002,1440,1083,"[193, 7.4611398964, 35, 5.1233280094]",pseudo-Aristotle,pagan,-3.5,
2,Eutropius,Breviarium historiae romanae,stoa0121.stoa001.opp-grc1,26143,13086,8941,"[960, 13.63125, 71, 8.5196746869]",Eutropius,pagan,3.5,
3,Hegemonius,Acta Archelai,stoa0146d.stoa001.opp-grc1,2725,1161,727,"[105, 11.0571428571, 38, 8.2736858504]",Hegemonius,christian,3.5,
4,Apollonius Rhodius,Argonautica,tlg0001.tlg001.perseus-grc2,45667,23930,15832,"[1660, 14.4156626506, 75, 8.6875895686]",Apollonius Rhodius,pagan,-2.5,


In [59]:
# author_id = the first seven characters of the file id (e.g. "tlg0001"); .str slicing is vectorised
# and yields NaN for an empty worksheet row instead of raising inside a row-wise lambda
works_overview["author_id"] = works_overview["file"].str[:7]

In [60]:
# check how indexing works: distinct manual dates of a single author
works_overview.loc[works_overview["author_id"] == "tlg5034", "date_man"].unique().tolist()

[-4.5]

In [61]:
# check biblical books: distinct non-missing provenience values of author tlg0031
works_overview.loc[works_overview["author_id"] == "tlg0031", "provenience"].dropna().unique().tolist()

['christian']

In [62]:
### get a dict of authors with centuries and provenience values:
### author_id -> ([distinct manual centuries], [distinct provenience values]), missing values left out
author_cent_prov = {}
for author_id in set(works_overview["author_id"].tolist()):
    author_rows = works_overview[works_overview["author_id"] == author_id]
    # distinct non-missing values, in order of first appearance
    manual_centuries = author_rows["date_man"].dropna().unique().tolist()
    proveniences = author_rows["provenience"].dropna().unique().tolist()
    author_cent_prov[author_id] = (manual_centuries, proveniences)

In [63]:
author_cent_prov["tlg0018"]

([0.5], ['jewish'])

In [64]:
### manual update: hand-coded (centuries, provenience) pairs that add or replace single authors
author_cent_prov.update({
    "tlg0526": ([0.5], ['jewish']),
    "tlg2040": ([3.5], ['christian']),
    "tlg1329": ([1.5], ['christian']),
    "tlg2035": ([3.5], ['christian']),
    'tlg0563': ([1.5], ['christian']),
    'tlg0013': ([-7.0], ['pagan']),
    'tlg2029': ([3.0], ['pagan']),
    'tlg0656': ([0.5], ['pagan']),
    'tlg0559': ([0.5], ['pagan']),
    'tlg0099': ([0], ['pagan']),
})

In [65]:
print(author_cent_prov)

{'tlg0544': ([1.5], ['pagan']), 'tlg9019': ([], []), 'tlg9006': ([], []), 'tlg4021': ([], []), 'tlg0548': ([1.5], ['pagan']), 'tlg1311': ([1.5], ['christian']), 'tlg0551': ([1.5], ['pagan']), 'tlg0082': ([1.5], ['pagan']), 'tlg0565': ([3.5], ['pagan']), 'tlg4027': ([], []), 'tlg0008': ([2.5], ['pagan']), 'tlg0031': ([0.5], ['christian']), 'tlg1622': ([1.5], ['christian']), 'tlg4013': ([], []), 'tlg0612': ([0.5], ['pagan']), 'tlg0554': ([0.5], ['pagan']), 'tlg1205': ([1.5], ['christian']), 'stoa003': ([-2.5, -3.5], ['pagan']), 'tlg0561': ([1.5], ['pagan']), 'tlg0012': ([-7.5], ['pagan']), 'tlg1484': ([1.5], ['christian']), 'tlg2036': ([], []), 'tlg3118': ([], []), 'tlg4016': ([], []), 'tlg1725': ([1.5], ['christian']), 'tlg4084': ([4.5], ['christian']), 'tlg0732': ([2.5], ['pagan']), 'tlg3135': ([], []), 'tlg5034': ([-4.5], ['pagan']), 'tlg2058': ([4.5], ['christian']), 'tlg0010': ([-3.5], ['pagan']), 'tlg0018': ([0.5], ['jewish']), 'tlg0006': ([-4.5], ['pagan']), 'tlg1252': ([], []), '

In [66]:
#sddk.write_file("SDAM_data/AGT/author_cent_prov.json", author_cent_prov, conf)

In [67]:
def return_cent(author_id, lookup=None):
  """Return the first manually coded century of `author_id` as a float.

  Parameters:
    author_id: key to look up, e.g. "tlg0018".
    lookup: mapping author_id -> (list of centuries, list of proveniences);
      defaults to the notebook-level `author_cent_prov`.

  Returns:
    The first century as a float, or None when the author is unknown, has no
    century recorded, or the stored value is not numeric.
  """
  if lookup is None:
    lookup = author_cent_prov
  try:
    return float(lookup[author_id][0][0])
  # only the expected "no usable value" cases are mapped to None; anything else
  # (for instance a missing author_cent_prov table) is allowed to surface
  except (KeyError, IndexError, TypeError, ValueError):
    return None

# manually coded century for every document, looked up through its author_id
AGT["date_manual"] = AGT["author_id"].apply(return_cent)

In [68]:
def return_provenience(author_id, lookup=None):
  """Return the first provenience value recorded for `author_id`.

  Parameters:
    author_id: key to look up, e.g. "tlg0018".
    lookup: mapping author_id -> (list of centuries, list of proveniences);
      defaults to the notebook-level `author_cent_prov`.

  Returns:
    The first provenience value (e.g. "pagan"), or "" when the author is unknown
    or has no provenience recorded.
  """
  if lookup is None:
    lookup = author_cent_prov
  try:
    return lookup[author_id][1][0]
  # only the expected "no usable value" cases are mapped to ""; anything else
  # (for instance a missing author_cent_prov table) is allowed to surface
  except (KeyError, IndexError, TypeError):
    return ""

# provenience label for every document, looked up through its author_id ("" when none is recorded)
AGT["provenience"] = AGT["author_id"].apply(return_provenience)

In [69]:
def get_wordcount(row_string):
  """Return the number of whitespace-separated tokens in `row_string`."""
  return len(row_string.split())
# number of whitespace-separated tokens in the text of every document
AGT["wordcount"] = AGT["string"].apply(get_wordcount)

# Dating Homeric Hymns

In [70]:
### updated dates for the longest homeric hymns, based on:
### Faulkner, Andrew, ‘Introduction. Modern Scholarship on the Homeric Hymns: Foundational Issues’, in The Homeric Hymns: Interpretative Essays, ed. by Andrew Faulkner (Oxford: Oxford University Press, 2011), pp. 1–25

# doc_id -> (new author_id, new date_avr).
# The hymns are addressed by doc_id rather than by positional row label, so the updates stay
# attached to the right texts when the row order of AGT changes.
# NOTE(review): only date_avr is updated; date_probs and date_manual keep their previous values.
homeric_hymns = {
  "tlg0013.tlg001": ("tlg0013dyon", -6.5),
  "tlg0013.tlg002": ("tlg0013deme", -6.0),
  "tlg0013.tlg003": ("tlg0013apol", -6.0),
  # "Ultimately, a late-sixth-century date for Hermes seems most attractive, but the Hymn could also belong to the first half of the fifth century."
  "tlg0013.tlg004": ("tlg0013herm", -5.0),
  "tlg0013.tlg005": ("tlg0013aphr", -6.0),
  "tlg0013.tlg007": ("tlg0013dyo2", -5.0),
  "tlg0013.tlg019": ("tlg0013pan", -5.0),
}

# fail loudly instead of silently skipping a hymn that is not in the table
missing_hymns = sorted(set(homeric_hymns) - set(AGT["doc_id"]))
assert not missing_hymns, "doc_ids not found in AGT: " + ", ".join(missing_hymns)

for hymn_doc_id, (hymn_author_id, hymn_date_avr) in homeric_hymns.items():
  hymn_rows = AGT["doc_id"] == hymn_doc_id
  AGT.loc[hymn_rows, "author_id"] = hymn_author_id
  AGT.loc[hymn_rows, "date_avr"] = hymn_date_avr
  AGT.loc[hymn_rows, "author"] = "Homeric hymn"

### drop all remaining (the hymns that still carry the plain "tlg0013" id)
AGT = AGT[AGT["author_id"] != "tlg0013"]

In [71]:
AGT[AGT["author_id"].str.startswith("tlg0013")]

Unnamed: 0,filename,author,title,string,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,wordcount
200,tlg0013.tlg001.perseus-grc2.xml,Homeric hymn,Hymn 1 To Dionysus; Machine readable text; The...,"Εἲς Διώνυσον οἳ μὲν γὰρ Δρακάνῳ σʼ, οἳ δʼ Ἰκάρ...",tlg0013dyon,tlg0013.tlg001,8-6 B.C.,-6.5,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,146
201,tlg0013.tlg002.perseus-grc2.xml,Homeric hymn,Hymn 2 To Demeter; Machine readable text; The ...,"Εἲς Δημήτραν Δήμητρʼ ἠύκομον, σεμνὴν θεόν, ἄρχ...",tlg0013deme,tlg0013.tlg002,8-6 B.C.,-6.0,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,3375
202,tlg0013.tlg003.perseus-grc2.xml,Homeric hymn,Hymn 3 To Apollo; Machine readable text; The H...,Εἲς Ἀπόλλωνα [Δήλιον] μνήσομαι οὐδὲ λάθωμαι Ἀπ...,tlg0013apol,tlg0013.tlg003,8-6 B.C.,-6.0,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,3902
203,tlg0013.tlg004.perseus-grc2.xml,Homeric hymn,Hymn 4 To Hermes; Machine readable text; The H...,"Εἲς Ἑρμῆν Ἑρμῆν ὕμνει, Μοῦσα, Διὸς καὶ Μαιάδος...",tlg0013herm,tlg0013.tlg004,8-6 B.C.,-5.0,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,4033
204,tlg0013.tlg005.perseus-grc2.xml,Homeric hymn,Hymn 5 To Aphrodite; Machine readable text; Th...,Εἲς Ἀφροδίτην μοῦσά μοι ἔννεπε ἔργα πολυχρύσου...,tlg0013aphr,tlg0013.tlg005,8-6 B.C.,-6.0,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,2049
206,tlg0013.tlg007.perseus-grc2.xml,Homeric hymn,Hymn 7 To Dionysus; Machine readable text; The...,"Εἲς Διώνυσον ἀμφὶ Διώνυσον, Σεμέλης ἐρικυδέος ...",tlg0013dyo2,tlg0013.tlg007,8-6 B.C.,-5.0,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,427
218,tlg0013.tlg019.perseus-grc2.xml,Homeric hymn,Hymn 19 to Pan; Machine readable text; The Hom...,"Εἲς Πᾶνα ἀμφί μοι Ἑρμείαο φίλον γόνον ἔννεπε, ...",tlg0013pan,tlg0013.tlg019,8-6 B.C.,-5.0,"{-7.5: 0.3333, -6.5: 0.3333, -5.5: 0.3333}",-7.0,pagan,338


# New Testament - split into individual authors to be approached separately

In [72]:
# New Testament books (author tlg0031) that are attributed to one author get a shared author_id
# and author name. Books are selected by doc_id instead of positional row labels, so the
# assignment no longer depends on the row order of AGT.
# NOTE(review): this cell previously used the row labels 381-403. Labels 379-388 are shown as
# tlg0031.tlg001-tlg010 in the output below; the doc_ids for the later labels (389, 391, 396,
# 401-403) were derived by assuming that the books keep occupying consecutive rows -- please confirm.
paul = ["tlg0031.tlg006", "tlg0031.tlg007", "tlg0031.tlg008", "tlg0031.tlg009", "tlg0031.tlg011", "tlg0031.tlg013", "tlg0031.tlg018"]
john = ["tlg0031.tlg004", "tlg0031.tlg023", "tlg0031.tlg024", "tlg0031.tlg025"]
luke = ["tlg0031.tlg003", "tlg0031.tlg005"]

nt_groups = [
  (paul, "tlg0031paul", "Paul of Tarsus"),
  (john, "tlg0031john", "Johnannine literature"), # NOTE(review): spelling kept unchanged -- "Johannine"?
  (luke, "tlg0031luke", "Luke (the evangelist)"),
]
for group_doc_ids, group_author_id, group_author in nt_groups:
  # fail loudly instead of silently skipping a book that is not in the table
  missing_books = sorted(set(group_doc_ids) - set(AGT["doc_id"]))
  assert not missing_books, "doc_ids not found in AGT: " + ", ".join(missing_books)
  group_rows = AGT["doc_id"].isin(group_doc_ids)
  AGT.loc[group_rows, "author_id"] = group_author_id
  AGT.loc[group_rows, "author"] = group_author

In [73]:
AGT[AGT["author_id"].str.startswith("tlg0031")]

Unnamed: 0,filename,author,title,string,author_id,doc_id,raw_date,date_avr,date_probs,date_manual,provenience,wordcount
379,tlg0031.tlg001.perseus-grc2.xml,,New Testament - Matthew; Machine readable text...,ΚΑΤΑ ΜΑΘΘΑΙΟΝ ΒΙΒΛΟΣ γενέσεως Ἰησοῦ Χριστοῦ υἱ...,tlg0031,tlg0031.tlg001,A.D. 1,0.5,{0.5: 1},0.5,christian,18289
380,tlg0031.tlg002.perseus-grc2.xml,,New Testament - Mark; Machine readable text; T...,ΚΑΤΑ ΜΑΡΚΟΝ ΑΡΧΗ τοῦ εὐαγγελίου Ἰησοῦ Χριστοῦ ...,tlg0031,tlg0031.tlg002,A.D. 1,0.5,{0.5: 1},0.5,christian,11277
381,tlg0031.tlg003.perseus-grc2.xml,Luke (the evangelist),New Testament - Luke; Machine readable text; T...,ΚΑΤΑ ΛΟΥΚΑΝ ΕΠΕΙΔΗΠΕΡ ΠΟΛΛΟΙ ἐπεχείρησαν ἀνατά...,tlg0031luke,tlg0031.tlg003,A.D. 1,0.5,{0.5: 1},0.5,christian,19460
382,tlg0031.tlg004.perseus-grc2.xml,Johnannine literature,New Testament - John; Machine readable text; T...,"ΚΑΤΑ ΙΩΑΝΗΝ ΕΝ ΑΡΧΗ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν...",tlg0031john,tlg0031.tlg004,A.D. 1,0.5,{0.5: 1},0.5,christian,15592
383,tlg0031.tlg005.perseus-grc2.xml,Luke (the evangelist),New Testament - Acts; Machine readable text; T...,πράξεις ἀποστόλων τὸν μὲν πρῶτον λόγον ἐποιησά...,tlg0031luke,tlg0031.tlg005,A.D. 1,0.5,{0.5: 1},0.5,christian,18408
384,tlg0031.tlg006.perseus-grc2.xml,Paul of Tarsus,New Testament - Romans; Machine readable text;...,"ΠΡΟΣ ΡΩΜΑΙΟΥΣ ΠΑΥΛΟΣ δοῦλος Ἰησοῦ Χριστοῦ, κλη...",tlg0031paul,tlg0031.tlg006,A.D. 1,0.5,{0.5: 1},0.5,christian,7109
385,tlg0031.tlg007.perseus-grc2.xml,Paul of Tarsus,New Testament - 1 Corinthians; Machine readabl...,ΠΡΟΣ ΚΟΡΙΝΘΙΟΥΣ Α ΠΑΥΛΟΣ κλητὸς ἀπόστολος Ἰησο...,tlg0031paul,tlg0031.tlg007,A.D. 1,0.5,{0.5: 1},0.5,christian,6817
386,tlg0031.tlg008.perseus-grc2.xml,Paul of Tarsus,New Testament - 2 Corinthians; Machine readabl...,ΠΡΟΣ ΚΟΡΙΝΘΙΟΥΣ Β ΠΑΥΛΟΣ ἀπόστολος Χριστοῦ Ἰησ...,tlg0031paul,tlg0031.tlg008,A.D. 1,0.5,{0.5: 1},0.5,christian,4473
387,tlg0031.tlg009.perseus-grc2.xml,Paul of Tarsus,New Testament - Galatians; Machine readable te...,"ΠΡΟΣ ΓΑΛΑΤΑΣ ΠΑΥΛΟΣ ἀπόστολος, οὐκ ἀπʼ ἀνθρώπω...",tlg0031paul,tlg0031.tlg009,A.D. 1,0.5,{0.5: 1},0.5,christian,2237
388,tlg0031.tlg010.perseus-grc2.xml,,New Testament - Ephesians; Machine readable te...,ΠΡΟΣ ΕΦΕΣΙΟΥΣ ΠΑΥΛΟΣ ἀπόστολος Χριστοῦ Ἰησοῦ δ...,tlg0031,tlg0031.tlg010,A.D. 1,0.5,{0.5: 1},0.5,christian,2423


In [74]:
import string
# every row that still carries the plain "tlg0031" id gets an id of its own: "tlg0031a", "tlg0031b", ...
# (zip stops at the shorter input, so at most 14 rows are relabelled)
unassigned_nt_labels = AGT[AGT["author_id"] == "tlg0031"].index.tolist()
for letter, row_label in zip(string.ascii_lowercase[:14], unassigned_nt_labels):
    AGT.at[row_label, "author_id"] = "tlg0031" + letter

In [91]:
# export the dated corpus table to sciencedata.dk for further usage
# NOTE(review): the recorded output below names "AGT_20201020.json", so the file name here
# appears to have been changed after the last run -- re-run the cell or confirm the target name
sddk.write_file("SDAM_data/AGT/AGT_dated_20201020.json", AGT, conf)

Your <class 'pandas.core.frame.DataFrame'> object has been succefully written as "https://sciencedata.dk/files/SDAM_root/SDAM_data/AGT/AGT_20201020.json"
